/* Loop Vectorization
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "alias.h"
#include "symtab.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "dominance.h"
#include "cfg.h"
#include "cfganal.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "rtl.h"
#include "flags.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"
#include "insn-codes.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and
   pointer accesses are required to have a simple (consecutive) access
   pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
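
/* Illustrative sketch of the optab query described above (a hypothetical
   fragment, not code used by this pass), shown for an addition on eight
   16-bit elements:

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       ... no target support for a V8HI add, so the stmt is not vectorized ...

   The vectorizer issues such queries per statement, with the optab and
   machine_mode derived from the statement's tree code and vectype.  */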

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with vector size (VS) of 16 bytes, the VF is
   set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }  */
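
/* Illustration (hypothetical example, not part of the analysis below): with
   16-byte vectors, a loop that touches only 'short' (2-byte) elements gets
   vectype V8HI and VF = 8.  A statement that mixes 'short' and 'int' takes
   its VF from the smallest scalar type (short -> V8HI -> 8), while its own
   vectype may be V4SI; both vectypes must have the same size in bytes, or
   the statement is rejected as having different sized vector types.  */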
203 static bool
204 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
206 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
207 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
208 int nbbs = loop->num_nodes;
209 unsigned int vectorization_factor = 0;
210 tree scalar_type;
211 gphi *phi;
212 tree vectype;
213 unsigned int nunits;
214 stmt_vec_info stmt_info;
215 int i;
216 HOST_WIDE_INT dummy;
217 gimple stmt, pattern_stmt = NULL;
218 gimple_seq pattern_def_seq = NULL;
219 gimple_stmt_iterator pattern_def_si = gsi_none ();
220 bool analyze_pattern_stmt = false;
222 if (dump_enabled_p ())
223 dump_printf_loc (MSG_NOTE, vect_location,
224 "=== vect_determine_vectorization_factor ===\n");
226 for (i = 0; i < nbbs; i++)
228 basic_block bb = bbs[i];
230 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
231 gsi_next (&si))
233 phi = si.phi ();
234 stmt_info = vinfo_for_stmt (phi);
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
238 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
239 dump_printf (MSG_NOTE, "\n");
242 gcc_assert (stmt_info);
244 if (STMT_VINFO_RELEVANT_P (stmt_info))
246 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
247 scalar_type = TREE_TYPE (PHI_RESULT (phi));
249 if (dump_enabled_p ())
251 dump_printf_loc (MSG_NOTE, vect_location,
252 "get vectype for scalar type: ");
253 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
254 dump_printf (MSG_NOTE, "\n");
257 vectype = get_vectype_for_scalar_type (scalar_type);
258 if (!vectype)
260 if (dump_enabled_p ())
262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
263 "not vectorized: unsupported "
264 "data-type ");
265 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
266 scalar_type);
267 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
269 return false;
271 STMT_VINFO_VECTYPE (stmt_info) = vectype;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
276 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
277 dump_printf (MSG_NOTE, "\n");
280 nunits = TYPE_VECTOR_SUBPARTS (vectype);
281 if (dump_enabled_p ())
282 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
283 nunits);
285 if (!vectorization_factor
286 || (nunits > vectorization_factor))
287 vectorization_factor = nunits;
291 for (gimple_stmt_iterator si = gsi_start_bb (bb);
292 !gsi_end_p (si) || analyze_pattern_stmt;)
294 tree vf_vectype;
296 if (analyze_pattern_stmt)
297 stmt = pattern_stmt;
298 else
299 stmt = gsi_stmt (si);
301 stmt_info = vinfo_for_stmt (stmt);
303 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location,
306 "==> examining statement: ");
307 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
308 dump_printf (MSG_NOTE, "\n");
311 gcc_assert (stmt_info);
313 /* Skip stmts which do not need to be vectorized. */
314 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
315 && !STMT_VINFO_LIVE_P (stmt_info))
316 || gimple_clobber_p (stmt))
318 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
319 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
320 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
321 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
323 stmt = pattern_stmt;
324 stmt_info = vinfo_for_stmt (pattern_stmt);
325 if (dump_enabled_p ())
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "==> examining pattern statement: ");
329 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
330 dump_printf (MSG_NOTE, "\n");
333 else
335 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
337 gsi_next (&si);
338 continue;
341 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
342 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
343 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
344 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
345 analyze_pattern_stmt = true;
347 /* If a pattern statement has def stmts, analyze them too. */
348 if (is_pattern_stmt_p (stmt_info))
350 if (pattern_def_seq == NULL)
352 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
353 pattern_def_si = gsi_start (pattern_def_seq);
355 else if (!gsi_end_p (pattern_def_si))
356 gsi_next (&pattern_def_si);
357 if (pattern_def_seq != NULL)
359 gimple pattern_def_stmt = NULL;
360 stmt_vec_info pattern_def_stmt_info = NULL;
362 while (!gsi_end_p (pattern_def_si))
364 pattern_def_stmt = gsi_stmt (pattern_def_si);
365 pattern_def_stmt_info
366 = vinfo_for_stmt (pattern_def_stmt);
367 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
368 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
369 break;
370 gsi_next (&pattern_def_si);
373 if (!gsi_end_p (pattern_def_si))
375 if (dump_enabled_p ())
377 dump_printf_loc (MSG_NOTE, vect_location,
378 "==> examining pattern def stmt: ");
379 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
380 pattern_def_stmt, 0);
381 dump_printf (MSG_NOTE, "\n");
384 stmt = pattern_def_stmt;
385 stmt_info = pattern_def_stmt_info;
387 else
389 pattern_def_si = gsi_none ();
390 analyze_pattern_stmt = false;
393 else
394 analyze_pattern_stmt = false;
397 if (gimple_get_lhs (stmt) == NULL_TREE
398 /* MASK_STORE has no lhs, but is ok. */
399 && (!is_gimple_call (stmt)
400 || !gimple_call_internal_p (stmt)
401 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
403 if (is_gimple_call (stmt))
	      /* Ignore calls with no lhs.  These must be calls to
		 #pragma omp simd functions, and the vectorization factor
		 they really need can't be determined until
		 vectorizable_simd_clone_call.  */
409 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
411 pattern_def_seq = NULL;
412 gsi_next (&si);
414 continue;
416 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "not vectorized: irregular stmt.");
420 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
422 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
424 return false;
427 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
429 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "not vectorized: vector stmt in loop:");
433 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (STMT_VINFO_VECTYPE (stmt_info))
	      /* The only case in which a vectype has already been set is for
		 stmts that contain a dataref, or for "pattern-stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
445 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
446 || is_pattern_stmt_p (stmt_info)
447 || !gsi_end_p (pattern_def_si));
448 vectype = STMT_VINFO_VECTYPE (stmt_info);
450 else
452 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
453 if (is_gimple_call (stmt)
454 && gimple_call_internal_p (stmt)
455 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
456 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
457 else
458 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
459 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE, vect_location,
462 "get vectype for scalar type: ");
463 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
464 dump_printf (MSG_NOTE, "\n");
466 vectype = get_vectype_for_scalar_type (scalar_type);
467 if (!vectype)
469 if (dump_enabled_p ())
471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
472 "not vectorized: unsupported "
473 "data-type ");
474 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
475 scalar_type);
476 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
478 return false;
481 STMT_VINFO_VECTYPE (stmt_info) = vectype;
483 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
486 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
487 dump_printf (MSG_NOTE, "\n");
	  /* The vectorization factor is determined according to the smallest
	     scalar type (or the largest vector size, but we only
	     support one vector size per loop).  */
494 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
495 &dummy);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE, vect_location,
499 "get vectype for scalar type: ");
500 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
501 dump_printf (MSG_NOTE, "\n");
503 vf_vectype = get_vectype_for_scalar_type (scalar_type);
504 if (!vf_vectype)
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509 "not vectorized: unsupported data-type ");
510 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
511 scalar_type);
512 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
514 return false;
517 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
518 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: different sized vector "
524 "types in statement, ");
525 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
526 vectype);
527 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
528 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
529 vf_vectype);
530 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
532 return false;
535 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
538 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
539 dump_printf (MSG_NOTE, "\n");
542 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
545 if (!vectorization_factor
546 || (nunits > vectorization_factor))
547 vectorization_factor = nunits;
549 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
551 pattern_def_seq = NULL;
552 gsi_next (&si);
  /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
560 vectorization_factor);
561 if (vectorization_factor <= 1)
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
565 "not vectorized: unsupported data-type\n");
566 return false;
568 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
570 return true;
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
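
/* Illustration (hypothetical values): for the IV 'i' of
   'for (i = 0; i < n; i++)' the scalar evolution is typically the chrec
   {0, +, 1}_1; its evolution part in the loop is the constant 1, so the
   evolution is "simple" with *INIT = 0 and *STEP = 1.  An access function
   such as {0, +, {1, +, 1}_1}_1 (a degree-2 polynomial) is rejected because
   its evolution part is itself a chrec.  */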
579 static bool
580 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
581 tree * step)
583 tree init_expr;
584 tree step_expr;
585 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
586 basic_block bb;
588 /* When there is no evolution in this loop, the evolution function
589 is not "simple". */
590 if (evolution_part == NULL_TREE)
591 return false;
593 /* When the evolution is a polynomial of degree >= 2
594 the evolution function is not "simple". */
595 if (tree_is_chrec (evolution_part))
596 return false;
598 step_expr = evolution_part;
599 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
601 if (dump_enabled_p ())
603 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
604 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
605 dump_printf (MSG_NOTE, ", init: ");
606 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
607 dump_printf (MSG_NOTE, "\n");
610 *init = init_expr;
611 *step = step_expr;
613 if (TREE_CODE (step_expr) != INTEGER_CST
614 && (TREE_CODE (step_expr) != SSA_NAME
615 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
616 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
617 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
618 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
619 || !flag_associative_math)))
620 && (TREE_CODE (step_expr) != REAL_CST
621 || !flag_associative_math))
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
625 "step unknown.\n");
626 return false;
629 return true;
632 /* Function vect_analyze_scalar_cycles_1.
634 Examine the cross iteration def-use cycles of scalar variables
635 in LOOP. LOOP_VINFO represents the loop that is now being
636 considered for vectorization (can be LOOP, or an outer-loop
637 enclosing LOOP). */
639 static void
640 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
642 basic_block bb = loop->header;
643 tree init, step;
644 auto_vec<gimple, 64> worklist;
645 gphi_iterator gsi;
646 bool double_reduc;
648 if (dump_enabled_p ())
649 dump_printf_loc (MSG_NOTE, vect_location,
650 "=== vect_analyze_scalar_cycles ===\n");
  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified; therefore, this order must not be
     changed.  */
655 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
657 gphi *phi = gsi.phi ();
658 tree access_fn = NULL;
659 tree def = PHI_RESULT (phi);
660 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
662 if (dump_enabled_p ())
664 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
665 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
666 dump_printf (MSG_NOTE, "\n");
      /* Skip virtual phis.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
671 if (virtual_operand_p (def))
672 continue;
674 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
676 /* Analyze the evolution function. */
677 access_fn = analyze_scalar_evolution (loop, def);
678 if (access_fn)
680 STRIP_NOPS (access_fn);
681 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Access function of PHI: ");
685 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
686 dump_printf (MSG_NOTE, "\n");
688 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
689 = evolution_part_in_loop_num (access_fn, loop->num);
692 if (!access_fn
693 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
694 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
695 && TREE_CODE (step) != INTEGER_CST))
697 worklist.safe_push (phi);
698 continue;
701 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
703 if (dump_enabled_p ())
704 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
709 /* Second - identify all reductions and nested cycles. */
710 while (worklist.length () > 0)
712 gimple phi = worklist.pop ();
713 tree def = PHI_RESULT (phi);
714 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
715 gimple reduc_stmt;
716 bool nested_cycle;
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
721 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
722 dump_printf (MSG_NOTE, "\n");
725 gcc_assert (!virtual_operand_p (def)
726 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
728 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
729 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
730 &double_reduc);
731 if (reduc_stmt)
733 if (double_reduc)
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE, vect_location,
737 "Detected double reduction.\n");
739 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
740 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
741 vect_double_reduction_def;
743 else
745 if (nested_cycle)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected vectorizable nested cycle.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_nested_cycle;
755 else
757 if (dump_enabled_p ())
758 dump_printf_loc (MSG_NOTE, vect_location,
759 "Detected reduction.\n");
761 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
762 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
763 vect_reduction_def;
764 /* Store the reduction cycles for possible vectorization in
765 loop-aware SLP. */
766 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
770 else
771 if (dump_enabled_p ())
772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
773 "Unknown def-use cycle pattern.\n");
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
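
/* A further illustration (hypothetical code): when analyzing the outer loop
   of a nest such as

              for (i=0; i<N; i++)
                {
                  s = 0;
                  for (j=0; j<M; j++)
                    s += a[i][j];
                  b[i] = s;
                }

   the cycle of 's' around the inner loop is classified as a nested cycle
   (it would be a plain reduction if the inner loop itself were the loop
   being vectorized), while a sum accumulated across both loops would be
   detected as a double reduction.  */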
799 static void
800 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
802 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
804 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
806 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
807 Reductions in such inner-loop therefore have different properties than
808 the reductions in the nest that gets vectorized:
809 1. When vectorized, they are executed in the same order as in the original
810 scalar loop, so we can't change the order of computation when
811 vectorizing them.
812 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
813 current checks are too strict. */
815 if (loop->inner)
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
819 /* Transfer group and reduction information from STMT to its pattern stmt. */
821 static void
822 vect_fixup_reduc_chain (gimple stmt)
824 gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
825 gimple stmtp;
826 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
827 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
828 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
831 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
832 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
833 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
834 if (stmt)
835 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
836 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
838 while (stmt);
839 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
842 /* Fixup scalar cycles that now have their stmts detected as patterns. */
844 static void
845 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
847 gimple first;
848 unsigned i;
850 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
851 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
853 vect_fixup_reduc_chain (first);
854 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
855 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
/* Function vect_get_loop_niters.

   Determine the number of iterations the loop executes and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.

   Return the loop exit condition.  */
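
/* For instance (hypothetical counts): for a loop whose body executes N
   times, the latch edge is taken N-1 times, so *NUMBER_OF_ITERATIONSM1 is
   set to N-1 and *NUMBER_OF_ITERATIONS to N, the number of times the loop
   header executes.  */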
868 static gcond *
869 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
870 tree *number_of_iterationsm1)
872 tree niters;
874 if (dump_enabled_p ())
875 dump_printf_loc (MSG_NOTE, vect_location,
876 "=== get_loop_niters ===\n");
878 niters = number_of_latch_executions (loop);
879 *number_of_iterationsm1 = niters;
881 /* We want the number of loop header executions which is the number
882 of latch executions plus one.
883 ??? For UINT_MAX latch executions this number overflows to zero
884 for loops like do { n++; } while (n != 0); */
885 if (niters && !chrec_contains_undetermined (niters))
886 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
887 build_int_cst (TREE_TYPE (niters), 1));
888 *number_of_iterations = niters;
890 return get_loop_exit_condition (loop);
894 /* Function bb_in_loop_p
896 Used as predicate for dfs order traversal of the loop bbs. */
898 static bool
899 bb_in_loop_p (const_basic_block bb, const void *data)
901 const struct loop *const loop = (const struct loop *)data;
902 if (flow_bb_inside_loop_p (loop, bb))
903 return true;
904 return false;
908 /* Function new_loop_vec_info.
910 Create and initialize a new loop_vec_info struct for LOOP, as well as
911 stmt_vec_info structs for all the stmts in LOOP. */
913 static loop_vec_info
914 new_loop_vec_info (struct loop *loop)
916 loop_vec_info res;
917 basic_block *bbs;
918 gimple_stmt_iterator si;
919 unsigned int i, nbbs;
921 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
922 LOOP_VINFO_LOOP (res) = loop;
924 bbs = get_loop_body (loop);
926 /* Create/Update stmt_info for all stmts in the loop. */
927 for (i = 0; i < loop->num_nodes; i++)
929 basic_block bb = bbs[i];
931 /* BBs in a nested inner-loop will have been already processed (because
932 we will have called vect_analyze_loop_form for any nested inner-loop).
933 Therefore, for stmts in an inner-loop we just want to update the
934 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
935 loop_info of the outer-loop we are currently considering to vectorize
936 (instead of the loop_info of the inner-loop).
937 For stmts in other BBs we need to create a stmt_info from scratch. */
938 if (bb->loop_father != loop)
940 /* Inner-loop bb. */
941 gcc_assert (loop->inner && bb->loop_father == loop->inner);
942 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
944 gimple phi = gsi_stmt (si);
945 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
946 loop_vec_info inner_loop_vinfo =
947 STMT_VINFO_LOOP_VINFO (stmt_info);
948 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
949 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
951 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
953 gimple stmt = gsi_stmt (si);
954 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
955 loop_vec_info inner_loop_vinfo =
956 STMT_VINFO_LOOP_VINFO (stmt_info);
957 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
958 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
961 else
963 /* bb in current nest. */
964 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
966 gimple phi = gsi_stmt (si);
967 gimple_set_uid (phi, 0);
968 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
971 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
973 gimple stmt = gsi_stmt (si);
974 gimple_set_uid (stmt, 0);
975 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */
985 free (bbs);
986 bbs = XCNEWVEC (basic_block, loop->num_nodes);
987 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
988 bbs, loop->num_nodes, loop);
989 gcc_assert (nbbs == loop->num_nodes);
991 LOOP_VINFO_BBS (res) = bbs;
992 LOOP_VINFO_NITERSM1 (res) = NULL;
993 LOOP_VINFO_NITERS (res) = NULL;
994 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
995 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
996 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
997 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
998 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
999 LOOP_VINFO_VECT_FACTOR (res) = 0;
1000 LOOP_VINFO_LOOP_NEST (res).create (3);
1001 LOOP_VINFO_DATAREFS (res).create (10);
1002 LOOP_VINFO_DDRS (res).create (10 * 10);
1003 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1004 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
1005 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1006 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
1007 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1008 LOOP_VINFO_GROUPED_STORES (res).create (10);
1009 LOOP_VINFO_REDUCTIONS (res).create (10);
1010 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
1011 LOOP_VINFO_SLP_INSTANCES (res).create (10);
1012 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1013 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1014 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1015 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1016 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1018 return res;
1022 /* Function destroy_loop_vec_info.
1024 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1025 stmts in the loop. */
1027 void
1028 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1030 struct loop *loop;
1031 basic_block *bbs;
1032 int nbbs;
1033 gimple_stmt_iterator si;
1034 int j;
1035 vec<slp_instance> slp_instances;
1036 slp_instance instance;
1037 bool swapped;
1039 if (!loop_vinfo)
1040 return;
1042 loop = LOOP_VINFO_LOOP (loop_vinfo);
1044 bbs = LOOP_VINFO_BBS (loop_vinfo);
1045 nbbs = clean_stmts ? loop->num_nodes : 0;
1046 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1048 for (j = 0; j < nbbs; j++)
1050 basic_block bb = bbs[j];
1051 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1052 free_stmt_vec_info (gsi_stmt (si));
1054 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1056 gimple stmt = gsi_stmt (si);
	  /* We may have broken canonical form by moving a constant
	     into RHS1 of a commutative op.  Fix such occurrences.  */
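	  /* E.g. if an earlier operand swap left a hypothetical stmt reading
	     '_1 = 5 + x_2', the operands are swapped back below to the
	     canonical constant-last form '_1 = x_2 + 5'.  */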
1060 if (swapped && is_gimple_assign (stmt))
1062 enum tree_code code = gimple_assign_rhs_code (stmt);
1064 if ((code == PLUS_EXPR
1065 || code == POINTER_PLUS_EXPR
1066 || code == MULT_EXPR)
1067 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1068 swap_ssa_operands (stmt,
1069 gimple_assign_rhs1_ptr (stmt),
1070 gimple_assign_rhs2_ptr (stmt));
1073 /* Free stmt_vec_info. */
1074 free_stmt_vec_info (stmt);
1075 gsi_next (&si);
1079 free (LOOP_VINFO_BBS (loop_vinfo));
1080 vect_destroy_datarefs (loop_vinfo, NULL);
1081 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1082 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1083 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1084 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1085 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1086 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1087 vect_free_slp_instance (instance);
1089 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1090 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1091 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1092 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1094 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1095 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1097 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1098 loop_vinfo->scalar_cost_vec.release ();
1100 free (loop_vinfo);
1101 loop->aux = NULL;
1105 /* Calculate the cost of one scalar iteration of the loop. */
1106 static void
1107 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1109 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1110 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1111 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1112 int innerloop_iters, i;
  /* Count statements in the scalar loop.  We use this as the scalar cost
     for a single iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */
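  /* Illustration of the counting scheme (hypothetical body): the GIMPLE for
     'a[i] = b[i] + c[i]' consists of two loads, one arithmetic stmt and one
     store, so it contributes two scalar_load costs, one scalar_stmt cost and
     one scalar_store cost per iteration, each weighted by FACTOR when it
     sits in the inner loop of a nest.  */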
1122 /* FORNOW. */
1123 innerloop_iters = 1;
1124 if (loop->inner)
1125 innerloop_iters = 50; /* FIXME */
1127 for (i = 0; i < nbbs; i++)
1129 gimple_stmt_iterator si;
1130 basic_block bb = bbs[i];
1132 if (bb->loop_father == loop->inner)
1133 factor = innerloop_iters;
1134 else
1135 factor = 1;
1137 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1139 gimple stmt = gsi_stmt (si);
1140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1142 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1143 continue;
1145 /* Skip stmts that are not vectorized inside the loop. */
1146 if (stmt_info
1147 && !STMT_VINFO_RELEVANT_P (stmt_info)
1148 && (!STMT_VINFO_LIVE_P (stmt_info)
1149 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1150 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1151 continue;
1153 vect_cost_for_stmt kind;
1154 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1156 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1157 kind = scalar_load;
1158 else
1159 kind = scalar_store;
1161 else
1162 kind = scalar_stmt;
1164 scalar_single_iter_cost
1165 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1166 factor, kind, NULL, 0, vect_prologue);
1169 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1170 = scalar_single_iter_cost;
1174 /* Function vect_analyze_loop_1.
1176 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1177 for it. The different analyses will record information in the
1178 loop_vec_info struct. This is a subset of the analyses applied in
1179 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1180 that is now considered for (outer-loop) vectorization. */
1182 static loop_vec_info
1183 vect_analyze_loop_1 (struct loop *loop)
1185 loop_vec_info loop_vinfo;
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_NOTE, vect_location,
1189 "===== analyze_loop_nest_1 =====\n");
  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
1193 loop_vinfo = vect_analyze_loop_form (loop);
1194 if (!loop_vinfo)
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1198 "bad inner-loop form.\n");
1199 return NULL;
1202 return loop_vinfo;
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
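
/* For illustration: a counted loop like 'for (i = 0; i < n; i++) a[i] = 0;'
   has a single exit whose iteration count scev can compute, so it passes
   these checks, whereas a pointer-chasing loop such as
   'while (p) p = p->next;' has no computable iteration count and is
   rejected below as non-countable.  */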
1214 loop_vec_info
1215 vect_analyze_loop_form (struct loop *loop)
1217 loop_vec_info loop_vinfo;
1218 gcond *loop_cond;
1219 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1220 loop_vec_info inner_loop_vinfo = NULL;
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_NOTE, vect_location,
1224 "=== vect_analyze_loop_form ===\n");
1226 /* Different restrictions apply when we are considering an inner-most loop,
1227 vs. an outer (nested) loop.
1228 (FORNOW. May want to relax some of these restrictions in the future). */
1230 if (!loop->inner)
      /* Inner-most loop.  We currently require that the number of BBs is
	 exactly 2 (the header and latch).  Vectorizable inner-most loops
	 look like this:

			(pre-header)
			   |
			  header <--------+
			   | |            |
			   | +--> latch --+
			   |
			(exit-bb)   */
1244 if (loop->num_nodes != 2)
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: control flow in loop.\n");
1249 return NULL;
1252 if (empty_block_p (loop->header))
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "not vectorized: empty loop.\n");
1257 return NULL;
1260 else
1262 struct loop *innerloop = loop->inner;
1263 edge entryedge;
      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop, and the number of BBs is exactly 5.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop has the properties expected of inner-most loops
	 as described above.  */
1282 if ((loop->inner)->inner || (loop->inner)->next)
1284 if (dump_enabled_p ())
1285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1286 "not vectorized: multiple nested loops.\n");
1287 return NULL;
1290 /* Analyze the inner-loop. */
1291 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1292 if (!inner_loop_vinfo)
1294 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296 "not vectorized: Bad inner loop.\n");
1297 return NULL;
1300 if (!expr_invariant_in_loop_p (loop,
1301 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: inner-loop count not"
1306 " invariant.\n");
1307 destroy_loop_vec_info (inner_loop_vinfo, true);
1308 return NULL;
1311 if (loop->num_nodes != 5)
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: control flow in loop.\n");
1316 destroy_loop_vec_info (inner_loop_vinfo, true);
1317 return NULL;
1320 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1321 entryedge = EDGE_PRED (innerloop->header, 0);
1322 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1323 entryedge = EDGE_PRED (innerloop->header, 1);
1325 if (entryedge->src != loop->header
1326 || !single_exit (innerloop)
1327 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 "not vectorized: unsupported outerloop form.\n");
1332 destroy_loop_vec_info (inner_loop_vinfo, true);
1333 return NULL;
1336 if (dump_enabled_p ())
1337 dump_printf_loc (MSG_NOTE, vect_location,
1338 "Considering outer-loop vectorization.\n");
1341 if (!single_exit (loop)
1342 || EDGE_COUNT (loop->header->preds) != 2)
1344 if (dump_enabled_p ())
1346 if (!single_exit (loop))
1347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348 "not vectorized: multiple exits.\n");
1349 else if (EDGE_COUNT (loop->header->preds) != 2)
1350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351 "not vectorized: too many incoming edges.\n");
1353 if (inner_loop_vinfo)
1354 destroy_loop_vec_info (inner_loop_vinfo, true);
1355 return NULL;
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
1362 if (!empty_block_p (loop->latch)
1363 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1367 "not vectorized: latch block not empty.\n");
1368 if (inner_loop_vinfo)
1369 destroy_loop_vec_info (inner_loop_vinfo, true);
1370 return NULL;
1373 /* Make sure there exists a single-predecessor exit bb: */
1374 if (!single_pred_p (single_exit (loop)->dest))
1376 edge e = single_exit (loop);
1377 if (!(e->flags & EDGE_ABNORMAL))
1379 split_loop_exit_edge (e);
1380 if (dump_enabled_p ())
1381 dump_printf (MSG_NOTE, "split exit edge.\n");
1383 else
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "not vectorized: abnormal loop exit edge.\n");
1388 if (inner_loop_vinfo)
1389 destroy_loop_vec_info (inner_loop_vinfo, true);
1390 return NULL;
1394 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1395 &number_of_iterationsm1);
1396 if (!loop_cond)
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "not vectorized: complicated exit condition.\n");
1401 if (inner_loop_vinfo)
1402 destroy_loop_vec_info (inner_loop_vinfo, true);
1403 return NULL;
1406 if (!number_of_iterations
1407 || chrec_contains_undetermined (number_of_iterations))
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 "not vectorized: number of iterations cannot be "
1412 "computed.\n");
1413 if (inner_loop_vinfo)
1414 destroy_loop_vec_info (inner_loop_vinfo, true);
1415 return NULL;
1418 if (integer_zerop (number_of_iterations))
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422 "not vectorized: number of iterations = 0.\n");
1423 if (inner_loop_vinfo)
1424 destroy_loop_vec_info (inner_loop_vinfo, true);
1425 return NULL;
1428 loop_vinfo = new_loop_vec_info (loop);
1429 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1430 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1431 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1433 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1435 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "Symbolic number of iterations is ");
1439 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1440 dump_printf (MSG_NOTE, "\n");
1444 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
  /* CHECKME: May want to keep it around in the future.  */
1447 if (inner_loop_vinfo)
1448 destroy_loop_vec_info (inner_loop_vinfo, false);
1450 gcc_assert (!loop->aux);
1451 loop->aux = loop_vinfo;
1452 return loop_vinfo;
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1458 static void
1459 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1462 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1463 int nbbs = loop->num_nodes;
1464 unsigned int vectorization_factor;
1465 int i;
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "=== vect_update_vf_for_slp ===\n");
1471 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1472 gcc_assert (vectorization_factor != 0);
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
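  /* Numeric illustration (hypothetical values): if the loop-based VF is 4
     and the SLP instances require an unrolling factor of 2, a hybrid loop
     ends up with VF = least_common_multiple (4, 2) = 4, whereas a pure-SLP
     loop simply takes VF = 2.  */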
1479 bool only_slp_in_loop = true;
1480 for (i = 0; i < nbbs; i++)
1482 basic_block bb = bbs[i];
1483 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1484 gsi_next (&si))
1486 gimple stmt = gsi_stmt (si);
1487 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1488 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1489 && STMT_VINFO_RELATED_STMT (stmt_info))
1491 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1492 stmt_info = vinfo_for_stmt (stmt);
1494 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496 && !PURE_SLP_STMT (stmt_info))
1497 /* STMT needs both SLP and loop-based vectorization. */
1498 only_slp_in_loop = false;
1502 if (only_slp_in_loop)
1503 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1504 else
1505 vectorization_factor
1506 = least_common_multiple (vectorization_factor,
1507 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1509 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "Updating vectorization factor to %d\n",
1513 vectorization_factor);
1516 /* Function vect_analyze_loop_operations.
1518 Scan the loop stmts and make sure they are all vectorizable. */
1520 static bool
1521 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1523 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 unsigned int vectorization_factor;
1527 int i;
1528 stmt_vec_info stmt_info;
1529 bool need_to_vectorize = false;
1530 int min_profitable_iters;
1531 int min_scalar_loop_bound;
1532 unsigned int th;
1533 bool ok;
1534 HOST_WIDE_INT max_niter;
1535 HOST_WIDE_INT estimated_niter;
1536 int min_profitable_estimate;
1538 if (dump_enabled_p ())
1539 dump_printf_loc (MSG_NOTE, vect_location,
1540 "=== vect_analyze_loop_operations ===\n");
1542 for (i = 0; i < nbbs; i++)
1544 basic_block bb = bbs[i];
1546 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1547 gsi_next (&si))
1549 gphi *phi = si.phi ();
1550 ok = true;
1552 stmt_info = vinfo_for_stmt (phi);
1553 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1556 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1557 dump_printf (MSG_NOTE, "\n");
1560 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1561 (i.e., a phi in the tail of the outer-loop). */
1562 if (! is_loop_header_bb_p (bb))
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outer loop (unless it is a double
		 reduction, i.e., this phi is vect_reduction_def), because
		 this case requires us to actually do something here.  */
1568 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1569 || STMT_VINFO_LIVE_P (stmt_info))
1570 && STMT_VINFO_DEF_TYPE (stmt_info)
1571 != vect_double_reduction_def)
1573 if (dump_enabled_p ())
1574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1575 "Unsupported loop-closed phi in "
1576 "outer-loop.\n");
1577 return false;
1580 /* If PHI is used in the outer loop, we check that its operand
1581 is defined in the inner loop. */
1582 if (STMT_VINFO_RELEVANT_P (stmt_info))
1584 tree phi_op;
1585 gimple op_def_stmt;
1587 if (gimple_phi_num_args (phi) != 1)
1588 return false;
1590 phi_op = PHI_ARG_DEF (phi, 0);
1591 if (TREE_CODE (phi_op) != SSA_NAME)
1592 return false;
1594 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1595 if (gimple_nop_p (op_def_stmt)
1596 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1597 || !vinfo_for_stmt (op_def_stmt))
1598 return false;
1600 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1601 != vect_used_in_outer
1602 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1603 != vect_used_in_outer_by_reduction)
1604 return false;
1607 continue;
1610 gcc_assert (stmt_info);
1612 if (STMT_VINFO_LIVE_P (stmt_info))
1614 /* FORNOW: not yet supported. */
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1617 "not vectorized: value used after loop.\n");
1618 return false;
1621 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1622 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1624 /* A scalar-dependence cycle that we don't support. */
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1627 "not vectorized: scalar dependence cycle.\n");
1628 return false;
1631 if (STMT_VINFO_RELEVANT_P (stmt_info))
1633 need_to_vectorize = true;
1634 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1635 ok = vectorizable_induction (phi, NULL, NULL);
1638 if (!ok)
1640 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: relevant phi not "
1644 "supported: ");
1645 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1646 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1648 return false;
1652 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1653 gsi_next (&si))
1655 gimple stmt = gsi_stmt (si);
1656 if (!gimple_clobber_p (stmt)
1657 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1658 return false;
1660 } /* bbs */
  /* All operations in the loop are either irrelevant (they deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
1667 if (!need_to_vectorize)
1669 if (dump_enabled_p ())
1670 dump_printf_loc (MSG_NOTE, vect_location,
1671 "All the computation can be taken out of the loop.\n");
1672 if (dump_enabled_p ())
1673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674 "not vectorized: redundant loop. no profit to "
1675 "vectorize.\n");
1676 return false;
1679 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1680 gcc_assert (vectorization_factor != 0);
1682 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "vectorization_factor = %d, niters = "
1685 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1686 LOOP_VINFO_INT_NITERS (loop_vinfo));
1688 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1689 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1690 || ((max_niter = max_stmt_executions_int (loop)) != -1
1691 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1693 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1695 "not vectorized: iteration count too small.\n");
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: iteration count smaller than "
1699 "vectorization factor.\n");
1700 return false;
  /* Analyze cost.  Decide if worthwhile to vectorize.  */
1705 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706 &min_profitable_estimate);
1707 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1709 if (min_profitable_iters < 0)
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713 "not vectorized: vectorization not profitable.\n");
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1716 "not vectorized: vector version will never be "
1717 "profitable.\n");
1718 return false;
1721 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1722 * vectorization_factor) - 1);
  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */
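  /* Worked example (hypothetical values): with --param min-vect-loop-bound=2
     and VF = 4, min_scalar_loop_bound is 2 * 4 - 1 = 7; if the cost model
     computed min_profitable_iters = 10, the larger (more conservative)
     value 10 becomes the threshold TH.  */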
1728 th = (unsigned) min_scalar_loop_bound;
1729 if (min_profitable_iters
1730 && (!min_scalar_loop_bound
1731 || min_profitable_iters > min_scalar_loop_bound))
1732 th = (unsigned) min_profitable_iters;
1734 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1736 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1737 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741 "not vectorized: vectorization not profitable.\n");
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE, vect_location,
1744 "not vectorized: iteration count smaller than user "
1745 "specified loop bound parameter or minimum profitable "
1746 "iterations (whichever is more conservative).\n");
1747 return false;
1750 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1751 && ((unsigned HOST_WIDE_INT) estimated_niter
1752 <= MAX (th, (unsigned)min_profitable_estimate)))
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "not vectorized: estimated iteration count too "
1757 "small.\n");
1758 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_NOTE, vect_location,
1760 "not vectorized: estimated iteration count smaller "
1761 "than specified loop bound parameter or minimum "
1762 "profitable iterations (whichever is more "
1763 "conservative).\n");
1764 return false;
1767 return true;
1771 /* Function vect_analyze_loop_2.
1773 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1774 for it. The different analyses will record information in the
1775 loop_vec_info struct. */
1776 static bool
1777 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1779 bool ok;
1780 int max_vf = MAX_VECTORIZATION_FACTOR;
1781 int min_vf = 2;
1782 unsigned int th;
1783 unsigned int n_stmts = 0;
  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple, array references, whose alignment can be
     forced, and aligned pointer-references.  */
1792 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1793 if (!ok)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1797 "bad data references.\n");
1798 return false;
1801 /* Classify all cross-iteration scalar data-flow cycles.
1802 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1804 vect_analyze_scalar_cycles (loop_vinfo);
1806 vect_pattern_recog (loop_vinfo, NULL);
1808 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1810 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1811 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1813 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1814 if (!ok)
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "bad data access.\n");
1819 return false;
1822 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1824 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1825 if (!ok)
1827 if (dump_enabled_p ())
1828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829 "unexpected pattern.\n");
1830 return false;
1833 /* Analyze data dependences between the data-refs in the loop
1834 and adjust the maximum vectorization factor according to
1835 the dependences.
1836 FORNOW: fail at the first data dependence that we encounter. */
1838 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1839 if (!ok
1840 || max_vf < min_vf)
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data dependence.\n");
1845 return false;
1848 ok = vect_determine_vectorization_factor (loop_vinfo);
1849 if (!ok)
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "can't determine vectorization factor.\n");
1854 return false;
1856 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "bad data dependence.\n");
1861 return false;
1864 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1865 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1866 if (!ok)
1867 return false;
1869 /* If there are any SLP instances mark them as pure_slp. */
1870 bool slp = vect_make_slp_decision (loop_vinfo);
1871 if (slp)
1873 /* Find stmts that need to be both vectorized and SLPed. */
1874 vect_detect_hybrid_slp (loop_vinfo);
1876 /* Update the vectorization factor based on the SLP decision. */
1877 vect_update_vf_for_slp (loop_vinfo);
1880 /* Analyze the alignment of the data-refs in the loop.
1881 Fail if a data reference is found that cannot be vectorized. */
1883 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "bad data alignment.\n");
1889 return false;
1892 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1893 It is important to call pruning after vect_analyze_data_ref_accesses,
1894 since we use grouping information gathered by interleaving analysis. */
1895 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1896 if (!ok)
1898 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1900 "number of versioning for alias "
1901 "run-time tests exceeds %d "
1902 "(--param vect-max-version-for-alias-checks)\n",
1903 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1904 return false;
1907 /* Compute the scalar iteration cost. */
1908 vect_get_single_scalar_iteration_cost (loop_vinfo);
1910 /* This pass will decide on using loop versioning and/or loop peeling in
1911 order to enhance the alignment of data references in the loop. */
1913 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1914 if (!ok)
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 "bad data alignment.\n");
1919 return false;
1922 if (slp)
1924 /* Analyze operations in the SLP instances. Note this may
1925 remove unsupported SLP instances which makes the above
1926 SLP kind detection invalid. */
1927 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1928 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1929 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1930 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1931 return false;
1934 /* Scan all the remaining operations in the loop that are not subject
1935 to SLP and make sure they are vectorizable. */
1936 ok = vect_analyze_loop_operations (loop_vinfo);
1937 if (!ok)
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "bad operation or unsupported loop bound.\n");
1942 return false;
1945 /* Decide whether we need to create an epilogue loop to handle
1946 remaining scalar iterations. */
1947 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1948 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1949 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
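/* Editorial worked example (not part of the original source): with a
   cost-model threshold of 10 and a vectorization factor of 4, the
   expression above yields th = ((10 + 1) / 4) * 4 = 8 under C integer
   division, i.e. a multiple of the vectorization factor.  */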
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1952 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1954 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1955 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1956 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1957 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1959 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1960 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1961 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1962 /* In case of versioning, check if the maximum number of
1963 iterations is greater than th. If they are identical,
1964 the epilogue is unnecessary. */
1965 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1966 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1967 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1968 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1969 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
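/* Illustrative example (editorial, not from the original source): with a
   known iteration count of 102, no peeling for alignment, a vectorization
   factor of 4 and no versioning required, tree_ctz gives 1 < 2 ==
   exact_log2 (4), so PEELING_FOR_NITER is set and a 2-iteration scalar
   epilogue will be needed; with 100 iterations (a multiple of 4)
   PEELING_FOR_NITER stays false.  */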
1971 /* If an epilogue loop is required make sure we can create one. */
1972 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1973 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1977 if (!vect_can_advance_ivs_p (loop_vinfo)
1978 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1979 single_exit (LOOP_VINFO_LOOP
1980 (loop_vinfo))))
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1984 "not vectorized: can't create required "
1985 "epilog loop\n");
1986 return false;
1990 return true;
1993 /* Function vect_analyze_loop.
1995 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1996 for it. The different analyses will record information in the
1997 loop_vec_info struct. */
1998 loop_vec_info
1999 vect_analyze_loop (struct loop *loop)
2001 loop_vec_info loop_vinfo;
2002 unsigned int vector_sizes;
2004 /* Autodetect first vector size we try. */
2005 current_vector_size = 0;
2006 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "===== analyze_loop_nest =====\n");
2012 if (loop_outer (loop)
2013 && loop_vec_info_for_loop (loop_outer (loop))
2014 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_NOTE, vect_location,
2018 "outer-loop already vectorized.\n");
2019 return NULL;
2022 while (1)
2024 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2025 loop_vinfo = vect_analyze_loop_form (loop);
2026 if (!loop_vinfo)
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2030 "bad loop form.\n");
2031 return NULL;
2034 if (vect_analyze_loop_2 (loop_vinfo))
2036 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2038 return loop_vinfo;
2041 destroy_loop_vec_info (loop_vinfo, true);
2043 vector_sizes &= ~current_vector_size;
2044 if (vector_sizes == 0
2045 || current_vector_size == 0)
2046 return NULL;
2048 /* Try the next biggest vector size. */
2049 current_vector_size = 1 << floor_log2 (vector_sizes);
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "***** Re-trying analysis with "
2053 "vector size %d\n", current_vector_size);
2058 /* Function reduction_code_for_scalar_code
2060 Input:
2061 CODE - tree_code of a reduction operation.
2063 Output:
2064 REDUC_CODE - the corresponding tree-code to be used to reduce the
2065 vector of partial results into a single scalar result, or ERROR_MARK
2066 if the operation is a supported reduction operation, but does not have
2067 such a tree-code.
2069 Return FALSE if CODE currently cannot be vectorized as reduction. */
2071 static bool
2072 reduction_code_for_scalar_code (enum tree_code code,
2073 enum tree_code *reduc_code)
2075 switch (code)
2077 case MAX_EXPR:
2078 *reduc_code = REDUC_MAX_EXPR;
2079 return true;
2081 case MIN_EXPR:
2082 *reduc_code = REDUC_MIN_EXPR;
2083 return true;
2085 case PLUS_EXPR:
2086 *reduc_code = REDUC_PLUS_EXPR;
2087 return true;
2089 case MULT_EXPR:
2090 case MINUS_EXPR:
2091 case BIT_IOR_EXPR:
2092 case BIT_XOR_EXPR:
2093 case BIT_AND_EXPR:
2094 *reduc_code = ERROR_MARK;
2095 return true;
2097 default:
2098 return false;
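/* Illustrative use of the helper above (editorial sketch, not from the
   original source):

     enum tree_code reduc_code;
     if (reduction_code_for_scalar_code (PLUS_EXPR, &reduc_code))
       gcc_assert (reduc_code == REDUC_PLUS_EXPR);

   For MULT_EXPR the call still succeeds but sets ERROR_MARK, signalling
   that the reduction epilogue must fall back to shifts/extracts rather
   than a single reduction instruction.  */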
2103 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2104 STMT is printed with a message MSG. */
2106 static void
2107 report_vect_op (int msg_type, gimple stmt, const char *msg)
2109 dump_printf_loc (msg_type, vect_location, "%s", msg);
2110 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2111 dump_printf (msg_type, "\n");
2115 /* Detect SLP reduction of the form:
2117 #a1 = phi <a5, a0>
2118 a2 = operation (a1)
2119 a3 = operation (a2)
2120 a4 = operation (a3)
2121 a5 = operation (a4)
2123 #a = phi <a5>
2125 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2126 FIRST_STMT is the first reduction stmt in the chain
2127 (a2 = operation (a1)).
2129 Return TRUE if a reduction chain was detected. */
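/* For illustration (editorial note, not from the original source), such a
   chain typically comes from source like

     for (i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into four chained PLUS_EXPR statements feeding the
   value back to the reduction phi.  */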
2131 static bool
2132 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2134 struct loop *loop = (gimple_bb (phi))->loop_father;
2135 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2136 enum tree_code code;
2137 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2138 stmt_vec_info use_stmt_info, current_stmt_info;
2139 tree lhs;
2140 imm_use_iterator imm_iter;
2141 use_operand_p use_p;
2142 int nloop_uses, size = 0, n_out_of_loop_uses;
2143 bool found = false;
2145 if (loop != vect_loop)
2146 return false;
2148 lhs = PHI_RESULT (phi);
2149 code = gimple_assign_rhs_code (first_stmt);
2150 while (1)
2152 nloop_uses = 0;
2153 n_out_of_loop_uses = 0;
2154 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2156 gimple use_stmt = USE_STMT (use_p);
2157 if (is_gimple_debug (use_stmt))
2158 continue;
2160 /* Check if we got back to the reduction phi. */
2161 if (use_stmt == phi)
2163 loop_use_stmt = use_stmt;
2164 found = true;
2165 break;
2168 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2170 loop_use_stmt = use_stmt;
2171 nloop_uses++;
2173 else
2174 n_out_of_loop_uses++;
2176 /* There can be either a single use in the loop or two uses in
2177 phi nodes. */
2178 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2179 return false;
2182 if (found)
2183 break;
2185 /* We reached a statement with no loop uses. */
2186 if (nloop_uses == 0)
2187 return false;
2189 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2190 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2191 return false;
2193 if (!is_gimple_assign (loop_use_stmt)
2194 || code != gimple_assign_rhs_code (loop_use_stmt)
2195 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2196 return false;
2198 /* Insert USE_STMT into reduction chain. */
2199 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2200 if (current_stmt)
2202 current_stmt_info = vinfo_for_stmt (current_stmt);
2203 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2204 GROUP_FIRST_ELEMENT (use_stmt_info)
2205 = GROUP_FIRST_ELEMENT (current_stmt_info);
2207 else
2208 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2210 lhs = gimple_assign_lhs (loop_use_stmt);
2211 current_stmt = loop_use_stmt;
2212 size++;
2215 if (!found || loop_use_stmt != phi || size < 2)
2216 return false;
2218 /* Swap the operands, if needed, to make the reduction operand be the second
2219 operand. */
2220 lhs = PHI_RESULT (phi);
2221 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2222 while (next_stmt)
2224 if (gimple_assign_rhs2 (next_stmt) == lhs)
2226 tree op = gimple_assign_rhs1 (next_stmt);
2227 gimple def_stmt = NULL;
2229 if (TREE_CODE (op) == SSA_NAME)
2230 def_stmt = SSA_NAME_DEF_STMT (op);
2232 /* Check that the other def is either defined in the loop
2233 ("vect_internal_def"), or it's an induction (defined by a
2234 loop-header phi-node). */
2235 if (def_stmt
2236 && gimple_bb (def_stmt)
2237 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2238 && (is_gimple_assign (def_stmt)
2239 || is_gimple_call (def_stmt)
2240 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2241 == vect_induction_def
2242 || (gimple_code (def_stmt) == GIMPLE_PHI
2243 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2244 == vect_internal_def
2245 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2247 lhs = gimple_assign_lhs (next_stmt);
2248 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2249 continue;
2252 return false;
2254 else
2256 tree op = gimple_assign_rhs2 (next_stmt);
2257 gimple def_stmt = NULL;
2259 if (TREE_CODE (op) == SSA_NAME)
2260 def_stmt = SSA_NAME_DEF_STMT (op);
2262 /* Check that the other def is either defined in the loop
2263 ("vect_internal_def"), or it's an induction (defined by a
2264 loop-header phi-node). */
2265 if (def_stmt
2266 && gimple_bb (def_stmt)
2267 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2268 && (is_gimple_assign (def_stmt)
2269 || is_gimple_call (def_stmt)
2270 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2271 == vect_induction_def
2272 || (gimple_code (def_stmt) == GIMPLE_PHI
2273 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2274 == vect_internal_def
2275 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2277 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2280 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2281 dump_printf (MSG_NOTE, "\n");
2284 swap_ssa_operands (next_stmt,
2285 gimple_assign_rhs1_ptr (next_stmt),
2286 gimple_assign_rhs2_ptr (next_stmt));
2287 update_stmt (next_stmt);
2289 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2290 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2292 else
2293 return false;
2296 lhs = gimple_assign_lhs (next_stmt);
2297 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2300 /* Save the chain for further analysis in SLP detection. */
2301 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2302 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2303 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2305 return true;
2309 /* Function vect_is_simple_reduction_1
2311 (1) Detect a cross-iteration def-use cycle that represents a simple
2312 reduction computation. We look for the following pattern:
2314 loop_header:
2315 a1 = phi < a0, a2 >
2316 a3 = ...
2317 a2 = operation (a3, a1)
2321 a3 = ...
2322 loop_header:
2323 a1 = phi < a0, a2 >
2324 a2 = operation (a3, a1)
2326 such that:
2327 1. operation is commutative and associative and it is safe to
2328 change the order of the computation (if CHECK_REDUCTION is true)
2329 2. no uses for a2 in the loop (a2 is used out of the loop)
2330 3. no uses of a1 in the loop besides the reduction operation
2331 4. no uses of a1 outside the loop.
2333 Conditions 1,4 are tested here.
2334 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2336 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2337 nested cycles, if CHECK_REDUCTION is false.
2339 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2340 reductions:
2342 a1 = phi < a0, a2 >
2343 inner loop (def of a3)
2344 a2 = phi < a3 >
2346 If MODIFY is true it also tries to rework the code in-place to enable
2347 detection of more reduction patterns. For the time being we rewrite
2348 "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
2351 static gimple
2352 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2353 bool check_reduction, bool *double_reduc,
2354 bool modify)
2356 struct loop *loop = (gimple_bb (phi))->loop_father;
2357 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2358 edge latch_e = loop_latch_edge (loop);
2359 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2360 gimple def_stmt, def1 = NULL, def2 = NULL;
2361 enum tree_code orig_code, code;
2362 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2363 tree type;
2364 int nloop_uses;
2365 tree name;
2366 imm_use_iterator imm_iter;
2367 use_operand_p use_p;
2368 bool phi_def;
2370 *double_reduc = false;
2372 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2373 otherwise, we assume outer loop vectorization. */
2374 gcc_assert ((check_reduction && loop == vect_loop)
2375 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2377 name = PHI_RESULT (phi);
2378 /* ??? If there are no uses of the PHI result the inner loop reduction
2379 won't be detected as possibly double-reduction by vectorizable_reduction
2380 because that tries to walk the PHI arg from the preheader edge which
2381 can be constant. See PR60382. */
2382 if (has_zero_uses (name))
2383 return NULL;
2384 nloop_uses = 0;
2385 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2387 gimple use_stmt = USE_STMT (use_p);
2388 if (is_gimple_debug (use_stmt))
2389 continue;
2391 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "intermediate value used outside loop.\n");
2397 return NULL;
2400 nloop_uses++;
2401 if (nloop_uses > 1)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "reduction used in loop.\n");
2406 return NULL;
2410 if (TREE_CODE (loop_arg) != SSA_NAME)
2412 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 "reduction: not ssa_name: ");
2416 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2417 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2419 return NULL;
2422 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2423 if (!def_stmt)
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "reduction: no def_stmt.\n");
2428 return NULL;
2431 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2433 if (dump_enabled_p ())
2435 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2436 dump_printf (MSG_NOTE, "\n");
2438 return NULL;
2441 if (is_gimple_assign (def_stmt))
2443 name = gimple_assign_lhs (def_stmt);
2444 phi_def = false;
2446 else
2448 name = PHI_RESULT (def_stmt);
2449 phi_def = true;
2452 nloop_uses = 0;
2453 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2455 gimple use_stmt = USE_STMT (use_p);
2456 if (is_gimple_debug (use_stmt))
2457 continue;
2458 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2459 nloop_uses++;
2460 if (nloop_uses > 1)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "reduction used in loop.\n");
2465 return NULL;
2469 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2470 defined in the inner loop. */
2471 if (phi_def)
2473 op1 = PHI_ARG_DEF (def_stmt, 0);
2475 if (gimple_phi_num_args (def_stmt) != 1
2476 || TREE_CODE (op1) != SSA_NAME)
2478 if (dump_enabled_p ())
2479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2480 "unsupported phi node definition.\n");
2482 return NULL;
2485 def1 = SSA_NAME_DEF_STMT (op1);
2486 if (gimple_bb (def1)
2487 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2488 && loop->inner
2489 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2490 && is_gimple_assign (def1))
2492 if (dump_enabled_p ())
2493 report_vect_op (MSG_NOTE, def_stmt,
2494 "detected double reduction: ");
2496 *double_reduc = true;
2497 return def_stmt;
2500 return NULL;
2503 code = orig_code = gimple_assign_rhs_code (def_stmt);
2505 /* We can handle "res -= x[i]", which is non-associative, by
2506 simply rewriting it into "res += -x[i]". Avoid changing the
2507 gimple instruction for the first simple tests and only do this
2508 if we're allowed to change code at all. */
2509 if (code == MINUS_EXPR
2510 && modify
2511 && (op1 = gimple_assign_rhs1 (def_stmt))
2512 && TREE_CODE (op1) == SSA_NAME
2513 && SSA_NAME_DEF_STMT (op1) == phi)
2514 code = PLUS_EXPR;
2516 if (check_reduction
2517 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2519 if (dump_enabled_p ())
2520 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2521 "reduction: not commutative/associative: ");
2522 return NULL;
2525 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2527 if (code != COND_EXPR)
2529 if (dump_enabled_p ())
2530 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2531 "reduction: not binary operation: ");
2533 return NULL;
2536 op3 = gimple_assign_rhs1 (def_stmt);
2537 if (COMPARISON_CLASS_P (op3))
2539 op4 = TREE_OPERAND (op3, 1);
2540 op3 = TREE_OPERAND (op3, 0);
2543 op1 = gimple_assign_rhs2 (def_stmt);
2544 op2 = gimple_assign_rhs3 (def_stmt);
2546 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2548 if (dump_enabled_p ())
2549 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2550 "reduction: uses not ssa_names: ");
2552 return NULL;
2555 else
2557 op1 = gimple_assign_rhs1 (def_stmt);
2558 op2 = gimple_assign_rhs2 (def_stmt);
2560 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2562 if (dump_enabled_p ())
2563 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2564 "reduction: uses not ssa_names: ");
2566 return NULL;
2570 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2571 if ((TREE_CODE (op1) == SSA_NAME
2572 && !types_compatible_p (type,TREE_TYPE (op1)))
2573 || (TREE_CODE (op2) == SSA_NAME
2574 && !types_compatible_p (type, TREE_TYPE (op2)))
2575 || (op3 && TREE_CODE (op3) == SSA_NAME
2576 && !types_compatible_p (type, TREE_TYPE (op3)))
2577 || (op4 && TREE_CODE (op4) == SSA_NAME
2578 && !types_compatible_p (type, TREE_TYPE (op4))))
2580 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_NOTE, vect_location,
2583 "reduction: multiple types: operation type: ");
2584 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2585 dump_printf (MSG_NOTE, ", operands types: ");
2586 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2587 TREE_TYPE (op1));
2588 dump_printf (MSG_NOTE, ",");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2590 TREE_TYPE (op2));
2591 if (op3)
2593 dump_printf (MSG_NOTE, ",");
2594 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2595 TREE_TYPE (op3));
2598 if (op4)
2600 dump_printf (MSG_NOTE, ",");
2601 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2602 TREE_TYPE (op4));
2604 dump_printf (MSG_NOTE, "\n");
2607 return NULL;
2610 /* Check that it's ok to change the order of the computation.
2611 Generally, when vectorizing a reduction we change the order of the
2612 computation. This may change the behavior of the program in some
2613 cases, so we need to check that this is ok. One exception is when
2614 vectorizing an outer-loop: the inner-loop is executed sequentially,
2615 and therefore vectorizing reductions in the inner-loop during
2616 outer-loop vectorization is safe. */
2618 /* CHECKME: check for !flag_finite_math_only too? */
2619 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2620 && check_reduction)
2622 /* Changing the order of operations changes the semantics. */
2623 if (dump_enabled_p ())
2624 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2625 "reduction: unsafe fp math optimization: ");
2626 return NULL;
2628 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2629 && check_reduction)
2631 /* Changing the order of operations changes the semantics. */
2632 if (dump_enabled_p ())
2633 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2634 "reduction: unsafe int math optimization: ");
2635 return NULL;
2637 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2639 /* Changing the order of operations changes the semantics. */
2640 if (dump_enabled_p ())
2641 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2642 "reduction: unsafe fixed-point math optimization: ");
2643 return NULL;
2646 /* If we detected "res -= x[i]" earlier, rewrite it into
2647 "res += -x[i]" now. If this turns out to be useless reassoc
2648 will clean it up again. */
2649 if (orig_code == MINUS_EXPR)
2651 tree rhs = gimple_assign_rhs2 (def_stmt);
2652 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2653 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2654 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2655 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2656 loop_info, NULL));
2657 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2658 gimple_assign_set_rhs2 (def_stmt, negrhs);
2659 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2660 update_stmt (def_stmt);
2663 /* Reduction is safe. We're dealing with one of the following:
2664 1) integer arithmetic and no trapv
2665 2) floating point arithmetic, and special flags permit this optimization
2666 3) nested cycle (i.e., outer loop vectorization). */
2667 if (TREE_CODE (op1) == SSA_NAME)
2668 def1 = SSA_NAME_DEF_STMT (op1);
2670 if (TREE_CODE (op2) == SSA_NAME)
2671 def2 = SSA_NAME_DEF_STMT (op2);
2673 if (code != COND_EXPR
2674 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2676 if (dump_enabled_p ())
2677 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2678 return NULL;
2681 /* Check that one def is the reduction def, defined by PHI,
2682 the other def is either defined in the loop ("vect_internal_def"),
2683 or it's an induction (defined by a loop-header phi-node). */
2685 if (def2 && def2 == phi
2686 && (code == COND_EXPR
2687 || !def1 || gimple_nop_p (def1)
2688 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2689 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2690 && (is_gimple_assign (def1)
2691 || is_gimple_call (def1)
2692 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2693 == vect_induction_def
2694 || (gimple_code (def1) == GIMPLE_PHI
2695 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2696 == vect_internal_def
2697 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2699 if (dump_enabled_p ())
2700 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2701 return def_stmt;
2704 if (def1 && def1 == phi
2705 && (code == COND_EXPR
2706 || !def2 || gimple_nop_p (def2)
2707 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2708 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2709 && (is_gimple_assign (def2)
2710 || is_gimple_call (def2)
2711 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2712 == vect_induction_def
2713 || (gimple_code (def2) == GIMPLE_PHI
2714 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2715 == vect_internal_def
2716 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2718 if (check_reduction)
2720 /* Swap operands (just for simplicity - so that the rest of the code
2721 can assume that the reduction variable is always the last (second)
2722 argument). */
2723 if (dump_enabled_p ())
2724 report_vect_op (MSG_NOTE, def_stmt,
2725 "detected reduction: need to swap operands: ");
2727 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2728 gimple_assign_rhs2_ptr (def_stmt));
2730 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2731 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2733 else
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2739 return def_stmt;
2742 /* Try to find SLP reduction chain. */
2743 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2745 if (dump_enabled_p ())
2746 report_vect_op (MSG_NOTE, def_stmt,
2747 "reduction: detected reduction chain: ");
2749 return def_stmt;
2752 if (dump_enabled_p ())
2753 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2754 "reduction: unknown pattern: ");
2756 return NULL;
2759 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2760 in-place. Arguments as there. */
2762 static gimple
2763 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2764 bool check_reduction, bool *double_reduc)
2766 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2767 double_reduc, false);
2770 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2771 in-place if it enables detection of more reductions. Arguments
2772 as there. */
2774 gimple
2775 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2776 bool check_reduction, bool *double_reduc)
2778 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2779 double_reduc, true);
2782 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
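/* Editorial worked example (not from the original source): for a loop with
   102 known iterations, VF 4 and PEEL_ITERS_PROLOGUE == 1, the function
   below computes *peel_iters_epilogue = (102 - 1) % 4 = 1 and charges the
   scalar per-iteration costs once for the prologue and once for the
   epilogue iteration.  */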
2783 int
2784 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2785 int *peel_iters_epilogue,
2786 stmt_vector_for_cost *scalar_cost_vec,
2787 stmt_vector_for_cost *prologue_cost_vec,
2788 stmt_vector_for_cost *epilogue_cost_vec)
2790 int retval = 0;
2791 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2793 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2795 *peel_iters_epilogue = vf/2;
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "cost model: epilogue peel iters set to vf/2 "
2799 "because loop iterations are unknown .\n");
2801 /* If peeled iterations are known but the number of scalar loop
2802 iterations is unknown, count a taken branch per peeled loop. */
2803 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2804 NULL, 0, vect_prologue);
2805 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2806 NULL, 0, vect_epilogue);
2808 else
2810 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2811 peel_iters_prologue = niters < peel_iters_prologue ?
2812 niters : peel_iters_prologue;
2813 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2814 /* If we need to peel for gaps but the epilogue peel count computed above
2815 is zero, we still have to peel VF iterations. */
2816 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2817 *peel_iters_epilogue = vf;
2820 stmt_info_for_cost *si;
2821 int j;
2822 if (peel_iters_prologue)
2823 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2824 retval += record_stmt_cost (prologue_cost_vec,
2825 si->count * peel_iters_prologue,
2826 si->kind, NULL, si->misalign,
2827 vect_prologue);
2828 if (*peel_iters_epilogue)
2829 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2830 retval += record_stmt_cost (epilogue_cost_vec,
2831 si->count * *peel_iters_epilogue,
2832 si->kind, NULL, si->misalign,
2833 vect_epilogue);
2835 return retval;
2838 /* Function vect_estimate_min_profitable_iters
2840 Return the number of iterations required for the vector version of the
2841 loop to be profitable relative to the cost of the scalar version of the
2842 loop. */
2844 static void
2845 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2846 int *ret_min_profitable_niters,
2847 int *ret_min_profitable_estimate)
2849 int min_profitable_iters;
2850 int min_profitable_estimate;
2851 int peel_iters_prologue;
2852 int peel_iters_epilogue;
2853 unsigned vec_inside_cost = 0;
2854 int vec_outside_cost = 0;
2855 unsigned vec_prologue_cost = 0;
2856 unsigned vec_epilogue_cost = 0;
2857 int scalar_single_iter_cost = 0;
2858 int scalar_outside_cost = 0;
2859 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2860 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2861 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2863 /* Cost model disabled. */
2864 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2866 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2867 *ret_min_profitable_niters = 0;
2868 *ret_min_profitable_estimate = 0;
2869 return;
2872 /* Requires loop versioning tests to handle misalignment. */
2873 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2875 /* FIXME: Make cost depend on complexity of individual check. */
2876 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2877 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2878 vect_prologue);
2879 dump_printf (MSG_NOTE,
2880 "cost model: Adding cost of checks for loop "
2881 "versioning to treat misalignment.\n");
2884 /* Requires loop versioning with alias checks. */
2885 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2887 /* FIXME: Make cost depend on complexity of individual check. */
2888 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2889 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2890 vect_prologue);
2891 dump_printf (MSG_NOTE,
2892 "cost model: Adding cost of checks for loop "
2893 "versioning aliasing.\n");
2896 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2897 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2898 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2899 vect_prologue);
2901 /* Count statements in scalar loop. Using this as scalar cost for a single
2902 iteration for now.
2904 TODO: Add outer loop support.
2906 TODO: Consider assigning different costs to different scalar
2907 statements. */
2909 scalar_single_iter_cost
2910 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2912 /* Add additional cost for the peeled instructions in prologue and epilogue
2913 loop.
2915 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2916 at compile time, we assume it's vf/2 (the worst case would be vf-1).
2918 TODO: Build an expression that represents peel_iters for prologue and
2919 epilogue to be used in a run-time test. */
2921 if (npeel < 0)
2923 peel_iters_prologue = vf/2;
2924 dump_printf (MSG_NOTE, "cost model: "
2925 "prologue peel iters set to vf/2.\n");
2927 /* If peeling for alignment is unknown, the loop bound of the main loop becomes
2928 unknown. */
2929 peel_iters_epilogue = vf/2;
2930 dump_printf (MSG_NOTE, "cost model: "
2931 "epilogue peel iters set to vf/2 because "
2932 "peeling for alignment is unknown.\n");
2934 /* If peeled iterations are unknown, count a taken branch and a not taken
2935 branch per peeled loop. Even if scalar loop iterations are known,
2936 vector iterations are not known since peeled prologue iterations are
2937 not known. Hence guards remain the same. */
2938 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2939 NULL, 0, vect_prologue);
2940 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2941 NULL, 0, vect_prologue);
2942 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2943 NULL, 0, vect_epilogue);
2944 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2945 NULL, 0, vect_epilogue);
2946 stmt_info_for_cost *si;
2947 int j;
2948 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2950 struct _stmt_vec_info *stmt_info
2951 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2952 (void) add_stmt_cost (target_cost_data,
2953 si->count * peel_iters_prologue,
2954 si->kind, stmt_info, si->misalign,
2955 vect_prologue);
2956 (void) add_stmt_cost (target_cost_data,
2957 si->count * peel_iters_epilogue,
2958 si->kind, stmt_info, si->misalign,
2959 vect_epilogue);
2962 else
2964 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2965 stmt_info_for_cost *si;
2966 int j;
2967 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2969 prologue_cost_vec.create (2);
2970 epilogue_cost_vec.create (2);
2971 peel_iters_prologue = npeel;
2973 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2974 &peel_iters_epilogue,
2975 &LOOP_VINFO_SCALAR_ITERATION_COST
2976 (loop_vinfo),
2977 &prologue_cost_vec,
2978 &epilogue_cost_vec);
2980 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2982 struct _stmt_vec_info *stmt_info
2983 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2984 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2985 si->misalign, vect_prologue);
2988 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2990 struct _stmt_vec_info *stmt_info
2991 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2992 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2993 si->misalign, vect_epilogue);
2996 prologue_cost_vec.release ();
2997 epilogue_cost_vec.release ();
3000 /* FORNOW: The scalar outside cost is incremented in one of the
3001 following ways:
3003 1. The vectorizer checks for alignment and aliasing and generates
3004 a condition that allows dynamic vectorization. A cost model
3005 check is ANDED with the versioning condition. Hence scalar code
3006 path now has the added cost of the versioning check.
3008 if (cost > th & versioning_check)
3009 jmp to vector code
3011 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3013 2. The vectorizer then checks if a prologue is required. If the
3014 cost model check was not done before during versioning, it has to
3015 be done before the prologue check.
3017 if (cost <= th)
3018 prologue = scalar_iters
3019 if (prologue == 0)
3020 jmp to vector code
3021 else
3022 execute prologue
3023 if (prologue == num_iters)
3024 go to exit
3026 Hence the run-time scalar cost is incremented by a taken branch,
3027 plus a not-taken branch, plus a taken branch cost.
3029 3. The vectorizer then checks if an epilogue is required. If the
3030 cost model check was not done before during prologue check, it
3031 has to be done with the epilogue check.
3033 if (prologue == 0)
3034 jmp to vector code
3035 else
3036 execute prologue
3037 if (prologue == num_iters)
3038 go to exit
3039 vector code:
3040 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3041 jmp to epilogue
3043 Hence the run-time scalar cost should be incremented by 2 taken
3044 branches.
3046 TODO: The back end may reorder the BBs differently and reverse
3047 conditions/branch directions. Change the estimates below to
3048 something more reasonable. */
3050 /* If the number of iterations is known and we do not do versioning, we can
3051 decide whether to vectorize at compile time. Hence the scalar version
3052 does not carry cost model guard costs. */
3053 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3054 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3055 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3057 /* Cost model check occurs at versioning. */
3058 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3060 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3061 else
3063 /* Cost model check occurs at prologue generation. */
3064 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3065 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3066 + vect_get_stmt_cost (cond_branch_not_taken);
3067 /* Cost model check occurs at epilogue generation. */
3068 else
3069 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3073 /* Complete the target-specific cost calculations. */
3074 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3075 &vec_inside_cost, &vec_epilogue_cost);
3077 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3079 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3082 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3083 vec_inside_cost);
3084 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3085 vec_prologue_cost);
3086 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3087 vec_epilogue_cost);
3088 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3089 scalar_single_iter_cost);
3090 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3091 scalar_outside_cost);
3092 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3093 vec_outside_cost);
3094 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3095 peel_iters_prologue);
3096 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3097 peel_iters_epilogue);
3100 /* Calculate number of iterations required to make the vector version
3101 profitable, relative to the loop bodies only. The following condition
3102 must hold true:
3103 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3104 where
3105 SIC = scalar iteration cost, VIC = vector iteration cost,
3106 VOC = vector outside cost, VF = vectorization factor,
3107 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3108 SOC = scalar outside cost for run time cost model check. */
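/* Editorial worked example (not part of the original source): with SIC = 4,
   VIC = 6, VF = 4, VOC = 20, SOC = 0 and no peeling, the computation below
   gives (20 * 4) / (4 * 4 - 6) = 8, and since 4 * 4 * 8 <= 6 * 8 + 20 * 4
   the value is bumped to 9 before being clamped to at least VF and
   decremented for the "niters <= threshold" runtime test.  */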
3110 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3112 if (vec_outside_cost <= 0)
3113 min_profitable_iters = 1;
3114 else
3116 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3117 - vec_inside_cost * peel_iters_prologue
3118 - vec_inside_cost * peel_iters_epilogue)
3119 / ((scalar_single_iter_cost * vf)
3120 - vec_inside_cost);
3122 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3123 <= (((int) vec_inside_cost * min_profitable_iters)
3124 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3125 min_profitable_iters++;
3128 /* vector version will never be profitable. */
3129 else
3131 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3132 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3133 "did not happen for a simd loop");
3135 if (dump_enabled_p ())
3136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3137 "cost model: the vector iteration cost = %d "
3138 "divided by the scalar iteration cost = %d "
3139 "is greater or equal to the vectorization factor = %d"
3140 ".\n",
3141 vec_inside_cost, scalar_single_iter_cost, vf);
3142 *ret_min_profitable_niters = -1;
3143 *ret_min_profitable_estimate = -1;
3144 return;
3147 dump_printf (MSG_NOTE,
3148 " Calculated minimum iters for profitability: %d\n",
3149 min_profitable_iters);
3151 min_profitable_iters =
3152 min_profitable_iters < vf ? vf : min_profitable_iters;
3154 /* Because the condition we create is:
3155 if (niters <= min_profitable_iters)
3156 then skip the vectorized loop. */
3157 min_profitable_iters--;
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_NOTE, vect_location,
3161 " Runtime profitability threshold = %d\n",
3162 min_profitable_iters);
3164 *ret_min_profitable_niters = min_profitable_iters;
3166 /* Calculate number of iterations required to make the vector version
3167 profitable, relative to the loop bodies only.
3169 Non-vectorized variant is SIC * niters and it must win over vector
3170 variant on the expected loop trip count. The following condition must hold true:
3171 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3173 if (vec_outside_cost <= 0)
3174 min_profitable_estimate = 1;
3175 else
3177 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3178 - vec_inside_cost * peel_iters_prologue
3179 - vec_inside_cost * peel_iters_epilogue)
3180 / ((scalar_single_iter_cost * vf)
3181 - vec_inside_cost);
3183 min_profitable_estimate--;
3184 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3185 if (dump_enabled_p ())
3186 dump_printf_loc (MSG_NOTE, vect_location,
3187 " Static estimate profitability threshold = %d\n",
3188 min_profitable_estimate);
3190 *ret_min_profitable_estimate = min_profitable_estimate;
3193 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3194 vector elements (not bits) for a vector of mode MODE. */
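/* Editorial example (not in the original source): for a 4-element mode and
   OFFSET == 1 the function below fills SEL with {1, 2, 3, 4}, i.e. a
   permutation that shifts every element down by one position, with
   element indices >= nelt selecting from the permutation's second
   operand.  */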
3195 static void
3196 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3197 unsigned char *sel)
3199 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3201 for (i = 0; i < nelt; i++)
3202 sel[i] = (i + offset) & (2*nelt - 1);
3205 /* Checks whether the target supports whole-vector shifts for vectors of mode
3206 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3207 it supports vec_perm_const with masks for all necessary shift amounts. */
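/* Editorial note (not from the original source): the loop below only probes
   the shift amounts nelt/2, nelt/4, ..., 1, i.e. the element counts needed
   when a vector is reduced by successive halving in the reduction
   epilogue.  */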
3208 static bool
3209 have_whole_vector_shift (enum machine_mode mode)
3211 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3212 return true;
3214 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3215 return false;
3217 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3218 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3220 for (i = nelt/2; i >= 1; i/=2)
3222 calc_vec_perm_mask_for_shift (mode, i, sel);
3223 if (!can_vec_perm_p (mode, false, sel))
3224 return false;
3226 return true;
3229 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3231 static tree
3232 get_reduction_op (gimple stmt, int reduc_index)
3234 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3236 case GIMPLE_SINGLE_RHS:
3237 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3238 == ternary_op);
3239 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3240 case GIMPLE_UNARY_RHS:
3241 return gimple_assign_rhs1 (stmt);
3242 case GIMPLE_BINARY_RHS:
3243 return (reduc_index
3244 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3245 case GIMPLE_TERNARY_RHS:
3246 return gimple_op (stmt, reduc_index + 1);
3247 default:
3248 gcc_unreachable ();
3252 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3253 functions. Design better to avoid maintenance issues. */
3255 /* Function vect_model_reduction_cost.
3257 Models cost for a reduction operation, including the vector ops
3258 generated within the strip-mine loop, the initial definition before
3259 the loop, and the epilogue code that must be generated. */
3261 static bool
3262 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3263 int ncopies, int reduc_index)
3265 int prologue_cost = 0, epilogue_cost = 0;
3266 enum tree_code code;
3267 optab optab;
3268 tree vectype;
3269 gimple stmt, orig_stmt;
3270 tree reduction_op;
3271 machine_mode mode;
3272 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3273 struct loop *loop = NULL;
3274 void *target_cost_data;
3276 if (loop_vinfo)
3278 loop = LOOP_VINFO_LOOP (loop_vinfo);
3279 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3281 else
3282 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3284 /* Cost of reduction op inside loop. */
3285 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3286 stmt_info, 0, vect_body);
3287 stmt = STMT_VINFO_STMT (stmt_info);
3289 reduction_op = get_reduction_op (stmt, reduc_index);
3291 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3292 if (!vectype)
3294 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3297 "unsupported data-type ");
3298 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3299 TREE_TYPE (reduction_op));
3300 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3302 return false;
3305 mode = TYPE_MODE (vectype);
3306 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3308 if (!orig_stmt)
3309 orig_stmt = STMT_VINFO_STMT (stmt_info);
3311 code = gimple_assign_rhs_code (orig_stmt);
3313 /* Add in cost for initial definition. */
3314 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3315 stmt_info, 0, vect_prologue);
3317 /* Determine cost of epilogue code.
3319 We have a reduction operator that will reduce the vector in one statement.
3320 Also requires scalar extract. */
3322 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3324 if (reduc_code != ERROR_MARK)
3326 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3327 stmt_info, 0, vect_epilogue);
3328 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3329 stmt_info, 0, vect_epilogue);
3331 else
3333 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3334 tree bitsize =
3335 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3336 int element_bitsize = tree_to_uhwi (bitsize);
3337 int nelements = vec_size_in_bits / element_bitsize;
3339 optab = optab_for_tree_code (code, vectype, optab_default);
3341 /* We have a whole vector shift available. */
3342 if (VECTOR_MODE_P (mode)
3343 && optab_handler (optab, mode) != CODE_FOR_nothing
3344 && have_whole_vector_shift (mode))
3346 /* Final reduction via vector shifts and the reduction operator.
3347 Also requires scalar extract. */
3348 epilogue_cost += add_stmt_cost (target_cost_data,
3349 exact_log2 (nelements) * 2,
3350 vector_stmt, stmt_info, 0,
3351 vect_epilogue);
3352 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3353 vec_to_scalar, stmt_info, 0,
3354 vect_epilogue);
3356 else
3357 /* Use extracts and reduction op for final reduction. For N
3358 elements, we have N extracts and N-1 reduction ops. */
3359 epilogue_cost += add_stmt_cost (target_cost_data,
3360 nelements + nelements - 1,
3361 vector_stmt, stmt_info, 0,
3362 vect_epilogue);
3366 if (dump_enabled_p ())
3367 dump_printf (MSG_NOTE,
3368 "vect_model_reduction_cost: inside_cost = %d, "
3369 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3370 prologue_cost, epilogue_cost);
3372 return true;
3376 /* Function vect_model_induction_cost.
3378 Models cost for induction operations. */
3380 static void
3381 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3383 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3384 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3385 unsigned inside_cost, prologue_cost;
3387 /* loop cost for vec_loop. */
3388 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3389 stmt_info, 0, vect_body);
3391 /* prologue cost for vec_init and vec_step. */
3392 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3393 stmt_info, 0, vect_prologue);
3395 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location,
3397 "vect_model_induction_cost: inside_cost = %d, "
3398 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3402 /* Function get_initial_def_for_induction
3404 Input:
3405 STMT - a stmt that performs an induction operation in the loop.
3406 IV_PHI - the initial value of the induction variable
3408 Output:
3409 Return a vector variable, initialized with the first VF values of
3410 the induction variable. E.g., for an iv with IV_PHI='X' and
3411 evolution S, for a vector of 4 units, we want to return:
3412 [X, X + S, X + 2*S, X + 3*S]. */
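/* Editorial illustration (not part of the original source): for an
   induction with initial value 5, step 3 and VF 4, the non-nested case
   below builds vec_init = {5, 8, 11, 14} and vec_step = {12, 12, 12, 12},
   so adding vec_step once per vector iteration advances every lane by
   VF * S.  */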
3414 static tree
3415 get_initial_def_for_induction (gimple iv_phi)
3417 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3418 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3419 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3420 tree vectype;
3421 int nunits;
3422 edge pe = loop_preheader_edge (loop);
3423 struct loop *iv_loop;
3424 basic_block new_bb;
3425 tree new_vec, vec_init, vec_step, t;
3426 tree new_var;
3427 tree new_name;
3428 gimple init_stmt, new_stmt;
3429 gphi *induction_phi;
3430 tree induc_def, vec_def, vec_dest;
3431 tree init_expr, step_expr;
3432 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433 int i;
3434 int ncopies;
3435 tree expr;
3436 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3437 bool nested_in_vect_loop = false;
3438 gimple_seq stmts = NULL;
3439 imm_use_iterator imm_iter;
3440 use_operand_p use_p;
3441 gimple exit_phi;
3442 edge latch_e;
3443 tree loop_arg;
3444 gimple_stmt_iterator si;
3445 basic_block bb = gimple_bb (iv_phi);
3446 tree stepvectype;
3447 tree resvectype;
3449 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3450 if (nested_in_vect_loop_p (loop, iv_phi))
3452 nested_in_vect_loop = true;
3453 iv_loop = loop->inner;
3455 else
3456 iv_loop = loop;
3457 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3459 latch_e = loop_latch_edge (iv_loop);
3460 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3462 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3463 gcc_assert (step_expr != NULL_TREE);
3465 pe = loop_preheader_edge (iv_loop);
3466 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3467 loop_preheader_edge (iv_loop));
3469 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3470 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3471 gcc_assert (vectype);
3472 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3473 ncopies = vf / nunits;
3475 gcc_assert (phi_info);
3476 gcc_assert (ncopies >= 1);
3478 /* Convert the step to the desired type. */
3479 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3480 step_expr),
3481 &stmts, true, NULL_TREE);
3482 if (stmts)
3484 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3485 gcc_assert (!new_bb);
3488 /* Find the first insertion point in the BB. */
3489 si = gsi_after_labels (bb);
3491 /* Create the vector that holds the initial_value of the induction. */
3492 if (nested_in_vect_loop)
3494 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3495 been created during vectorization of previous stmts. We obtain it
3496 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3497 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3498 /* If the initial value is not of proper type, convert it. */
3499 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3501 new_stmt
3502 = gimple_build_assign (vect_get_new_vect_var (vectype,
3503 vect_simple_var,
3504 "vec_iv_"),
3505 VIEW_CONVERT_EXPR,
3506 build1 (VIEW_CONVERT_EXPR, vectype,
3507 vec_init));
3508 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3509 gimple_assign_set_lhs (new_stmt, vec_init);
3510 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3511 new_stmt);
3512 gcc_assert (!new_bb);
3513 set_vinfo_for_stmt (new_stmt,
3514 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3517 else
3519 vec<constructor_elt, va_gc> *v;
3521 /* iv_loop is the loop to be vectorized. Create:
3522 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3523 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3524 vect_scalar_var, "var_");
3525 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3526 init_expr),
3527 &stmts, false, new_var);
3528 if (stmts)
3530 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3531 gcc_assert (!new_bb);
3534 vec_alloc (v, nunits);
3535 bool constant_p = is_gimple_min_invariant (new_name);
3536 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3537 for (i = 1; i < nunits; i++)
3539 /* Create: new_name_i = new_name + step_expr */
3540 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3541 new_name, step_expr);
3542 if (!is_gimple_min_invariant (new_name))
3544 init_stmt = gimple_build_assign (new_var, new_name);
3545 new_name = make_ssa_name (new_var, init_stmt);
3546 gimple_assign_set_lhs (init_stmt, new_name);
3547 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3548 gcc_assert (!new_bb);
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "created new init_stmt: ");
3553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3554 dump_printf (MSG_NOTE, "\n");
3556 constant_p = false;
3558 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3560 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3561 if (constant_p)
3562 new_vec = build_vector_from_ctor (vectype, v);
3563 else
3564 new_vec = build_constructor (vectype, v);
3565 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3569 /* Create the vector that holds the step of the induction. */
3570 if (nested_in_vect_loop)
3571 /* iv_loop is nested in the loop to be vectorized. Generate:
3572 vec_step = [S, S, S, S] */
3573 new_name = step_expr;
3574 else
3576 /* iv_loop is the loop to be vectorized. Generate:
3577 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3578 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3580 expr = build_int_cst (integer_type_node, vf);
3581 expr = fold_convert (TREE_TYPE (step_expr), expr);
3583 else
3584 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3585 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3586 expr, step_expr);
3587 if (TREE_CODE (step_expr) == SSA_NAME)
3588 new_name = vect_init_vector (iv_phi, new_name,
3589 TREE_TYPE (step_expr), NULL);
3592 t = unshare_expr (new_name);
3593 gcc_assert (CONSTANT_CLASS_P (new_name)
3594 || TREE_CODE (new_name) == SSA_NAME);
3595 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3596 gcc_assert (stepvectype);
3597 new_vec = build_vector_from_val (stepvectype, t);
3598 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3601 /* Create the following def-use cycle:
3602 loop prolog:
3603 vec_init = ...
3604 vec_step = ...
3605 loop:
3606 vec_iv = PHI <vec_init, vec_loop>
3608 STMT
3610 vec_loop = vec_iv + vec_step; */
3612 /* Create the induction-phi that defines the induction-operand. */
3613 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3614 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3615 set_vinfo_for_stmt (induction_phi,
3616 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3617 induc_def = PHI_RESULT (induction_phi);
3619 /* Create the iv update inside the loop */
3620 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3621 vec_def = make_ssa_name (vec_dest, new_stmt);
3622 gimple_assign_set_lhs (new_stmt, vec_def);
3623 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3624 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3625 NULL));
3627 /* Set the arguments of the phi node: */
3628 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3629 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3630 UNKNOWN_LOCATION);
3633 /* In case the vectorization factor (VF) is bigger than the number
3634 of elements that we can fit in a vectype (nunits), we have to generate
3635 more than one vector stmt - i.e., we need to "unroll" the
3636 vector stmt by a factor of VF/nunits. For more details see the
3637 documentation in vectorizable_operation.  */
3639 if (ncopies > 1)
3641 stmt_vec_info prev_stmt_vinfo;
3642 /* FORNOW. This restriction should be relaxed. */
3643 gcc_assert (!nested_in_vect_loop);
3645 /* Create the vector that holds the step of the induction. */
3646 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3648 expr = build_int_cst (integer_type_node, nunits);
3649 expr = fold_convert (TREE_TYPE (step_expr), expr);
3651 else
3652 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3653 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3654 expr, step_expr);
3655 if (TREE_CODE (step_expr) == SSA_NAME)
3656 new_name = vect_init_vector (iv_phi, new_name,
3657 TREE_TYPE (step_expr), NULL);
3658 t = unshare_expr (new_name);
3659 gcc_assert (CONSTANT_CLASS_P (new_name)
3660 || TREE_CODE (new_name) == SSA_NAME);
3661 new_vec = build_vector_from_val (stepvectype, t);
3662 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3664 vec_def = induc_def;
3665 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3666 for (i = 1; i < ncopies; i++)
3668 /* vec_i = vec_prev + vec_step */
3669 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3670 vec_def, vec_step);
3671 vec_def = make_ssa_name (vec_dest, new_stmt);
3672 gimple_assign_set_lhs (new_stmt, vec_def);
3674 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3675 if (!useless_type_conversion_p (resvectype, vectype))
3677 new_stmt
3678 = gimple_build_assign
3679 (vect_get_new_vect_var (resvectype, vect_simple_var,
3680 "vec_iv_"),
3681 VIEW_CONVERT_EXPR,
3682 build1 (VIEW_CONVERT_EXPR, resvectype,
3683 gimple_assign_lhs (new_stmt)));
3684 gimple_assign_set_lhs (new_stmt,
3685 make_ssa_name
3686 (gimple_assign_lhs (new_stmt), new_stmt));
3687 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3689 set_vinfo_for_stmt (new_stmt,
3690 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3691 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3692 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3696 if (nested_in_vect_loop)
3698 /* Find the loop-closed exit-phi of the induction, and record
3699 the final vector of induction results: */
3700 exit_phi = NULL;
3701 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3703 gimple use_stmt = USE_STMT (use_p);
3704 if (is_gimple_debug (use_stmt))
3705 continue;
3707 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3709 exit_phi = use_stmt;
3710 break;
3713 if (exit_phi)
3715 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3716 /* FORNOW. Currently not supporting the case that an inner-loop induction
3717 is not used in the outer-loop (i.e. only outside the outer-loop). */
3718 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3719 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3721 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3722 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_NOTE, vect_location,
3725 "vector of inductions after inner-loop:");
3726 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3727 dump_printf (MSG_NOTE, "\n");
3733 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE, vect_location,
3736 "transform induction: created def-use cycle: ");
3737 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3738 dump_printf (MSG_NOTE, "\n");
3739 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3740 SSA_NAME_DEF_STMT (vec_def), 0);
3741 dump_printf (MSG_NOTE, "\n");
3744 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3745 if (!useless_type_conversion_p (resvectype, vectype))
3747 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3748 vect_simple_var,
3749 "vec_iv_"),
3750 VIEW_CONVERT_EXPR,
3751 build1 (VIEW_CONVERT_EXPR, resvectype,
3752 induc_def));
3753 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3754 gimple_assign_set_lhs (new_stmt, induc_def);
3755 si = gsi_after_labels (bb);
3756 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3757 set_vinfo_for_stmt (new_stmt,
3758 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3759 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3760 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3763 return induc_def;
3767 /* Function get_initial_def_for_reduction
3769 Input:
3770 STMT - a stmt that performs a reduction operation in the loop.
3771 INIT_VAL - the initial value of the reduction variable
3773 Output:
3774 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3775 of the reduction (used for adjusting the epilog - see below).
3776 Return a vector variable, initialized according to the operation that STMT
3777 performs. This vector will be used as the initial value of the
3778 vector of partial results.
3780 Option1 (adjust in epilog): Initialize the vector as follows:
3781 add/bit or/xor: [0,0,...,0,0]
3782 mult/bit and: [1,1,...,1,1]
3783 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3784 and when necessary (e.g. add/mult case) let the caller know
3785 that it needs to adjust the result by init_val.
3787 Option2: Initialize the vector as follows:
3788 add/bit or/xor: [init_val,0,0,...,0]
3789 mult/bit and: [init_val,1,1,...,1]
3790 min/max/cond_expr: [init_val,init_val,...,init_val]
3791 and no adjustments are needed.
3793 For example, for the following code:
3795 s = init_val;
3796 for (i=0;i<n;i++)
3797 s = s + a[i];
3799 STMT is 's = s + a[i]', and the reduction variable is 's'.
3800 For a vector of 4 units, we want to return either [0,0,0,init_val],
3801 or [0,0,0,0] and let the caller know that it needs to adjust
3802 the result at the end by 'init_val'.
3804 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3805 is not NULL, because this way the initialization vector is simpler (same
3806 element in all entries), and Option2 otherwise.
3808 A cost model should help decide between these two schemes. */
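/* For instance, for the sum above with init_val == 5 and a 4-element
   vector:
   Option1 returns [0,0,0,0] and sets *ADJUSTMENT_DEF to 5, so the caller
   adds 5 back after reducing the partial sums in the epilog;
   Option2 returns [5,0,0,0] and no later adjustment is needed.
   For a MULT_EXPR reduction the neutral element 1 would be used in place
   of 0.  */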
3810 tree
3811 get_initial_def_for_reduction (gimple stmt, tree init_val,
3812 tree *adjustment_def)
3814 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3815 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3816 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3817 tree scalar_type = TREE_TYPE (init_val);
3818 tree vectype = get_vectype_for_scalar_type (scalar_type);
3819 int nunits;
3820 enum tree_code code = gimple_assign_rhs_code (stmt);
3821 tree def_for_init;
3822 tree init_def;
3823 tree *elts;
3824 int i;
3825 bool nested_in_vect_loop = false;
3826 tree init_value;
3827 REAL_VALUE_TYPE real_init_val = dconst0;
3828 int int_init_val = 0;
3829 gimple def_stmt = NULL;
3831 gcc_assert (vectype);
3832 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3834 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3835 || SCALAR_FLOAT_TYPE_P (scalar_type));
3837 if (nested_in_vect_loop_p (loop, stmt))
3838 nested_in_vect_loop = true;
3839 else
3840 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3842 /* In case of double reduction we only create a vector variable to be put
3843 in the reduction phi node. The actual statement creation is done in
3844 vect_create_epilog_for_reduction. */
3845 if (adjustment_def && nested_in_vect_loop
3846 && TREE_CODE (init_val) == SSA_NAME
3847 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3848 && gimple_code (def_stmt) == GIMPLE_PHI
3849 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3850 && vinfo_for_stmt (def_stmt)
3851 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3852 == vect_double_reduction_def)
3854 *adjustment_def = NULL;
3855 return vect_create_destination_var (init_val, vectype);
3858 if (TREE_CONSTANT (init_val))
3860 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3861 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3862 else
3863 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3865 else
3866 init_value = init_val;
3868 switch (code)
3870 case WIDEN_SUM_EXPR:
3871 case DOT_PROD_EXPR:
3872 case SAD_EXPR:
3873 case PLUS_EXPR:
3874 case MINUS_EXPR:
3875 case BIT_IOR_EXPR:
3876 case BIT_XOR_EXPR:
3877 case MULT_EXPR:
3878 case BIT_AND_EXPR:
3879 /* ADJUSTMENT_DEF is NULL when called from
3880 vect_create_epilog_for_reduction to vectorize double reduction. */
3881 if (adjustment_def)
3883 if (nested_in_vect_loop)
3884 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3885 NULL);
3886 else
3887 *adjustment_def = init_val;
3890 if (code == MULT_EXPR)
3892 real_init_val = dconst1;
3893 int_init_val = 1;
3896 if (code == BIT_AND_EXPR)
3897 int_init_val = -1;
3899 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3900 def_for_init = build_real (scalar_type, real_init_val);
3901 else
3902 def_for_init = build_int_cst (scalar_type, int_init_val);
3904 /* Create a vector of '0' or '1' except the first element. */
3905 elts = XALLOCAVEC (tree, nunits);
3906 for (i = nunits - 2; i >= 0; --i)
3907 elts[i + 1] = def_for_init;
3909 /* Option1: the first element is '0' or '1' as well. */
3910 if (adjustment_def)
3912 elts[0] = def_for_init;
3913 init_def = build_vector (vectype, elts);
3914 break;
3917 /* Option2: the first element is INIT_VAL. */
3918 elts[0] = init_val;
3919 if (TREE_CONSTANT (init_val))
3920 init_def = build_vector (vectype, elts);
3921 else
3923 vec<constructor_elt, va_gc> *v;
3924 vec_alloc (v, nunits);
3925 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3926 for (i = 1; i < nunits; ++i)
3927 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3928 init_def = build_constructor (vectype, v);
3931 break;
3933 case MIN_EXPR:
3934 case MAX_EXPR:
3935 case COND_EXPR:
3936 if (adjustment_def)
3938 *adjustment_def = NULL_TREE;
3939 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3940 break;
3943 init_def = build_vector_from_val (vectype, init_value);
3944 break;
3946 default:
3947 gcc_unreachable ();
3950 return init_def;
3953 /* Function vect_create_epilog_for_reduction
3955 Create code at the loop-epilog to finalize the result of a reduction
3956 computation.
3958 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3959 reduction statements.
3960 STMT is the scalar reduction stmt that is being vectorized.
3961 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3962 number of elements that we can fit in a vectype (nunits). In this case
3963 we have to generate more than one vector stmt - i.e - we need to "unroll"
3964 the vector stmt by a factor VF/nunits. For more details see documentation
3965 in vectorizable_operation.
3966 REDUC_CODE is the tree-code for the epilog reduction.
3967 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3968 computation.
3969 REDUC_INDEX is the index of the operand in the right hand side of the
3970 statement that is defined by REDUCTION_PHI.
3971 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3972 SLP_NODE is an SLP node containing a group of reduction statements. The
3973 first one in this group is STMT.
3975 This function:
3976 1. Creates the reduction def-use cycles: sets the arguments for
3977 REDUCTION_PHIS:
3978 The loop-entry argument is the vectorized initial-value of the reduction.
3979 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3980 sums.
3981 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3982 by applying the operation specified by REDUC_CODE if available, or by
3983 other means (whole-vector shifts or a scalar loop).
3984 The function also creates a new phi node at the loop exit to preserve
3985 loop-closed form, as illustrated below.
3987 The flow at the entry to this function:
3989 loop:
3990 vec_def = phi <null, null> # REDUCTION_PHI
3991 VECT_DEF = vector_stmt # vectorized form of STMT
3992 s_loop = scalar_stmt # (scalar) STMT
3993 loop_exit:
3994 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3995 use <s_out0>
3996 use <s_out0>
3998 The above is transformed by this function into:
4000 loop:
4001 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4002 VECT_DEF = vector_stmt # vectorized form of STMT
4003 s_loop = scalar_stmt # (scalar) STMT
4004 loop_exit:
4005 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4006 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4007 v_out2 = reduce <v_out1>
4008 s_out3 = extract_field <v_out2, 0>
4009 s_out4 = adjust_result <s_out3>
4010 use <s_out4>
4011 use <s_out4>
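/* Concretely, for a plain sum vectorized with V4SI under the 'adjust in
   epilog' scheme: v_out1 holds four partial sums [s0,s1,s2,s3], v_out2
   reduces them to s0+s1+s2+s3 (by a target reduction, by whole-vector
   shifts, or by scalar code - see step 2.3 below), s_out3 extracts that
   value from lane 0, and s_out4 adds back the initial value recorded in
   ADJUSTMENT_DEF.  */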
4014 static void
4015 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4016 int ncopies, enum tree_code reduc_code,
4017 vec<gimple> reduction_phis,
4018 int reduc_index, bool double_reduc,
4019 slp_tree slp_node)
4021 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4022 stmt_vec_info prev_phi_info;
4023 tree vectype;
4024 machine_mode mode;
4025 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4026 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4027 basic_block exit_bb;
4028 tree scalar_dest;
4029 tree scalar_type;
4030 gimple new_phi = NULL, phi;
4031 gimple_stmt_iterator exit_gsi;
4032 tree vec_dest;
4033 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4034 gimple epilog_stmt = NULL;
4035 enum tree_code code = gimple_assign_rhs_code (stmt);
4036 gimple exit_phi;
4037 tree bitsize;
4038 tree adjustment_def = NULL;
4039 tree vec_initial_def = NULL;
4040 tree reduction_op, expr, def;
4041 tree orig_name, scalar_result;
4042 imm_use_iterator imm_iter, phi_imm_iter;
4043 use_operand_p use_p, phi_use_p;
4044 gimple use_stmt, orig_stmt, reduction_phi = NULL;
4045 bool nested_in_vect_loop = false;
4046 auto_vec<gimple> new_phis;
4047 auto_vec<gimple> inner_phis;
4048 enum vect_def_type dt = vect_unknown_def_type;
4049 int j, i;
4050 auto_vec<tree> scalar_results;
4051 unsigned int group_size = 1, k, ratio;
4052 auto_vec<tree> vec_initial_defs;
4053 auto_vec<gimple> phis;
4054 bool slp_reduc = false;
4055 tree new_phi_result;
4056 gimple inner_phi = NULL;
4058 if (slp_node)
4059 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4061 if (nested_in_vect_loop_p (loop, stmt))
4063 outer_loop = loop;
4064 loop = loop->inner;
4065 nested_in_vect_loop = true;
4066 gcc_assert (!slp_node);
4069 reduction_op = get_reduction_op (stmt, reduc_index);
4071 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4072 gcc_assert (vectype);
4073 mode = TYPE_MODE (vectype);
4075 /* 1. Create the reduction def-use cycle:
4076 Set the arguments of REDUCTION_PHIS, i.e., transform
4078 loop:
4079 vec_def = phi <null, null> # REDUCTION_PHI
4080 VECT_DEF = vector_stmt # vectorized form of STMT
4083 into:
4085 loop:
4086 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4087 VECT_DEF = vector_stmt # vectorized form of STMT
4090 (in case of SLP, do it for all the phis). */
4092 /* Get the loop-entry arguments. */
4093 if (slp_node)
4094 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4095 NULL, slp_node, reduc_index);
4096 else
4098 vec_initial_defs.create (1);
4099 /* For the case of reduction, vect_get_vec_def_for_operand returns
4100 the scalar def before the loop, that defines the initial value
4101 of the reduction variable. */
4102 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4103 &adjustment_def);
4104 vec_initial_defs.quick_push (vec_initial_def);
4107 /* Set phi nodes arguments. */
4108 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4110 tree vec_init_def, def;
4111 gimple_seq stmts;
4112 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4113 true, NULL_TREE);
4114 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4115 def = vect_defs[i];
4116 for (j = 0; j < ncopies; j++)
4118 /* Set the loop-entry arg of the reduction-phi. */
4119 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4120 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4122 /* Set the loop-latch arg for the reduction-phi. */
4123 if (j > 0)
4124 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4126 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4127 UNKNOWN_LOCATION);
4129 if (dump_enabled_p ())
4131 dump_printf_loc (MSG_NOTE, vect_location,
4132 "transform reduction: created def-use cycle: ");
4133 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4134 dump_printf (MSG_NOTE, "\n");
4135 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4136 dump_printf (MSG_NOTE, "\n");
4139 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4143 /* 2. Create epilog code.
4144 The reduction epilog code operates across the elements of the vector
4145 of partial results computed by the vectorized loop.
4146 The reduction epilog code consists of:
4148 step 1: compute the scalar result in a vector (v_out2)
4149 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4150 step 3: adjust the scalar result (s_out3) if needed.
4152 Step 1 can be accomplished using one of the following three schemes:
4153 (scheme 1) using reduc_code, if available.
4154 (scheme 2) using whole-vector shifts, if available.
4155 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4156 combined.
4158 The overall epilog code looks like this:
4160 s_out0 = phi <s_loop> # original EXIT_PHI
4161 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4162 v_out2 = reduce <v_out1> # step 1
4163 s_out3 = extract_field <v_out2, 0> # step 2
4164 s_out4 = adjust_result <s_out3> # step 3
4166 (step 3 is optional, and steps 1 and 2 may be combined).
4167 Lastly, the uses of s_out0 are replaced by s_out4. */
4170 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4171 v_out1 = phi <VECT_DEF>
4172 Store them in NEW_PHIS. */
4174 exit_bb = single_exit (loop)->dest;
4175 prev_phi_info = NULL;
4176 new_phis.create (vect_defs.length ());
4177 FOR_EACH_VEC_ELT (vect_defs, i, def)
4179 for (j = 0; j < ncopies; j++)
4181 tree new_def = copy_ssa_name (def);
4182 phi = create_phi_node (new_def, exit_bb);
4183 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4184 if (j == 0)
4185 new_phis.quick_push (phi);
4186 else
4188 def = vect_get_vec_def_for_stmt_copy (dt, def);
4189 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4192 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4193 prev_phi_info = vinfo_for_stmt (phi);
4197 /* The epilogue is created for the outer-loop, i.e., for the loop being
4198 vectorized. Create exit phis for the outer loop. */
4199 if (double_reduc)
4201 loop = outer_loop;
4202 exit_bb = single_exit (loop)->dest;
4203 inner_phis.create (vect_defs.length ());
4204 FOR_EACH_VEC_ELT (new_phis, i, phi)
4206 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4207 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4208 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4209 PHI_RESULT (phi));
4210 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4211 loop_vinfo, NULL));
4212 inner_phis.quick_push (phi);
4213 new_phis[i] = outer_phi;
4214 prev_phi_info = vinfo_for_stmt (outer_phi);
4215 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4217 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4218 new_result = copy_ssa_name (PHI_RESULT (phi));
4219 outer_phi = create_phi_node (new_result, exit_bb);
4220 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4221 PHI_RESULT (phi));
4222 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4223 loop_vinfo, NULL));
4224 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4225 prev_phi_info = vinfo_for_stmt (outer_phi);
4230 exit_gsi = gsi_after_labels (exit_bb);
4232 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4233 (i.e. when reduc_code is not available) and in the final adjustment
4234 code (if needed). Also get the original scalar reduction variable as
4235 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4236 represents a reduction pattern), the tree-code and scalar-def are
4237 taken from the original stmt that the pattern-stmt (STMT) replaces.
4238 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4239 are taken from STMT. */
4241 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4242 if (!orig_stmt)
4244 /* Regular reduction */
4245 orig_stmt = stmt;
4247 else
4249 /* Reduction pattern */
4250 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4251 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4252 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4255 code = gimple_assign_rhs_code (orig_stmt);
4256 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4257 partial results are added and not subtracted. */
4258 if (code == MINUS_EXPR)
4259 code = PLUS_EXPR;
4261 scalar_dest = gimple_assign_lhs (orig_stmt);
4262 scalar_type = TREE_TYPE (scalar_dest);
4263 scalar_results.create (group_size);
4264 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4265 bitsize = TYPE_SIZE (scalar_type);
4267 /* In case this is a reduction in an inner-loop while vectorizing an outer
4268 loop - we don't need to extract a single scalar result at the end of the
4269 inner-loop (unless it is double reduction, i.e., the use of reduction is
4270 outside the outer-loop). The final vector of partial results will be used
4271 in the vectorized outer-loop, or reduced to a scalar result at the end of
4272 the outer-loop. */
4273 if (nested_in_vect_loop && !double_reduc)
4274 goto vect_finalize_reduction;
4276 /* SLP reduction without reduction chain, e.g.,
4277 # a1 = phi <a2, a0>
4278 # b1 = phi <b2, b0>
4279 a2 = operation (a1)
4280 b2 = operation (b1) */
4281 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4283 /* In case of reduction chain, e.g.,
4284 # a1 = phi <a3, a0>
4285 a2 = operation (a1)
4286 a3 = operation (a2),
4288 we may end up with more than one vector result. Here we reduce them to
4289 one vector. */
4290 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4292 tree first_vect = PHI_RESULT (new_phis[0]);
4293 tree tmp;
4294 gassign *new_vec_stmt = NULL;
4296 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4297 for (k = 1; k < new_phis.length (); k++)
4299 gimple next_phi = new_phis[k];
4300 tree second_vect = PHI_RESULT (next_phi);
4302 tmp = build2 (code, vectype, first_vect, second_vect);
4303 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4304 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4305 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4306 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4309 new_phi_result = first_vect;
4310 if (new_vec_stmt)
4312 new_phis.truncate (0);
4313 new_phis.safe_push (new_vec_stmt);
4316 else
4317 new_phi_result = PHI_RESULT (new_phis[0]);
4319 /* 2.3 Create the reduction code, using one of the three schemes described
4320 above. In SLP we simply need to extract all the elements from the
4321 vector (without reducing them), so we use scalar shifts. */
4322 if (reduc_code != ERROR_MARK && !slp_reduc)
4324 tree tmp;
4325 tree vec_elem_type;
4327 /*** Case 1: Create:
4328 v_out2 = reduc_expr <v_out1> */
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_NOTE, vect_location,
4332 "Reduce using direct vector reduction.\n");
4334 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4335 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4337 tree tmp_dest =
4338 vect_create_destination_var (scalar_dest, vec_elem_type);
4339 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4340 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4341 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4342 gimple_assign_set_lhs (epilog_stmt, new_temp);
4343 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4345 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4347 else
4348 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4349 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4350 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4351 gimple_assign_set_lhs (epilog_stmt, new_temp);
4352 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4353 scalar_results.safe_push (new_temp);
4355 else
4357 bool reduce_with_shift = have_whole_vector_shift (mode);
4358 int element_bitsize = tree_to_uhwi (bitsize);
4359 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4360 tree vec_temp;
4362 /* Regardless of whether we have a whole vector shift, if we're
4363 emulating the operation via tree-vect-generic, we don't want
4364 to use it. Only the first round of the reduction is likely
4365 to still be profitable via emulation. */
4366 /* ??? It might be better to emit a reduction tree code here, so that
4367 tree-vect-generic can expand the first round via bit tricks. */
4368 if (!VECTOR_MODE_P (mode))
4369 reduce_with_shift = false;
4370 else
4372 optab optab = optab_for_tree_code (code, vectype, optab_default);
4373 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4374 reduce_with_shift = false;
4377 if (reduce_with_shift && !slp_reduc)
4379 int nelements = vec_size_in_bits / element_bitsize;
4380 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4382 int elt_offset;
4384 tree zero_vec = build_zero_cst (vectype);
4385 /*** Case 2: Create:
4386 for (offset = nelements/2; offset >= 1; offset/=2)
4388 Create: va' = vec_shift <va, offset>
4389 Create: va = vop <va, va'>
4390 } */
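/* For a 4-element sum this takes log2(4) == 2 rounds.  Starting from
   va = {a0,a1,a2,a3}:
     round 1: va' = {a2,a3,0,0},       va = {a0+a2, a1+a3, ., .}
     round 2: va' = {a1+a3, ., ., 0},  va = {a0+a1+a2+a3, ., ., .}
   (assuming the permutation shifts elements towards lane 0, the lane
   extracted in step 2.4 below; '.' marks don't-care lanes).  */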
4392 tree rhs;
4394 if (dump_enabled_p ())
4395 dump_printf_loc (MSG_NOTE, vect_location,
4396 "Reduce using vector shifts\n");
4398 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4399 new_temp = new_phi_result;
4400 for (elt_offset = nelements / 2;
4401 elt_offset >= 1;
4402 elt_offset /= 2)
4404 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4405 tree mask = vect_gen_perm_mask_any (vectype, sel);
4406 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4407 new_temp, zero_vec, mask);
4408 new_name = make_ssa_name (vec_dest, epilog_stmt);
4409 gimple_assign_set_lhs (epilog_stmt, new_name);
4410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4412 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4413 new_temp);
4414 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4415 gimple_assign_set_lhs (epilog_stmt, new_temp);
4416 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4419 /* 2.4 Extract the final scalar result. Create:
4420 s_out3 = extract_field <v_out2, bitpos> */
4422 if (dump_enabled_p ())
4423 dump_printf_loc (MSG_NOTE, vect_location,
4424 "extract scalar result\n");
4426 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4427 bitsize, bitsize_zero_node);
4428 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4429 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4430 gimple_assign_set_lhs (epilog_stmt, new_temp);
4431 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4432 scalar_results.safe_push (new_temp);
4434 else
4436 /*** Case 3: Create:
4437 s = extract_field <v_out2, 0>
4438 for (offset = element_size;
4439 offset < vector_size;
4440 offset += element_size;)
4442 Create: s' = extract_field <v_out2, offset>
4443 Create: s = op <s, s'> // For non SLP cases
4444 } */
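/* For a V4SI sum (32-bit elements) this emits roughly:
     s  = BIT_FIELD_REF <v_out2, 32, 0>
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s'
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s'
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s'
   In the SLP case the extracted values are pushed into SCALAR_RESULTS
   without being combined.  */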
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "Reduce using scalar code.\n");
4450 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4451 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4453 int bit_offset;
4454 if (gimple_code (new_phi) == GIMPLE_PHI)
4455 vec_temp = PHI_RESULT (new_phi);
4456 else
4457 vec_temp = gimple_assign_lhs (new_phi);
4458 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4459 bitsize_zero_node);
4460 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4462 gimple_assign_set_lhs (epilog_stmt, new_temp);
4463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4465 /* In SLP we don't need to apply reduction operation, so we just
4466 collect s' values in SCALAR_RESULTS. */
4467 if (slp_reduc)
4468 scalar_results.safe_push (new_temp);
4470 for (bit_offset = element_bitsize;
4471 bit_offset < vec_size_in_bits;
4472 bit_offset += element_bitsize)
4474 tree bitpos = bitsize_int (bit_offset);
4475 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4476 bitsize, bitpos);
4478 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4479 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4480 gimple_assign_set_lhs (epilog_stmt, new_name);
4481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4483 if (slp_reduc)
4485 /* In SLP we don't need to apply reduction operation, so
4486 we just collect s' values in SCALAR_RESULTS. */
4487 new_temp = new_name;
4488 scalar_results.safe_push (new_name);
4490 else
4492 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4493 new_name, new_temp);
4494 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4495 gimple_assign_set_lhs (epilog_stmt, new_temp);
4496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4501 /* The only case where we need to reduce scalar results in SLP is
4502 unrolling.  If the size of SCALAR_RESULTS is greater than
4503 GROUP_SIZE, we reduce them by combining elements modulo
4504 GROUP_SIZE. */
4505 if (slp_reduc)
4507 tree res, first_res, new_res;
4508 gimple new_stmt;
4510 /* Reduce multiple scalar results in case of SLP unrolling. */
4511 for (j = group_size; scalar_results.iterate (j, &res);
4512 j++)
4514 first_res = scalar_results[j % group_size];
4515 new_stmt = gimple_build_assign (new_scalar_dest, code,
4516 first_res, res);
4517 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4518 gimple_assign_set_lhs (new_stmt, new_res);
4519 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4520 scalar_results[j % group_size] = new_res;
4523 else
4524 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4525 scalar_results.safe_push (new_temp);
4529 vect_finalize_reduction:
4531 if (double_reduc)
4532 loop = loop->inner;
4534 /* 2.5 Adjust the final result by the initial value of the reduction
4535 variable. (When such adjustment is not needed, then
4536 'adjustment_def' is zero). For example, if code is PLUS we create:
4537 new_temp = loop_exit_def + adjustment_def */
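/* E.g., for the sum example in get_initial_def_for_reduction with
   init_val == 5, ADJUSTMENT_DEF is 5, so we emit
   new_temp = scalar_result + 5 (or the corresponding vector add in the
   nested-loop case below).  */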
4539 if (adjustment_def)
4541 gcc_assert (!slp_reduc);
4542 if (nested_in_vect_loop)
4544 new_phi = new_phis[0];
4545 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4546 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4547 new_dest = vect_create_destination_var (scalar_dest, vectype);
4549 else
4551 new_temp = scalar_results[0];
4552 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4553 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4554 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4557 epilog_stmt = gimple_build_assign (new_dest, expr);
4558 new_temp = make_ssa_name (new_dest, epilog_stmt);
4559 gimple_assign_set_lhs (epilog_stmt, new_temp);
4560 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4561 if (nested_in_vect_loop)
4563 set_vinfo_for_stmt (epilog_stmt,
4564 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4565 NULL));
4566 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4567 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4569 if (!double_reduc)
4570 scalar_results.quick_push (new_temp);
4571 else
4572 scalar_results[0] = new_temp;
4574 else
4575 scalar_results[0] = new_temp;
4577 new_phis[0] = epilog_stmt;
4580 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4581 phis with new adjusted scalar results, i.e., replace use <s_out0>
4582 with use <s_out4>.
4584 Transform:
4585 loop_exit:
4586 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4587 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4588 v_out2 = reduce <v_out1>
4589 s_out3 = extract_field <v_out2, 0>
4590 s_out4 = adjust_result <s_out3>
4591 use <s_out0>
4592 use <s_out0>
4594 into:
4596 loop_exit:
4597 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4598 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4599 v_out2 = reduce <v_out1>
4600 s_out3 = extract_field <v_out2, 0>
4601 s_out4 = adjust_result <s_out3>
4602 use <s_out4>
4603 use <s_out4> */
4606 /* In SLP reduction chain we reduce vector results into one vector if
4607 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4608 the last stmt in the reduction chain, since we are looking for the loop
4609 exit phi node. */
4610 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4612 gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4613 /* Handle reduction patterns. */
4614 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4615 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4617 scalar_dest = gimple_assign_lhs (dest_stmt);
4618 group_size = 1;
4621 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4622 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4623 need to match SCALAR_RESULTS with corresponding statements. The first
4624 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4625 the first vector stmt, etc.
4626 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
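/* For example, with GROUP_SIZE == 4 and two vector stmts in NEW_PHIS,
   RATIO is 2: scalar results 0 and 1 are matched with the first vector
   stmt and scalar results 2 and 3 with the second.  */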
4627 if (group_size > new_phis.length ())
4629 ratio = group_size / new_phis.length ();
4630 gcc_assert (!(group_size % new_phis.length ()));
4632 else
4633 ratio = 1;
4635 for (k = 0; k < group_size; k++)
4637 if (k % ratio == 0)
4639 epilog_stmt = new_phis[k / ratio];
4640 reduction_phi = reduction_phis[k / ratio];
4641 if (double_reduc)
4642 inner_phi = inner_phis[k / ratio];
4645 if (slp_reduc)
4647 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4649 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4650 /* SLP statements can't participate in patterns. */
4651 gcc_assert (!orig_stmt);
4652 scalar_dest = gimple_assign_lhs (current_stmt);
4655 phis.create (3);
4656 /* Find the loop-closed-use at the loop exit of the original scalar
4657 result. (The reduction result is expected to have two immediate uses -
4658 one at the latch block, and one at the loop exit). */
4659 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4660 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4661 && !is_gimple_debug (USE_STMT (use_p)))
4662 phis.safe_push (USE_STMT (use_p));
4664 /* While we expect to have found an exit_phi because of loop-closed-ssa
4665 form, we can end up without one if the scalar cycle is dead. */
4667 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4669 if (outer_loop)
4671 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4672 gphi *vect_phi;
4674 /* FORNOW. Currently not supporting the case that an inner-loop
4675 reduction is not used in the outer-loop (but only outside the
4676 outer-loop), unless it is double reduction. */
4677 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4678 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4679 || double_reduc);
4681 if (double_reduc)
4682 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4683 else
4684 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4685 if (!double_reduc
4686 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4687 != vect_double_reduction_def)
4688 continue;
4690 /* Handle double reduction:
4692 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4693 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4694 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4695 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4697 At that point the regular reduction (stmt2 and stmt3) is
4698 already vectorized, as well as the exit phi node, stmt4.
4699 Here we vectorize the phi node of double reduction, stmt1, and
4700 update all relevant statements. */
4702 /* Go through all the uses of s2 to find double reduction phi
4703 node, i.e., stmt1 above. */
4704 orig_name = PHI_RESULT (exit_phi);
4705 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4707 stmt_vec_info use_stmt_vinfo;
4708 stmt_vec_info new_phi_vinfo;
4709 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4710 basic_block bb = gimple_bb (use_stmt);
4711 gimple use;
4713 /* Check that USE_STMT is really a double reduction phi
4714 node. */
4715 if (gimple_code (use_stmt) != GIMPLE_PHI
4716 || gimple_phi_num_args (use_stmt) != 2
4717 || bb->loop_father != outer_loop)
4718 continue;
4719 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4720 if (!use_stmt_vinfo
4721 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4722 != vect_double_reduction_def)
4723 continue;
4725 /* Create vector phi node for double reduction:
4726 vs1 = phi <vs0, vs2>
4727 vs1 was created previously in this function by a call to
4728 vect_get_vec_def_for_operand and is stored in
4729 vec_initial_def;
4730 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4731 vs0 is created here. */
4733 /* Create vector phi node. */
4734 vect_phi = create_phi_node (vec_initial_def, bb);
4735 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4736 loop_vec_info_for_loop (outer_loop), NULL);
4737 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4739 /* Create vs0 - initial def of the double reduction phi. */
4740 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4741 loop_preheader_edge (outer_loop));
4742 init_def = get_initial_def_for_reduction (stmt,
4743 preheader_arg, NULL);
4744 vect_phi_init = vect_init_vector (use_stmt, init_def,
4745 vectype, NULL);
4747 /* Update phi node arguments with vs0 and vs2. */
4748 add_phi_arg (vect_phi, vect_phi_init,
4749 loop_preheader_edge (outer_loop),
4750 UNKNOWN_LOCATION);
4751 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4752 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4753 if (dump_enabled_p ())
4755 dump_printf_loc (MSG_NOTE, vect_location,
4756 "created double reduction phi node: ");
4757 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4758 dump_printf (MSG_NOTE, "\n");
4761 vect_phi_res = PHI_RESULT (vect_phi);
4763 /* Replace the use, i.e., set the correct vs1 in the regular
4764 reduction phi node. FORNOW, NCOPIES is always 1, so the
4765 loop is redundant. */
4766 use = reduction_phi;
4767 for (j = 0; j < ncopies; j++)
4769 edge pr_edge = loop_preheader_edge (loop);
4770 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4771 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4777 phis.release ();
4778 if (nested_in_vect_loop)
4780 if (double_reduc)
4781 loop = outer_loop;
4782 else
4783 continue;
4786 phis.create (3);
4787 /* Find the loop-closed-use at the loop exit of the original scalar
4788 result. (The reduction result is expected to have two immediate uses,
4789 one at the latch block, and one at the loop exit). For double
4790 reductions we are looking for exit phis of the outer loop. */
4791 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4793 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4795 if (!is_gimple_debug (USE_STMT (use_p)))
4796 phis.safe_push (USE_STMT (use_p));
4798 else
4800 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4802 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4804 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4806 if (!flow_bb_inside_loop_p (loop,
4807 gimple_bb (USE_STMT (phi_use_p)))
4808 && !is_gimple_debug (USE_STMT (phi_use_p)))
4809 phis.safe_push (USE_STMT (phi_use_p));
4815 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4817 /* Replace the uses: */
4818 orig_name = PHI_RESULT (exit_phi);
4819 scalar_result = scalar_results[k];
4820 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4821 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4822 SET_USE (use_p, scalar_result);
4825 phis.release ();
4830 /* Function vectorizable_reduction.
4832 Check if STMT performs a reduction operation that can be vectorized.
4833 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4834 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4835 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4837 This function also handles reduction idioms (patterns) that have been
4838 recognized in advance during vect_pattern_recog. In this case, STMT may be
4839 of this form:
4840 X = pattern_expr (arg0, arg1, ..., X)
4841 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4842 sequence that had been detected and replaced by the pattern-stmt (STMT).
4844 In some cases of reduction patterns, the type of the reduction variable X is
4845 different than the type of the other arguments of STMT.
4846 In such cases, the vectype that is used when transforming STMT into a vector
4847 stmt is different than the vectype that is used to determine the
4848 vectorization factor, because it consists of a different number of elements
4849 than the actual number of elements that are being operated upon in parallel.
4851 For example, consider an accumulation of shorts into an int accumulator.
4852 On some targets it's possible to vectorize this pattern operating on 8
4853 shorts at a time (hence, the vectype for purposes of determining the
4854 vectorization factor should be V8HI); on the other hand, the vectype that
4855 is used to create the vector form is actually V4SI (the type of the result).
4857 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4858 indicates what is the actual level of parallelism (V8HI in the example), so
4859 that the right vectorization factor would be derived. This vectype
4860 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4861 be used to create the vectorized stmt. The right vectype for the vectorized
4862 stmt is obtained from the type of the result X:
4863 get_vectype_for_scalar_type (TREE_TYPE (X))
4865 This means that, contrary to "regular" reductions (or "regular" stmts in
4866 general), the following equation:
4867 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4868 does *NOT* necessarily hold for reduction patterns. */
4870 bool
4871 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4872 gimple *vec_stmt, slp_tree slp_node)
4874 tree vec_dest;
4875 tree scalar_dest;
4876 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4877 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4878 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4879 tree vectype_in = NULL_TREE;
4880 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4881 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4882 enum tree_code code, orig_code, epilog_reduc_code;
4883 machine_mode vec_mode;
4884 int op_type;
4885 optab optab, reduc_optab;
4886 tree new_temp = NULL_TREE;
4887 tree def;
4888 gimple def_stmt;
4889 enum vect_def_type dt;
4890 gphi *new_phi = NULL;
4891 tree scalar_type;
4892 bool is_simple_use;
4893 gimple orig_stmt;
4894 stmt_vec_info orig_stmt_info;
4895 tree expr = NULL_TREE;
4896 int i;
4897 int ncopies;
4898 int epilog_copies;
4899 stmt_vec_info prev_stmt_info, prev_phi_info;
4900 bool single_defuse_cycle = false;
4901 tree reduc_def = NULL_TREE;
4902 gimple new_stmt = NULL;
4903 int j;
4904 tree ops[3];
4905 bool nested_cycle = false, found_nested_cycle_def = false;
4906 gimple reduc_def_stmt = NULL;
4907 bool double_reduc = false, dummy;
4908 basic_block def_bb;
4909 struct loop * def_stmt_loop, *outer_loop = NULL;
4910 tree def_arg;
4911 gimple def_arg_stmt;
4912 auto_vec<tree> vec_oprnds0;
4913 auto_vec<tree> vec_oprnds1;
4914 auto_vec<tree> vect_defs;
4915 auto_vec<gimple> phis;
4916 int vec_num;
4917 tree def0, def1, tem, op0, op1 = NULL_TREE;
4918 bool first_p = true;
4920 /* In case of reduction chain we switch to the first stmt in the chain, but
4921 we don't update STMT_INFO, since only the last stmt is marked as reduction
4922 and has reduction properties. */
4923 if (GROUP_FIRST_ELEMENT (stmt_info)
4924 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4926 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4927 first_p = false;
4930 if (nested_in_vect_loop_p (loop, stmt))
4932 outer_loop = loop;
4933 loop = loop->inner;
4934 nested_cycle = true;
4937 /* 1. Is vectorizable reduction? */
4938 /* Not supportable if the reduction variable is used in the loop, unless
4939 it's a reduction chain. */
4940 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4941 && !GROUP_FIRST_ELEMENT (stmt_info))
4942 return false;
4944 /* Reductions that are not used even in an enclosing outer-loop,
4945 are expected to be "live" (used out of the loop). */
4946 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4947 && !STMT_VINFO_LIVE_P (stmt_info))
4948 return false;
4950 /* Make sure it was already recognized as a reduction computation. */
4951 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4952 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4953 return false;
4955 /* 2. Has this been recognized as a reduction pattern?
4957 Check if STMT represents a pattern that has been recognized
4958 in earlier analysis stages. For stmts that represent a pattern,
4959 the STMT_VINFO_RELATED_STMT field records the last stmt in
4960 the original sequence that constitutes the pattern. */
4962 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4963 if (orig_stmt)
4965 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4966 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4967 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4970 /* 3. Check the operands of the operation. The first operands are defined
4971 inside the loop body. The last operand is the reduction variable,
4972 which is defined by the loop-header-phi. */
4974 gcc_assert (is_gimple_assign (stmt));
4976 /* Flatten RHS. */
4977 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4979 case GIMPLE_SINGLE_RHS:
4980 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4981 if (op_type == ternary_op)
4983 tree rhs = gimple_assign_rhs1 (stmt);
4984 ops[0] = TREE_OPERAND (rhs, 0);
4985 ops[1] = TREE_OPERAND (rhs, 1);
4986 ops[2] = TREE_OPERAND (rhs, 2);
4987 code = TREE_CODE (rhs);
4989 else
4990 return false;
4991 break;
4993 case GIMPLE_BINARY_RHS:
4994 code = gimple_assign_rhs_code (stmt);
4995 op_type = TREE_CODE_LENGTH (code);
4996 gcc_assert (op_type == binary_op);
4997 ops[0] = gimple_assign_rhs1 (stmt);
4998 ops[1] = gimple_assign_rhs2 (stmt);
4999 break;
5001 case GIMPLE_TERNARY_RHS:
5002 code = gimple_assign_rhs_code (stmt);
5003 op_type = TREE_CODE_LENGTH (code);
5004 gcc_assert (op_type == ternary_op);
5005 ops[0] = gimple_assign_rhs1 (stmt);
5006 ops[1] = gimple_assign_rhs2 (stmt);
5007 ops[2] = gimple_assign_rhs3 (stmt);
5008 break;
5010 case GIMPLE_UNARY_RHS:
5011 return false;
5013 default:
5014 gcc_unreachable ();
5016 /* The default is that the reduction variable is the last operand in the statement. */
5017 int reduc_index = op_type - 1;
5019 if (code == COND_EXPR && slp_node)
5020 return false;
5022 scalar_dest = gimple_assign_lhs (stmt);
5023 scalar_type = TREE_TYPE (scalar_dest);
5024 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5025 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5026 return false;
5028 /* Do not try to vectorize bit-precision reductions. */
5029 if ((TYPE_PRECISION (scalar_type)
5030 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5031 return false;
5033 /* All uses but the last are expected to be defined in the loop.
5034 The last use is the reduction variable. In case of nested cycle this
5035 assumption is not true: we use reduc_index to record the index of the
5036 reduction variable. */
5037 for (i = 0; i < op_type - 1; i++)
5039 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5040 if (i == 0 && code == COND_EXPR)
5041 continue;
5043 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5044 &def_stmt, &def, &dt, &tem);
5045 if (!vectype_in)
5046 vectype_in = tem;
5047 gcc_assert (is_simple_use);
5049 if (dt != vect_internal_def
5050 && dt != vect_external_def
5051 && dt != vect_constant_def
5052 && dt != vect_induction_def
5053 && !(dt == vect_nested_cycle && nested_cycle))
5054 return false;
5056 if (dt == vect_nested_cycle)
5058 found_nested_cycle_def = true;
5059 reduc_def_stmt = def_stmt;
5060 reduc_index = i;
5064 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5065 &def_stmt, &def, &dt, &tem);
5066 if (!vectype_in)
5067 vectype_in = tem;
5068 gcc_assert (is_simple_use);
5069 if (!found_nested_cycle_def)
5070 reduc_def_stmt = def_stmt;
5072 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5073 return false;
5075 if (!(dt == vect_reduction_def
5076 || dt == vect_nested_cycle
5077 || ((dt == vect_internal_def || dt == vect_external_def
5078 || dt == vect_constant_def || dt == vect_induction_def)
5079 && nested_cycle && found_nested_cycle_def)))
5081 /* For pattern recognized stmts, orig_stmt might be a reduction,
5082 but some helper statements for the pattern might not, or
5083 might be COND_EXPRs with reduction uses in the condition. */
5084 gcc_assert (orig_stmt);
5085 return false;
5088 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5089 !nested_cycle, &dummy);
5090 if (orig_stmt)
5091 gcc_assert (tmp == orig_stmt
5092 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5093 else
5094 /* We changed STMT to be the first stmt in reduction chain, hence we
5095 check that in this case the first element in the chain is STMT. */
5096 gcc_assert (stmt == tmp
5097 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5099 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5100 return false;
5102 if (slp_node || PURE_SLP_STMT (stmt_info))
5103 ncopies = 1;
5104 else
5105 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5106 / TYPE_VECTOR_SUBPARTS (vectype_in));
5108 gcc_assert (ncopies >= 1);
5110 vec_mode = TYPE_MODE (vectype_in);
5112 if (code == COND_EXPR)
5114 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5116 if (dump_enabled_p ())
5117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5118 "unsupported condition in reduction\n");
5120 return false;
5123 else
5125 /* 4. Supportable by target? */
5127 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5128 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5130 /* Shifts and rotates are only supported by vectorizable_shifts,
5131 not vectorizable_reduction. */
5132 if (dump_enabled_p ())
5133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5134 "unsupported shift or rotation.\n");
5135 return false;
5138 /* 4.1. check support for the operation in the loop */
5139 optab = optab_for_tree_code (code, vectype_in, optab_default);
5140 if (!optab)
5142 if (dump_enabled_p ())
5143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5144 "no optab.\n");
5146 return false;
5149 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5151 if (dump_enabled_p ())
5152 dump_printf (MSG_NOTE, "op not supported by target.\n");
5154 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5155 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5156 < vect_min_worthwhile_factor (code))
5157 return false;
5159 if (dump_enabled_p ())
5160 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5163 /* Worthwhile without SIMD support? */
5164 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5165 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5166 < vect_min_worthwhile_factor (code))
5168 if (dump_enabled_p ())
5169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5170 "not worthwhile without SIMD support.\n");
5172 return false;
5176 /* 4.2. Check support for the epilog operation.
5178 If STMT represents a reduction pattern, then the type of the
5179 reduction variable may be different than the type of the rest
5180 of the arguments. For example, consider the case of accumulation
5181 of shorts into an int accumulator; The original code:
5182 S1: int_a = (int) short_a;
5183 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5185 was replaced with:
5186 STMT: int_acc = widen_sum <short_a, int_acc>
5188 This means that:
5189 1. The tree-code that is used to create the vector operation in the
5190 epilog code (that reduces the partial results) is not the
5191 tree-code of STMT, but is rather the tree-code of the original
5192 stmt from the pattern that STMT is replacing. I.e, in the example
5193 above we want to use 'widen_sum' in the loop, but 'plus' in the
5194 epilog.
5195 2. The type (mode) we use to check available target support
5196 for the vector operation to be created in the *epilog*, is
5197 determined by the type of the reduction variable (in the example
5198 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5199 However the type (mode) we use to check available target support
5200 for the vector operation to be created *inside the loop*, is
5201 determined by the type of the other arguments to STMT (in the
5202 example we'd check this: optab_handler (widen_sum_optab,
5203 vect_short_mode)).
5205 This is contrary to "regular" reductions, in which the types of all
5206 the arguments are the same as the type of the reduction variable.
5207 For "regular" reductions we can therefore use the same vector type
5208 (and also the same tree-code) when generating the epilog code and
5209 when generating the code inside the loop. */
5211 if (orig_stmt)
5213 /* This is a reduction pattern: get the vectype from the type of the
5214 reduction variable, and get the tree-code from orig_stmt. */
5215 orig_code = gimple_assign_rhs_code (orig_stmt);
5216 gcc_assert (vectype_out);
5217 vec_mode = TYPE_MODE (vectype_out);
5219 else
5221 /* Regular reduction: use the same vectype and tree-code as used for
5222 the vector code inside the loop can be used for the epilog code. */
5223 orig_code = code;
5226 if (nested_cycle)
5228 def_bb = gimple_bb (reduc_def_stmt);
5229 def_stmt_loop = def_bb->loop_father;
5230 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5231 loop_preheader_edge (def_stmt_loop));
5232 if (TREE_CODE (def_arg) == SSA_NAME
5233 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5234 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5235 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5236 && vinfo_for_stmt (def_arg_stmt)
5237 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5238 == vect_double_reduction_def)
5239 double_reduc = true;
5242 epilog_reduc_code = ERROR_MARK;
5243 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5245 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5246 optab_default);
5247 if (!reduc_optab)
5249 if (dump_enabled_p ())
5250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5251 "no optab for reduction.\n");
5253 epilog_reduc_code = ERROR_MARK;
5255 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5257 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5258 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5260 if (dump_enabled_p ())
5261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5262 "reduc op not supported by target.\n");
5264 epilog_reduc_code = ERROR_MARK;
5268 else
5270 if (!nested_cycle || double_reduc)
5272 if (dump_enabled_p ())
5273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5274 "no reduc code for scalar code.\n");
5276 return false;
5280 if (double_reduc && ncopies > 1)
5282 if (dump_enabled_p ())
5283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5284 "multiple types in double reduction\n");
5286 return false;
5289 /* In case of widening multiplication by a constant, we update the type
5290 of the constant to be the type of the other operand. We check that the
5291 constant fits the type in the pattern recognition pass. */
5292 if (code == DOT_PROD_EXPR
5293 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5295 if (TREE_CODE (ops[0]) == INTEGER_CST)
5296 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5297 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5298 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5299 else
5301 if (dump_enabled_p ())
5302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5303 "invalid types in dot-prod\n");
5305 return false;
5309 if (!vec_stmt) /* transformation not required. */
5311 if (first_p
5312 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5313 reduc_index))
5314 return false;
5315 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5316 return true;
5319 /** Transform. **/
5321 if (dump_enabled_p ())
5322 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5324 /* FORNOW: Multiple types are not supported for condition. */
5325 if (code == COND_EXPR)
5326 gcc_assert (ncopies == 1);
5328 /* Create the destination vector */
5329 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5331 /* In case the vectorization factor (VF) is bigger than the number
5332 of elements that we can fit in a vectype (nunits), we have to generate
5333 more than one vector stmt - i.e - we need to "unroll" the
5334 vector stmt by a factor VF/nunits. For more details see documentation
5335 in vectorizable_operation. */
5337 /* If the reduction is used in an outer loop we need to generate
5338 VF intermediate results, like so (e.g. for ncopies=2):
5339 r0 = phi (init, r0)
5340 r1 = phi (init, r1)
5341 r0 = x0 + r0;
5342 r1 = x1 + r1;
5343 (i.e. we generate VF results in 2 registers).
5344 In this case we have a separate def-use cycle for each copy, and therefore
5345 for each copy we get the vector def for the reduction variable from the
5346 respective phi node created for this copy.
5348 Otherwise (the reduction is unused in the loop nest), we can combine
5349 together intermediate results, like so (e.g. for ncopies=2):
5350 r = phi (init, r)
5351 r = x0 + r;
5352 r = x1 + r;
5353 (i.e. we generate VF/2 results in a single register).
5354 In this case for each copy we get the vector def for the reduction variable
5355 from the vectorized reduction operation generated in the previous iteration.
5358 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5360 single_defuse_cycle = true;
5361 epilog_copies = 1;
5363 else
5364 epilog_copies = ncopies;
5366 prev_stmt_info = NULL;
5367 prev_phi_info = NULL;
5368 if (slp_node)
5369 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5370 else
5372 vec_num = 1;
5373 vec_oprnds0.create (1);
5374 if (op_type == ternary_op)
5375 vec_oprnds1.create (1);
5378 phis.create (vec_num);
5379 vect_defs.create (vec_num);
5380 if (!slp_node)
5381 vect_defs.quick_push (NULL_TREE);
5383 for (j = 0; j < ncopies; j++)
5385 if (j == 0 || !single_defuse_cycle)
5387 for (i = 0; i < vec_num; i++)
5389 /* Create the reduction-phi that defines the reduction
5390 operand. */
5391 new_phi = create_phi_node (vec_dest, loop->header);
5392 set_vinfo_for_stmt (new_phi,
5393 new_stmt_vec_info (new_phi, loop_vinfo,
5394 NULL));
5395 if (j == 0 || slp_node)
5396 phis.quick_push (new_phi);
5400 if (code == COND_EXPR)
5402 gcc_assert (!slp_node);
5403 vectorizable_condition (stmt, gsi, vec_stmt,
5404 PHI_RESULT (phis[0]),
5405 reduc_index, NULL);
5406 /* Multiple types are not supported for condition. */
5407 break;
5410 /* Handle uses. */
5411 if (j == 0)
5413 op0 = ops[!reduc_index];
5414 if (op_type == ternary_op)
5416 if (reduc_index == 0)
5417 op1 = ops[2];
5418 else
5419 op1 = ops[1];
5422 if (slp_node)
5423 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5424 slp_node, -1);
5425 else
5427 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5428 stmt, NULL);
5429 vec_oprnds0.quick_push (loop_vec_def0);
5430 if (op_type == ternary_op)
5432 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5433 NULL);
5434 vec_oprnds1.quick_push (loop_vec_def1);
5438 else
5440 if (!slp_node)
5442 enum vect_def_type dt;
5443 gimple dummy_stmt;
5444 tree dummy;
5446 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5447 &dummy_stmt, &dummy, &dt);
5448 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5449 loop_vec_def0);
5450 vec_oprnds0[0] = loop_vec_def0;
5451 if (op_type == ternary_op)
5453 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5454 &dummy, &dt);
5455 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5456 loop_vec_def1);
5457 vec_oprnds1[0] = loop_vec_def1;
5461 if (single_defuse_cycle)
5462 reduc_def = gimple_assign_lhs (new_stmt);
5464 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5467 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5469 if (slp_node)
5470 reduc_def = PHI_RESULT (phis[i]);
5471 else
5473 if (!single_defuse_cycle || j == 0)
5474 reduc_def = PHI_RESULT (new_phi);
5477 def1 = ((op_type == ternary_op)
5478 ? vec_oprnds1[i] : NULL);
5479 if (op_type == binary_op)
5481 if (reduc_index == 0)
5482 expr = build2 (code, vectype_out, reduc_def, def0);
5483 else
5484 expr = build2 (code, vectype_out, def0, reduc_def);
5486 else
5488 if (reduc_index == 0)
5489 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5490 else
5492 if (reduc_index == 1)
5493 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5494 else
5495 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5499 new_stmt = gimple_build_assign (vec_dest, expr);
5500 new_temp = make_ssa_name (vec_dest, new_stmt);
5501 gimple_assign_set_lhs (new_stmt, new_temp);
5502 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5504 if (slp_node)
5506 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5507 vect_defs.quick_push (new_temp);
5509 else
5510 vect_defs[0] = new_temp;
5513 if (slp_node)
5514 continue;
5516 if (j == 0)
5517 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5518 else
5519 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5521 prev_stmt_info = vinfo_for_stmt (new_stmt);
5522 prev_phi_info = vinfo_for_stmt (new_phi);
5525 /* Finalize the reduction-phi (set its arguments) and create the
5526 epilog reduction code. */
5527 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5529 new_temp = gimple_assign_lhs (*vec_stmt);
5530 vect_defs[0] = new_temp;
5533 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5534 epilog_reduc_code, phis, reduc_index,
5535 double_reduc, slp_node);
5537 return true;
5540 /* Function vect_min_worthwhile_factor.
5542 For a loop where we could vectorize the operation indicated by CODE,
5543 return the minimum vectorization factor that makes it worthwhile
5544 to use generic vectors. */
5546 vect_min_worthwhile_factor (enum tree_code code)
5548 switch (code)
5550 case PLUS_EXPR:
5551 case MINUS_EXPR:
5552 case NEGATE_EXPR:
5553 return 4;
5555 case BIT_AND_EXPR:
5556 case BIT_IOR_EXPR:
5557 case BIT_XOR_EXPR:
5558 case BIT_NOT_EXPR:
5559 return 2;
5561 default:
5562 return INT_MAX;
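/* A sketch of how a caller might apply the threshold above when only
   generic (emulated) vector operations are available (illustrative only):

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && vf < vect_min_worthwhile_factor (code))
       return false;

   so e.g. a PLUS_EXPR loop with vf == 2 would be rejected, since the
   function returns 4 for additive codes.  */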
5567 /* Function vectorizable_induction
5569 Check if PHI performs an induction computation that can be vectorized.
5570 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5571 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5572 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
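/* A source-level sketch of an induction this handles:

     for (i = 0; i < N; i++)
       a[i] = i;

   For VF == 4 the initial vector def is {0, 1, 2, 3} and each further copy
   is obtained by adding the step vector {4, 4, 4, 4} (see
   get_initial_def_for_induction).  */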
5574 bool
5575 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5576 gimple *vec_stmt)
5578 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5579 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5580 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5581 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5582 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5583 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5584 tree vec_def;
5586 gcc_assert (ncopies >= 1);
5587 /* FORNOW. These restrictions should be relaxed. */
5588 if (nested_in_vect_loop_p (loop, phi))
5590 imm_use_iterator imm_iter;
5591 use_operand_p use_p;
5592 gimple exit_phi;
5593 edge latch_e;
5594 tree loop_arg;
5596 if (ncopies > 1)
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5600 "multiple types in nested loop.\n");
5601 return false;
5604 exit_phi = NULL;
5605 latch_e = loop_latch_edge (loop->inner);
5606 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5607 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5609 gimple use_stmt = USE_STMT (use_p);
5610 if (is_gimple_debug (use_stmt))
5611 continue;
5613 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5615 exit_phi = use_stmt;
5616 break;
5619 if (exit_phi)
5621 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5622 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5623 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5625 if (dump_enabled_p ())
5626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5627 "inner-loop induction only used outside "
5628 "of the outer vectorized loop.\n");
5629 return false;
5634 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5635 return false;
5637 /* FORNOW: SLP not supported. */
5638 if (STMT_SLP_TYPE (stmt_info))
5639 return false;
5641 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5643 if (gimple_code (phi) != GIMPLE_PHI)
5644 return false;
5646 if (!vec_stmt) /* transformation not required. */
5648 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_NOTE, vect_location,
5651 "=== vectorizable_induction ===\n");
5652 vect_model_induction_cost (stmt_info, ncopies);
5653 return true;
5656 /** Transform. **/
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5661 vec_def = get_initial_def_for_induction (phi);
5662 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5663 return true;
5666 /* Function vectorizable_live_operation.
5668 STMT computes a value that is used outside the loop. Check if
5669 it can be supported. */
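/* A sketch of a live operation with only invariant operands, which is the
   case currently supported (see the FORNOW note below):

     for (i = 0; i < N; i++)
       {
         t = x + y;        x, y defined before the loop
         a[i] += t;
       }
     use (t);

   Here t escapes the loop, but the scalar statement t = x + y can simply be
   left in place and its last value reused.  */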
5671 bool
5672 vectorizable_live_operation (gimple stmt,
5673 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5674 gimple *vec_stmt)
5676 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5677 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5678 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5679 int i;
5680 int op_type;
5681 tree op;
5682 tree def;
5683 gimple def_stmt;
5684 enum vect_def_type dt;
5685 enum tree_code code;
5686 enum gimple_rhs_class rhs_class;
5688 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5690 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5691 return false;
5693 if (!is_gimple_assign (stmt))
5695 if (gimple_call_internal_p (stmt)
5696 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5697 && gimple_call_lhs (stmt)
5698 && loop->simduid
5699 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5700 && loop->simduid
5701 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5703 edge e = single_exit (loop);
5704 basic_block merge_bb = e->dest;
5705 imm_use_iterator imm_iter;
5706 use_operand_p use_p;
5707 tree lhs = gimple_call_lhs (stmt);
5709 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5711 gimple use_stmt = USE_STMT (use_p);
5712 if (gimple_code (use_stmt) == GIMPLE_PHI
5713 && gimple_bb (use_stmt) == merge_bb)
5715 if (vec_stmt)
5717 tree vfm1
5718 = build_int_cst (unsigned_type_node,
5719 loop_vinfo->vectorization_factor - 1);
5720 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5722 return true;
5727 return false;
5730 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5731 return false;
5733 /* FORNOW. CHECKME. */
5734 if (nested_in_vect_loop_p (loop, stmt))
5735 return false;
5737 code = gimple_assign_rhs_code (stmt);
5738 op_type = TREE_CODE_LENGTH (code);
5739 rhs_class = get_gimple_rhs_class (code);
5740 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5741 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5743 /* FORNOW: support only if all uses are invariant. This means
5744 that the scalar operations can remain in place, unvectorized.
5745 The original last scalar value that they compute will be used. */
5747 for (i = 0; i < op_type; i++)
5749 if (rhs_class == GIMPLE_SINGLE_RHS)
5750 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5751 else
5752 op = gimple_op (stmt, i + 1);
5753 if (op
5754 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5755 &dt))
5757 if (dump_enabled_p ())
5758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5759 "use not simple.\n");
5760 return false;
5763 if (dt != vect_external_def && dt != vect_constant_def)
5764 return false;
5767 /* No transformation is required for the cases we currently support. */
5768 return true;
5771 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
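/* For example (a sketch), if a block after the loop still contains

     # DEBUG D#1 => sum_5

   where sum_5 is defined only by a loop statement that is about to be
   replaced by vector code, the bind is reset below so the debugger reports
   the value as optimized away rather than showing a stale one.  */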
5773 static void
5774 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5776 ssa_op_iter op_iter;
5777 imm_use_iterator imm_iter;
5778 def_operand_p def_p;
5779 gimple ustmt;
5781 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5783 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5785 basic_block bb;
5787 if (!is_gimple_debug (ustmt))
5788 continue;
5790 bb = gimple_bb (ustmt);
5792 if (!flow_bb_inside_loop_p (loop, bb))
5794 if (gimple_debug_bind_p (ustmt))
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_NOTE, vect_location,
5798 "killing debug use\n");
5800 gimple_debug_bind_reset_value (ustmt);
5801 update_stmt (ustmt);
5803 else
5804 gcc_unreachable ();
5811 /* This function builds ni_name = number of iterations. Statements
5812 are emitted on the loop preheader edge. */
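/* E.g. (sketch) if LOOP_VINFO_NITERS is the expression n_7 + 1, a statement
   such as

     niters_12 = n_7 + 1;

   is inserted on the preheader edge and niters_12 is returned; a constant
   number of iterations is simply returned unchanged.  */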
5814 static tree
5815 vect_build_loop_niters (loop_vec_info loop_vinfo)
5817 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5818 if (TREE_CODE (ni) == INTEGER_CST)
5819 return ni;
5820 else
5822 tree ni_name, var;
5823 gimple_seq stmts = NULL;
5824 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5826 var = create_tmp_var (TREE_TYPE (ni), "niters");
5827 ni_name = force_gimple_operand (ni, &stmts, false, var);
5828 if (stmts)
5829 gsi_insert_seq_on_edge_immediate (pe, stmts);
5831 return ni_name;
5836 /* This function generates the following statements:
5838 ni_name = number of iterations loop executes
5839 ratio = ni_name / vf
5840 ratio_mult_vf_name = ratio * vf
5842 and places them on the loop preheader edge. */
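/* A worked example (sketch): for ni_name == 100 and vf == 8 this emits

     ratio          = 12      i.e. ((100 - 8) >> 3) + 1
     ratio_mult_vf  = 96      i.e. 12 << 3

   so the vectorized loop will iterate 12 times and the remaining 4
   iterations are left for the epilog loop.  */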
5844 static void
5845 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5846 tree ni_name,
5847 tree *ratio_mult_vf_name_ptr,
5848 tree *ratio_name_ptr)
5850 tree ni_minus_gap_name;
5851 tree var;
5852 tree ratio_name;
5853 tree ratio_mult_vf_name;
5854 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5855 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5856 tree log_vf;
5858 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5860 /* If an epilogue loop is required because of data accesses with gaps, we
5861 subtract one iteration from the total number of iterations here so
5862 that RATIO is computed correctly. */
5863 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5865 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5866 ni_name,
5867 build_one_cst (TREE_TYPE (ni_name)));
5868 if (!is_gimple_val (ni_minus_gap_name))
5870 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5871 gimple stmts = NULL;
5872 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5873 true, var);
5874 gsi_insert_seq_on_edge_immediate (pe, stmts);
5877 else
5878 ni_minus_gap_name = ni_name;
5880 /* Create: ratio = ni >> log2(vf) */
5881 /* ??? As we have ni == number of latch executions + 1, ni could
5882 have overflowed to zero. So avoid computing the ratio based on ni;
5883 instead compute it using the fact that we know the ratio will be at
5884 least one, thus via ((ni - vf) >> log2(vf)) + 1. */
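      /* E.g. (sketch) for a 32-bit unsigned ni that wrapped to 0 (i.e. 2^32
         iterations) and vf == 8:

           ni >> 3              == 0             wrong
           ((ni - 8) >> 3) + 1  == 0x20000000    == 2^32 / 8

         which is the ratio we want.  */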
5885 ratio_name
5886 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5887 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5888 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5889 ni_minus_gap_name,
5890 build_int_cst
5891 (TREE_TYPE (ni_name), vf)),
5892 log_vf),
5893 build_int_cst (TREE_TYPE (ni_name), 1));
5894 if (!is_gimple_val (ratio_name))
5896 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5897 gimple stmts = NULL;
5898 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5899 gsi_insert_seq_on_edge_immediate (pe, stmts);
5901 *ratio_name_ptr = ratio_name;
5903 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5905 if (ratio_mult_vf_name_ptr)
5907 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5908 ratio_name, log_vf);
5909 if (!is_gimple_val (ratio_mult_vf_name))
5911 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5912 gimple stmts = NULL;
5913 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5914 true, var);
5915 gsi_insert_seq_on_edge_immediate (pe, stmts);
5917 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5920 return;
5924 /* Function vect_transform_loop.
5926 The analysis phase has determined that the loop is vectorizable.
5927 Vectorize the loop: create vectorized stmts to replace the scalar
5928 stmts in the loop, and update the loop exit condition. */
5930 void
5931 vect_transform_loop (loop_vec_info loop_vinfo)
5933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5934 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5935 int nbbs = loop->num_nodes;
5936 int i;
5937 tree ratio = NULL;
5938 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5939 bool grouped_store;
5940 bool slp_scheduled = false;
5941 gimple stmt, pattern_stmt;
5942 gimple_seq pattern_def_seq = NULL;
5943 gimple_stmt_iterator pattern_def_si = gsi_none ();
5944 bool transform_pattern_stmt = false;
5945 bool check_profitability = false;
5946 int th;
5947 /* Record number of iterations before we started tampering with the profile. */
5948 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5950 if (dump_enabled_p ())
5951 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5953 /* If the profile is imprecise, we have a chance to fix it up. */
5954 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5955 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5957 /* Use the more conservative vectorization threshold. If the number
5958 of iterations is constant, assume the cost check has been performed
5959 by our caller. If the threshold makes all loops that run at least
5960 the vectorization factor number of times profitable, then checking
5961 is pointless, too. */
5962 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5963 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5964 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5966 if (dump_enabled_p ())
5967 dump_printf_loc (MSG_NOTE, vect_location,
5968 "Profitability threshold is %d loop iterations.\n",
5969 th);
5970 check_profitability = true;
5973 /* Version the loop first, if required, so the profitability check
5974 comes first. */
5976 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5977 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5979 vect_loop_versioning (loop_vinfo, th, check_profitability);
5980 check_profitability = false;
5983 tree ni_name = vect_build_loop_niters (loop_vinfo);
5984 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5986 /* Peel the loop if there are data refs with unknown alignment.
5987 Only one data ref with unknown store is allowed. */
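/* As a sketch, if the chosen data ref needs K scalar iterations before its
   address becomes aligned, the peeling creates

     for (i = 0; i < K; i++)      scalar prolog
       ...body...
     for ( ; i < n'; i += VF)     vectorized loop, access now aligned
       ...body...

   where K is a compile-time constant or is computed at run time from the
   base address.  */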
5989 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5991 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5992 th, check_profitability);
5993 check_profitability = false;
5994 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5995 be re-computed. */
5996 ni_name = NULL_TREE;
5999 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6000 compile-time constant), or it is a constant that is not a multiple of the
6001 vectorization factor, then an epilog loop needs to be created.
6002 We therefore duplicate the loop: the original loop will be vectorized,
6003 and will compute the first (n/VF) iterations. The second copy of the loop
6004 will remain scalar and will compute the remaining (n%VF) iterations.
6005 (VF is the vectorization factor). */
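/* Sketched at the source level for VF == 4:

     for (i = 0; i < (n / 4) * 4; i += 4)
       ...vectorized body...
     for ( ; i < n; i++)
       ...original scalar body...

   The first loop executes ratio = n/4 times; the second handles the
   remaining n%4 iterations.  */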
6007 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6008 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6010 tree ratio_mult_vf;
6011 if (!ni_name)
6012 ni_name = vect_build_loop_niters (loop_vinfo);
6013 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6014 &ratio);
6015 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6016 th, check_profitability);
6018 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6019 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6020 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6021 else
6023 if (!ni_name)
6024 ni_name = vect_build_loop_niters (loop_vinfo);
6025 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6028 /* 1) Make sure the loop header has exactly two entries
6029 2) Make sure we have a preheader basic block. */
6031 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6033 split_edge (loop_preheader_edge (loop));
6035 /* FORNOW: the vectorizer supports only loops whose body consists
6036 of one basic block (header + empty latch). When the vectorizer
6037 supports more involved loop forms, the order in which the BBs are
6038 traversed will need to be reconsidered. */
6040 for (i = 0; i < nbbs; i++)
6042 basic_block bb = bbs[i];
6043 stmt_vec_info stmt_info;
6045 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6046 gsi_next (&si))
6048 gphi *phi = si.phi ();
6049 if (dump_enabled_p ())
6051 dump_printf_loc (MSG_NOTE, vect_location,
6052 "------>vectorizing phi: ");
6053 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6054 dump_printf (MSG_NOTE, "\n");
6056 stmt_info = vinfo_for_stmt (phi);
6057 if (!stmt_info)
6058 continue;
6060 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6061 vect_loop_kill_debug_uses (loop, phi);
6063 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6064 && !STMT_VINFO_LIVE_P (stmt_info))
6065 continue;
6067 if (STMT_VINFO_VECTYPE (stmt_info)
6068 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6069 != (unsigned HOST_WIDE_INT) vectorization_factor)
6070 && dump_enabled_p ())
6071 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6073 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6075 if (dump_enabled_p ())
6076 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6077 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6081 pattern_stmt = NULL;
6082 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6083 !gsi_end_p (si) || transform_pattern_stmt;)
6085 bool is_store;
6087 if (transform_pattern_stmt)
6088 stmt = pattern_stmt;
6089 else
6091 stmt = gsi_stmt (si);
6092 /* During vectorization remove existing clobber stmts. */
6093 if (gimple_clobber_p (stmt))
6095 unlink_stmt_vdef (stmt);
6096 gsi_remove (&si, true);
6097 release_defs (stmt);
6098 continue;
6102 if (dump_enabled_p ())
6104 dump_printf_loc (MSG_NOTE, vect_location,
6105 "------>vectorizing statement: ");
6106 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6107 dump_printf (MSG_NOTE, "\n");
6110 stmt_info = vinfo_for_stmt (stmt);
6112 /* vector stmts created in the outer-loop during vectorization of
6113 stmts in an inner-loop may not have a stmt_info, and do not
6114 need to be vectorized. */
6115 if (!stmt_info)
6117 gsi_next (&si);
6118 continue;
6121 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6122 vect_loop_kill_debug_uses (loop, stmt);
6124 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6125 && !STMT_VINFO_LIVE_P (stmt_info))
6127 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6128 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6129 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6130 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6132 stmt = pattern_stmt;
6133 stmt_info = vinfo_for_stmt (stmt);
6135 else
6137 gsi_next (&si);
6138 continue;
6141 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6142 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6143 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6144 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6145 transform_pattern_stmt = true;
6147 /* If pattern statement has def stmts, vectorize them too. */
6148 if (is_pattern_stmt_p (stmt_info))
6150 if (pattern_def_seq == NULL)
6152 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6153 pattern_def_si = gsi_start (pattern_def_seq);
6155 else if (!gsi_end_p (pattern_def_si))
6156 gsi_next (&pattern_def_si);
6157 if (pattern_def_seq != NULL)
6159 gimple pattern_def_stmt = NULL;
6160 stmt_vec_info pattern_def_stmt_info = NULL;
6162 while (!gsi_end_p (pattern_def_si))
6164 pattern_def_stmt = gsi_stmt (pattern_def_si);
6165 pattern_def_stmt_info
6166 = vinfo_for_stmt (pattern_def_stmt);
6167 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6168 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6169 break;
6170 gsi_next (&pattern_def_si);
6173 if (!gsi_end_p (pattern_def_si))
6175 if (dump_enabled_p ())
6177 dump_printf_loc (MSG_NOTE, vect_location,
6178 "==> vectorizing pattern def "
6179 "stmt: ");
6180 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6181 pattern_def_stmt, 0);
6182 dump_printf (MSG_NOTE, "\n");
6185 stmt = pattern_def_stmt;
6186 stmt_info = pattern_def_stmt_info;
6188 else
6190 pattern_def_si = gsi_none ();
6191 transform_pattern_stmt = false;
6194 else
6195 transform_pattern_stmt = false;
6198 if (STMT_VINFO_VECTYPE (stmt_info))
6200 unsigned int nunits
6201 = (unsigned int)
6202 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6203 if (!STMT_SLP_TYPE (stmt_info)
6204 && nunits != (unsigned int) vectorization_factor
6205 && dump_enabled_p ())
6206 /* For SLP, VF is set according to the unrolling factor, not
6207 to the vector size, hence for SLP this diagnostic is not valid. */
6208 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6211 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6212 reached. */
6213 if (STMT_SLP_TYPE (stmt_info))
6215 if (!slp_scheduled)
6217 slp_scheduled = true;
6219 if (dump_enabled_p ())
6220 dump_printf_loc (MSG_NOTE, vect_location,
6221 "=== scheduling SLP instances ===\n");
6223 vect_schedule_slp (loop_vinfo, NULL);
6226 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6227 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6229 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6231 pattern_def_seq = NULL;
6232 gsi_next (&si);
6234 continue;
6238 /* -------- vectorize statement ------------ */
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6242 grouped_store = false;
6243 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6244 if (is_store)
6246 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6248 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6249 interleaving chain was completed - free all the stores in
6250 the chain. */
6251 gsi_next (&si);
6252 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6254 else
6256 /* Free the attached stmt_vec_info and remove the stmt. */
6257 gimple store = gsi_stmt (si);
6258 free_stmt_vec_info (store);
6259 unlink_stmt_vdef (store);
6260 gsi_remove (&si, true);
6261 release_defs (store);
6264 /* Stores can only appear at the end of pattern statements. */
6265 gcc_assert (!transform_pattern_stmt);
6266 pattern_def_seq = NULL;
6268 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6270 pattern_def_seq = NULL;
6271 gsi_next (&si);
6273 } /* stmts in BB */
6274 } /* BBs in loop */
6276 slpeel_make_loop_iterate_ntimes (loop, ratio);
6278 /* Reduce loop iterations by the vectorization factor. */
6279 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6280 expected_iterations / vectorization_factor);
6281 loop->nb_iterations_upper_bound
6282 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6283 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6284 && loop->nb_iterations_upper_bound != 0)
6285 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6286 if (loop->any_estimate)
6288 loop->nb_iterations_estimate
6289 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6290 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6291 && loop->nb_iterations_estimate != 0)
6292 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6295 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_NOTE, vect_location,
6298 "LOOP VECTORIZED\n");
6299 if (loop->inner)
6300 dump_printf_loc (MSG_NOTE, vect_location,
6301 "OUTER LOOP VECTORIZED\n");
6302 dump_printf (MSG_NOTE, "\n");