gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "hash-set.h"
  28 #include "machmode.h"
  29 #include "vec.h"
  30 #include "double-int.h"
  31 #include "input.h"
  32 #include "alias.h"
  33 #include "symtab.h"
  34 #include "wide-int.h"
  35 #include "inchash.h"
  36 #include "tree.h"
  37 #include "fold-const.h"
  38 #include "stor-layout.h"
  39 #include "predict.h"
  40 #include "hard-reg-set.h"
  41 #include "function.h"
  42 #include "dominance.h"
  43 #include "cfg.h"
  44 #include "cfganal.h"
  45 #include "basic-block.h"
  46 #include "gimple-pretty-print.h"
  47 #include "tree-ssa-alias.h"
  48 #include "internal-fn.h"
  49 #include "gimple-expr.h"
  50 #include "is-a.h"
  51 #include "gimple.h"
  52 #include "gimplify.h"
  53 #include "gimple-iterator.h"
  54 #include "gimplify-me.h"
  55 #include "gimple-ssa.h"
  56 #include "tree-phinodes.h"
  57 #include "ssa-iterators.h"
  58 #include "stringpool.h"
  59 #include "tree-ssanames.h"
  60 #include "tree-ssa-loop-ivopts.h"
  61 #include "tree-ssa-loop-manip.h"
  62 #include "tree-ssa-loop-niter.h"
  63 #include "tree-pass.h"
  64 #include "cfgloop.h"
  65 #include "hashtab.h"
  66 #include "rtl.h"
  67 #include "flags.h"
  68 #include "statistics.h"
  69 #include "real.h"
  70 #include "fixed-value.h"
  71 #include "insn-config.h"
  72 #include "expmed.h"
  73 #include "dojump.h"
  74 #include "explow.h"
  75 #include "calls.h"
  76 #include "emit-rtl.h"
  77 #include "varasm.h"
  78 #include "stmt.h"
  79 #include "expr.h"
  80 #include "recog.h"
  81 #include "insn-codes.h"
  82 #include "optabs.h"
  83 #include "params.h"
  84 #include "diagnostic-core.h"
  85 #include "tree-chrec.h"
  86 #include "tree-scalar-evolution.h"
  87 #include "tree-vectorizer.h"
  88 #include "target.h"
  89
  90 /* Loop Vectorization Pass.
  91
  92    This pass tries to vectorize loops.
  93
  94    For example, the vectorizer transforms the following simple loop:
  95
  96         short a[N]; short b[N]; short c[N]; int i;
  97
  98         for (i=0; i<N; i++){
  99           a[i] = b[i] + c[i];
 100         }
 101
 102    as if it was manually vectorized by rewriting the source code into:
 103
 104         typedef int __attribute__((mode(V8HI))) v8hi;
 105         short a[N];  short b[N]; short c[N];   int i;
 106         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
 107         v8hi va, vb, vc;
 108
 109         for (i=0; i<N/8; i++){
 110           vb = pb[i];
 111           vc = pc[i];
 112           va = vb + vc;
 113           pa[i] = va;
 114         }
 115
 116         The main entry to this pass is vectorize_loops(), in which
 117    the vectorizer applies a set of analyses on a given set of loops,
 118    followed by the actual vectorization transformation for the loops that
 119    had successfully passed the analysis phase.
 120         Throughout this pass we make a distinction between two types of
 121    data: scalars (which are represented by SSA_NAMES), and memory references
 122    ("data-refs").  These two types of data require different handling both
 123    during analysis and transformation. The types of data-refs that the
 124    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
 125    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 126    accesses are required to have a simple (consecutive) access pattern.
 127
 128    Analysis phase:
 129    ===============
 130         The driver for the analysis phase is vect_analyze_loop().
 131    It applies a set of analyses, some of which rely on the scalar evolution
 132    analyzer (scev) developed by Sebastian Pop.
 133
 134         During the analysis phase the vectorizer records some information
 135    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 136    loop, as well as general information about the loop as a whole, which is
 137    recorded in a "loop_vec_info" struct attached to each loop.
 138
 139    Transformation phase:
 140    =====================
 141         The loop transformation phase scans all the stmts in the loop, and
 142    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 143    the loop that needs to be vectorized.  It inserts the vector code sequence
 144    just before the scalar stmt S, and records a pointer to the vector code
 145    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 146    attached to S).  This pointer will be used for the vectorization of following
 147    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 148    otherwise, we rely on dead code elimination for removing it.
 149
 150         For example, say stmt S1 was vectorized into stmt VS1:
 151
 152    VS1: vb = px[i];
 153    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 154    S2:  a = b;
 155
 156    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 157    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 158    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 159    resulting sequence would be:
 160
 161    VS1: vb = px[i];
 162    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 163    VS2: va = vb;
 164    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 165
 166         Operands that are not SSA_NAMEs, are data-refs that appear in
 167    load/store operations (like 'x[i]' in S1), and are handled differently.
 168
 169    Target modeling:
 170    =================
 171         Currently the only target specific information that is used is the
 172    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 173    Targets that can support different sizes of vectors, for now will need
 174    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 175    flexibility will be added in the future.
 176
 177         Since we only vectorize operations whose vector form can be
 178    expressed using existing tree codes, to verify that an operation is
 179    supported, the vectorizer checks the relevant optab at the relevant
 180    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 181    the value found is CODE_FOR_nothing, then there's no target support, and
 182    we can't vectorize the stmt.
 183
 184    For additional information on this project see:
 185    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 186 */
 187 /* Forward declaration (static, so defined elsewhere in this file).  */
 188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 189
 190 /* Function vect_determine_vectorization_factor
 191
 192    Determine the vectorization factor (VF).  VF is the number of data elements
 193    that are operated upon in parallel in a single iteration of the vectorized
 194    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
 195    on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
 196    elements can fit in a single vector register.
 197
 198    VF is set to the largest nunits of the vector types chosen for the PHIs and
 199    stmts that need vectorizing and, on success, stored in LOOP_VINFO_VECT_FACTOR.
 200    Returns false e.g. when a scalar type has no vector type, when a stmt's
 201    vectype and that of its smallest scalar type differ in mode size, or VF <= 1.
 202
 203    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 204    original loop:
 205         for (i=0; i<N; i++){
 206           a[i] = b[i] + c[i];
 207         }
 208
 209    vectorized loop:
 210         for (i=0; i<N; i+=VF){
 211           a[i:VF] = b[i:VF] + c[i:VF];
 212         }
 213 */
 214
 215 static bool
 216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 217 {
 218   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 219   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 220   int nbbs = loop->num_nodes;
 221   unsigned int vectorization_factor = 0;
 222   tree scalar_type;
 223   gphi *phi;
 224   tree vectype;
 225   unsigned int nunits;
 226   stmt_vec_info stmt_info;
 227   int i;
 228   HOST_WIDE_INT dummy;  /* Ignored outputs of vect_get_smallest_scalar_type.  */
 229   gimple stmt, pattern_stmt = NULL;
 230   gimple_seq pattern_def_seq = NULL;
 231   gimple_stmt_iterator pattern_def_si = gsi_none ();
 232   bool analyze_pattern_stmt = false;  /* Next iteration handles pattern_stmt.  */
 233
 234   if (dump_enabled_p ())
 235     dump_printf_loc (MSG_NOTE, vect_location,
 236                      "=== vect_determine_vectorization_factor ===\n");
 237
 238   for (i = 0; i < nbbs; i++)
 239     {
 240       basic_block bb = bbs[i];
 241       /* First the PHIs: a relevant PHI gets its vectype from its result.  */
 242       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 243            gsi_next (&si))
 244         {
 245           phi = si.phi ();
 246           stmt_info = vinfo_for_stmt (phi);
 247           if (dump_enabled_p ())
 248             {
 249               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 250               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 251               dump_printf (MSG_NOTE, "\n");
 252             }
 253
 254           gcc_assert (stmt_info);
 255
 256           if (STMT_VINFO_RELEVANT_P (stmt_info))
 257             {
 258               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 259               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 260
 261               if (dump_enabled_p ())
 262                 {
 263                   dump_printf_loc (MSG_NOTE, vect_location,
 264                                    "get vectype for scalar type:  ");
 265                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 266                   dump_printf (MSG_NOTE, "\n");
 267                 }
 268
 269               vectype = get_vectype_for_scalar_type (scalar_type);
 270               if (!vectype)
 271                 {
 272                   if (dump_enabled_p ())
 273                     {
 274                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 275                                        "not vectorized: unsupported "
 276                                        "data-type ");
 277                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 278                                          scalar_type);
 279                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 280                     }
 281                   return false;
 282                 }
 283               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 284
 285               if (dump_enabled_p ())
 286                 {
 287                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 288                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 289                   dump_printf (MSG_NOTE, "\n");
 290                 }
 291
 292               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 293               if (dump_enabled_p ())
 294                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 295                                  nunits);
 296
 297               if (!vectorization_factor
 298                   || (nunits > vectorization_factor))
 299                 vectorization_factor = nunits;
 300             }
 301         }
 302       /* Then the stmts; SI is stepped by hand to also visit pattern stmts.  */
 303       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 304            !gsi_end_p (si) || analyze_pattern_stmt;)
 305         {
 306           tree vf_vectype;
 307
 308           if (analyze_pattern_stmt)
 309             stmt = pattern_stmt;
 310           else
 311             stmt = gsi_stmt (si);
 312
 313           stmt_info = vinfo_for_stmt (stmt);
 314
 315           if (dump_enabled_p ())
 316             {
 317               dump_printf_loc (MSG_NOTE, vect_location,
 318                                "==> examining statement: ");
 319               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 320               dump_printf (MSG_NOTE, "\n");
 321             }
 322
 323           gcc_assert (stmt_info);
 324
 325           /* Skip stmts which do not need to be vectorized.  */
 326           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 327                && !STMT_VINFO_LIVE_P (stmt_info))
 328               || gimple_clobber_p (stmt))
 329             {
 330               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 331                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 332                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 333                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 334                 {
 335                   stmt = pattern_stmt;
 336                   stmt_info = vinfo_for_stmt (pattern_stmt);
 337                   if (dump_enabled_p ())
 338                     {
 339                       dump_printf_loc (MSG_NOTE, vect_location,
 340                                        "==> examining pattern statement: ");
 341                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 342                       dump_printf (MSG_NOTE, "\n");
 343                     }
 344                 }
 345               else
 346                 {
 347                   if (dump_enabled_p ())
 348                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 349                   gsi_next (&si);
 350                   continue;
 351                 }
 352             }
 353           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 354                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 355                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 356                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 357             analyze_pattern_stmt = true;
 358
 359           /* If a pattern statement has def stmts, analyze them too.  */
 360           if (is_pattern_stmt_p (stmt_info))
 361             {
 362               if (pattern_def_seq == NULL)
 363                 {
 364                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 365                   pattern_def_si = gsi_start (pattern_def_seq);
 366                 }
 367               else if (!gsi_end_p (pattern_def_si))
 368                 gsi_next (&pattern_def_si);
 369               if (pattern_def_seq != NULL)
 370                 {
 371                   gimple pattern_def_stmt = NULL;
 372                   stmt_vec_info pattern_def_stmt_info = NULL;
 373
 374                   while (!gsi_end_p (pattern_def_si))
 375                     {
 376                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 377                       pattern_def_stmt_info
 378                         = vinfo_for_stmt (pattern_def_stmt);
 379                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 380                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 381                         break;
 382                       gsi_next (&pattern_def_si);
 383                     }
 384
 385                   if (!gsi_end_p (pattern_def_si))
 386                     {
 387                       if (dump_enabled_p ())
 388                         {
 389                           dump_printf_loc (MSG_NOTE, vect_location,
 390                                            "==> examining pattern def stmt: ");
 391                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 392                                             pattern_def_stmt, 0);
 393                           dump_printf (MSG_NOTE, "\n");
 394                         }
 395
 396                       stmt = pattern_def_stmt;
 397                       stmt_info = pattern_def_stmt_info;
 398                     }
 399                   else
 400                     {
 401                       pattern_def_si = gsi_none ();
 402                       analyze_pattern_stmt = false;
 403                     }
 404                 }
 405               else
 406                 analyze_pattern_stmt = false;
 407             }
 408
 409           if (gimple_get_lhs (stmt) == NULL_TREE
 410               /* MASK_STORE has no lhs, but is ok.  */
 411               && (!is_gimple_call (stmt)
 412                   || !gimple_call_internal_p (stmt)
 413                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 414             {
 415               if (is_gimple_call (stmt))
 416                 {
 417                   /* Ignore calls with no lhs.  These must be calls to
 418                      #pragma omp simd functions, and what vectorization factor
 419                      it really needs can't be determined until
 420                      vectorizable_simd_clone_call.  */
 421                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 422                     {
 423                       pattern_def_seq = NULL;
 424                       gsi_next (&si);
 425                     }
 426                   continue;
 427                 }
 428               if (dump_enabled_p ())
 429                 {
 430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 431                                    "not vectorized: irregular stmt.");
 432                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 433                                     0);
 434                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 435                 }
 436               return false;
 437             }
 438
 439           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 440             {
 441               if (dump_enabled_p ())
 442                 {
 443                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 444                                    "not vectorized: vector stmt in loop:");
 445                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 446                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 447                 }
 448               return false;
 449             }
 450
 451           if (STMT_VINFO_VECTYPE (stmt_info))
 452             {
 453               /* The only case when a vectype had been already set is for stmts
 454                  that contain a dataref, or for "pattern-stmts" (stmts
 455                  generated by the vectorizer to represent/replace a certain
 456                  idiom).  */
 457               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 458                           || is_pattern_stmt_p (stmt_info)
 459                           || !gsi_end_p (pattern_def_si));
 460               vectype = STMT_VINFO_VECTYPE (stmt_info);
 461             }
 462           else
 463             {
 464               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 465               if (is_gimple_call (stmt)
 466                   && gimple_call_internal_p (stmt)
 467                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 468                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 469               else
 470                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 471               if (dump_enabled_p ())
 472                 {
 473                   dump_printf_loc (MSG_NOTE, vect_location,
 474                                    "get vectype for scalar type:  ");
 475                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 476                   dump_printf (MSG_NOTE, "\n");
 477                 }
 478               vectype = get_vectype_for_scalar_type (scalar_type);
 479               if (!vectype)
 480                 {
 481                   if (dump_enabled_p ())
 482                     {
 483                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 484                                        "not vectorized: unsupported "
 485                                        "data-type ");
 486                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 487                                          scalar_type);
 488                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 489                     }
 490                   return false;
 491                 }
 492
 493               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 494
 495               if (dump_enabled_p ())
 496                 {
 497                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 498                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 499                   dump_printf (MSG_NOTE, "\n");
 500                 }
 501             }
 502
 503           /* The vectorization factor is according to the smallest
 504              scalar type (or the largest vector size, but we only
 505              support one vector size per loop).  */
 506           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                        &dummy);
 508           if (dump_enabled_p ())
 509             {
 510               dump_printf_loc (MSG_NOTE, vect_location,
 511                                "get vectype for scalar type:  ");
 512               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513               dump_printf (MSG_NOTE, "\n");
 514             }
 515           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516           if (!vf_vectype)
 517             {
 518               if (dump_enabled_p ())
 519                 {
 520                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 521                                    "not vectorized: unsupported data-type ");
 522                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 523                                      scalar_type);
 524                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 525                 }
 526               return false;
 527             }
 528
 529           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 530                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 531             {
 532               if (dump_enabled_p ())
 533                 {
 534                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 535                                    "not vectorized: different sized vector "
 536                                    "types in statement, ");
 537                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 538                                      vectype);
 539                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 540                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 541                                      vf_vectype);
 542                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 543                 }
 544               return false;
 545             }
 546
 547           if (dump_enabled_p ())
 548             {
 549               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 550               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 551               dump_printf (MSG_NOTE, "\n");
 552             }
 553
 554           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 555           if (dump_enabled_p ())
 556             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 557           if (!vectorization_factor
 558               || (nunits > vectorization_factor))
 559             vectorization_factor = nunits;
 560
 561           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 562             {
 563               pattern_def_seq = NULL;
 564               gsi_next (&si);
 565             }
 566         }
 567     }
 568
 569   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 570   if (dump_enabled_p ())
 571     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 572                      vectorization_factor);
 573   if (vectorization_factor <= 1)
 574     {
 575       if (dump_enabled_p ())
 576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 577                          "not vectorized: unsupported data-type\n");
 578       return false;
 579     }
 580   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 581
 582   return true;
 583 }
 584
 585
 586 /* Function vect_is_simple_iv_evolution.
 587
 588    Check whether ACCESS_FN evolves in a "simple" way in loop number LOOP_NB:
 589    its evolution part in that loop must exist and must not itself be a chrec.
 590    If so, the unshared initial condition and the step are stored in *INIT and
 591    *STEP, even when the step is rejected afterwards.  The step is accepted if
 592    it is an INTEGER_CST, a REAL_CST while flag_associative_math is set, or an
 593    SSA_NAME whose def stmt is not in a block inside the loop and whose type is
 594    integral (or scalar float while flag_associative_math is set).  */
 595
 596 static bool
 597 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 598                              tree * step)
 599 {
 600   tree evolution = evolution_part_in_loop_num (access_fn, loop_nb);
 601   bool step_ok = false;
 602
 603   /* Neither a missing evolution nor one of degree >= 2 is "simple".  */
 604   if (evolution == NULL_TREE || tree_is_chrec (evolution))
 605     return false;
 606
 607   tree start
 608     = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 609
 610   if (dump_enabled_p ())
 611     {
 612       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 613       dump_generic_expr (MSG_NOTE, TDF_SLIM, evolution);
 614       dump_printf (MSG_NOTE, ",  init: ");
 615       dump_generic_expr (MSG_NOTE, TDF_SLIM, start);
 616       dump_printf (MSG_NOTE, "\n");
 617     }
 618
 619   *init = start;
 620   *step = evolution;
 621
 622   if (TREE_CODE (evolution) == INTEGER_CST)
 623     step_ok = true;
 624   else if (TREE_CODE (evolution) == REAL_CST)
 625     step_ok = flag_associative_math != 0;
 626   else if (TREE_CODE (evolution) == SSA_NAME)
 627     {
 628       tree step_type = TREE_TYPE (evolution);
 629       basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (evolution));
 630
 631       step_ok = (!(def_bb
 632                    && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), def_bb))
 633                  && (INTEGRAL_TYPE_P (step_type)
 634                      || (SCALAR_FLOAT_TYPE_P (step_type)
 635                          && flag_associative_math)));
 636     }
 637
 638   if (!step_ok && dump_enabled_p ())
 639     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 640                      "step unknown.\n");
 641   return step_ok;
 642 }
 643
 644 /* Function vect_analyze_scalar_cycles_1.
 645
 646    Classify the cross-iteration def-use cycles of the scalar variables
 647    defined by the header PHIs of LOOP.  LOOP_VINFO describes the loop that
 648    is being considered for vectorization: LOOP itself, or an outer loop
 649    enclosing LOOP.
 650
 651    A first walk marks each non-virtual PHI that is a simple induction as
 652    vect_induction_def and queues the other non-virtual PHIs.  A second walk
 653    asks vect_force_simple_reduction about each queued PHI and tags the PHI
 654    and the returned stmt as double reduction, nested cycle or reduction
 655    (the latter are also pushed onto LOOP_VINFO_REDUCTIONS).  */
 656
 657 static void
 658 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 659 {
 660   basic_block header = loop->header;
 661   auto_vec<gimple, 64> pending;
 662   bool double_reduc;
 663   tree init, step;
 664
 665   if (dump_enabled_p ())
 666     dump_printf_loc (MSG_NOTE, vect_location,
 667                      "=== vect_analyze_scalar_cycles ===\n");
 668
 669   /* Inductions come first: reduction detection assumes that all of them
 670      have been identified, so the two walks must stay in this order.  */
 671   for (gphi_iterator it = gsi_start_phis (header); !gsi_end_p (it);
 672        gsi_next (&it))
 673     {
 674       gphi *phi = it.phi ();
 675       tree result = PHI_RESULT (phi);
 676       stmt_vec_info phi_info = vinfo_for_stmt (phi);
 677       bool induction_p = false;
 678
 679       if (dump_enabled_p ())
 680         {
 681           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 682           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 683           dump_printf (MSG_NOTE, "\n");
 684         }
 685
 686       /* Virtual PHIs (memory) are analyzed elsewhere.  */
 687       if (virtual_operand_p (result))
 688         continue;
 689
 690       STMT_VINFO_DEF_TYPE (phi_info) = vect_unknown_def_type;
 691
 692       tree access_fn = analyze_scalar_evolution (loop, result);
 693       if (access_fn)
 694         {
 695           STRIP_NOPS (access_fn);
 696           if (dump_enabled_p ())
 697             {
 698               dump_printf_loc (MSG_NOTE, vect_location,
 699                                "Access function of PHI: ");
 700               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 701               dump_printf (MSG_NOTE, "\n");
 702             }
 703           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info)
 704             = evolution_part_in_loop_num (access_fn, loop->num);
 705
 706           /* In a nested LOOP only a constant integer step qualifies.  */
 707           induction_p
 708             = (vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 709                && (LOOP_VINFO_LOOP (loop_vinfo) == loop
 710                    || TREE_CODE (step) == INTEGER_CST));
 711         }
 712
 713       if (!induction_p)
 714         {
 715           pending.safe_push (phi);
 716           continue;
 717         }
 718
 719       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info) != NULL_TREE);
 720
 721       if (dump_enabled_p ())
 722         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 723       STMT_VINFO_DEF_TYPE (phi_info) = vect_induction_def;
 724     }
 725
 726   /* Now look for reductions and nested cycles among the queued PHIs.  */
 727   while (pending.length () != 0)
 728     {
 729       gimple phi = pending.pop ();
 730       tree result = PHI_RESULT (phi);
 731       stmt_vec_info phi_info = vinfo_for_stmt (phi);
 732
 733       if (dump_enabled_p ())
 734         {
 735           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 736           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 737           dump_printf (MSG_NOTE, "\n");
 738         }
 739
 740       gcc_assert (!virtual_operand_p (result)
 741                   && STMT_VINFO_DEF_TYPE (phi_info) == vect_unknown_def_type);
 742
 743       bool nested_cycle = (LOOP_VINFO_LOOP (loop_vinfo) != loop);
 744       gimple reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 745                                                        !nested_cycle,
 746                                                        &double_reduc);
 747       if (!reduc_stmt)
 748         {
 749           if (dump_enabled_p ())
 750             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 751                              "Unknown def-use cycle pattern.\n");
 752           continue;
 753         }
 754
 755       if (double_reduc)
 756         {
 757           if (dump_enabled_p ())
 758             dump_printf_loc (MSG_NOTE, vect_location,
 759                              "Detected double reduction.\n");
 760
 761           STMT_VINFO_DEF_TYPE (phi_info) = vect_double_reduction_def;
 762           STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
 763             = vect_double_reduction_def;
 764         }
 765       else if (nested_cycle)
 766         {
 767           if (dump_enabled_p ())
 768             dump_printf_loc (MSG_NOTE, vect_location,
 769                              "Detected vectorizable nested cycle.\n");
 770
 771           STMT_VINFO_DEF_TYPE (phi_info) = vect_nested_cycle;
 772           STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
 773             = vect_nested_cycle;
 774         }
 775       else
 776         {
 777           if (dump_enabled_p ())
 778             dump_printf_loc (MSG_NOTE, vect_location, "Detected reduction.\n");
 779
 780           STMT_VINFO_DEF_TYPE (phi_info) = vect_reduction_def;
 781           STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
 782             = vect_reduction_def;
 783           /* Keep the reduction around for possible use by loop-aware SLP.  */
 784           LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 785         }
 786     }
 787 }
 788
 789
 790 /* Function vect_analyze_scalar_cycles.
 791
 792    Examine the cross iteration def-use cycles of scalar variables, by
 793    analyzing the loop-header PHIs of scalar variables.  Classify each
 794    cycle as one of the following: invariant, induction, reduction, unknown.
 795    We do that for the loop represented by LOOP_VINFO, and also to its
 796    inner-loop, if exists.
 797    Examples for scalar cycles:
 798
 799    Example1: reduction:
 800
 801               loop1:
 802               for (i=0; i<N; i++)
 803                  sum += a[i];
 804
 805    Example2: induction:
 806
 807               loop2:
 808               for (i=0; i<N; i++)
 809                  a[i] = i;  */
 810
 811 static void
 812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 813 {
 814   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 815
 816   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 817
 818   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 819      Reductions in such inner-loop therefore have different properties than
 820      the reductions in the nest that gets vectorized:
 821      1. When vectorized, they are executed in the same order as in the original
 822         scalar loop, so we can't change the order of computation when
 823         vectorizing them.
 824      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 825         current checks are too strict.  */
 826
 827   if (loop->inner)
 828     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 829 }
 830
 831 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 832
 833 static void
 834 vect_fixup_reduc_chain (gimple stmt)
 835 {
 836   gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 837   gimple stmtp;
 838   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 839               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 840   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 841   do
 842     {
 843       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 844       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 845       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 846       if (stmt)
 847         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 848           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 849     }
 850   while (stmt);
 851   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 852 }
 853
 854 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 855
 856 static void
 857 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 858 {
 859   gimple first;
 860   unsigned i;
 861
 862   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 863     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 864       {
 865         vect_fixup_reduc_chain (first);
 866         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 867           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 868       }
 869 }
 870
 871 /* Function vect_get_loop_niters.
 872
 873    Determine how many iterations the loop is executed and place it
 874    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 875    in NUMBER_OF_ITERATIONSM1.
 876
 877    Return the loop exit condition.  */
 878
 879
 880 static gcond *
 881 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 882                       tree *number_of_iterationsm1)
 883 {
 884   tree niters;
 885
 886   if (dump_enabled_p ())
 887     dump_printf_loc (MSG_NOTE, vect_location,
 888                      "=== get_loop_niters ===\n");
 889
 890   niters = number_of_latch_executions (loop);
 891   *number_of_iterationsm1 = niters;
 892
 893   /* We want the number of loop header executions which is the number
 894      of latch executions plus one.
 895      ???  For UINT_MAX latch executions this number overflows to zero
 896      for loops like do { n++; } while (n != 0);  */
 897   if (niters && !chrec_contains_undetermined (niters))
 898     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 899                           build_int_cst (TREE_TYPE (niters), 1));
 900   *number_of_iterations = niters;
 901
 902   return get_loop_exit_condition (loop);
 903 }
 904
 905
 906 /* Function bb_in_loop_p
 907
 908    Used as predicate for dfs order traversal of the loop bbs.  */
 909
 910 static bool
 911 bb_in_loop_p (const_basic_block bb, const void *data)
 912 {
 913   const struct loop *const loop = (const struct loop *)data;
 914   if (flow_bb_inside_loop_p (loop, bb))
 915     return true;
 916   return false;
 917 }
 918
 919
 920 /* Function new_loop_vec_info.
 921
 922    Create and initialize a new loop_vec_info struct for LOOP, as well as
 923    stmt_vec_info structs for all the stmts in LOOP.  */
 924
 925 static loop_vec_info
 926 new_loop_vec_info (struct loop *loop)
 927 {
 928   loop_vec_info res;
 929   basic_block *bbs;
 930   gimple_stmt_iterator si;
 931   unsigned int i, nbbs;
 932
 933   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 934   LOOP_VINFO_LOOP (res) = loop;
 935
 936   bbs = get_loop_body (loop);
 937
 938   /* Create/Update stmt_info for all stmts in the loop.  */
 939   for (i = 0; i < loop->num_nodes; i++)
 940     {
 941       basic_block bb = bbs[i];
 942
 943       /* BBs in a nested inner-loop will have been already processed (because
 944          we will have called vect_analyze_loop_form for any nested inner-loop).
 945          Therefore, for stmts in an inner-loop we just want to update the
 946          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 947          loop_info of the outer-loop we are currently considering to vectorize
 948          (instead of the loop_info of the inner-loop).
 949          For stmts in other BBs we need to create a stmt_info from scratch.  */
 950       if (bb->loop_father != loop)
 951         {
 952           /* Inner-loop bb.  */
 953           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 954           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 955             {
 956               gimple phi = gsi_stmt (si);
 957               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 958               loop_vec_info inner_loop_vinfo =
 959                 STMT_VINFO_LOOP_VINFO (stmt_info);
 960               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 961               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 962             }
 963           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 964            {
 965               gimple stmt = gsi_stmt (si);
 966               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 967               loop_vec_info inner_loop_vinfo =
 968                  STMT_VINFO_LOOP_VINFO (stmt_info);
 969               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 970               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 971            }
 972         }
 973       else
 974         {
 975           /* bb in current nest.  */
 976           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 977             {
 978               gimple phi = gsi_stmt (si);
 979               gimple_set_uid (phi, 0);
 980               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 981             }
 982
 983           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 984             {
 985               gimple stmt = gsi_stmt (si);
 986               gimple_set_uid (stmt, 0);
 987               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 988             }
 989         }
 990     }
 991
 992   /* CHECKME: We want to visit all BBs before their successors (except for
 993      latch blocks, for which this assertion wouldn't hold).  In the simple
 994      case of the loop forms we allow, a dfs order of the BBs would the same
 995      as reversed postorder traversal, so we are safe.  */
 996
 997    free (bbs);
 998    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 999    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1000                               bbs, loop->num_nodes, loop);
1001    gcc_assert (nbbs == loop->num_nodes);
1002
1003   LOOP_VINFO_BBS (res) = bbs;
1004   LOOP_VINFO_NITERSM1 (res) = NULL;
1005   LOOP_VINFO_NITERS (res) = NULL;
1006   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1007   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1008   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1009   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1010   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1011   LOOP_VINFO_VECT_FACTOR (res) = 0;
1012   LOOP_VINFO_LOOP_NEST (res).create (3);
1013   LOOP_VINFO_DATAREFS (res).create (10);
1014   LOOP_VINFO_DDRS (res).create (10 * 10);
1015   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1016   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
1017              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1018   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
1019              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1020   LOOP_VINFO_GROUPED_STORES (res).create (10);
1021   LOOP_VINFO_REDUCTIONS (res).create (10);
1022   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
1023   LOOP_VINFO_SLP_INSTANCES (res).create (10);
1024   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1025   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1026   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1027   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1028   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1029
1030   return res;
1031 }
1032
1033
1034 /* Function destroy_loop_vec_info.
1035
1036    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1037    stmts in the loop.  */
1038
1039 void
1040 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1041 {
1042   struct loop *loop;
1043   basic_block *bbs;
1044   int nbbs;
1045   gimple_stmt_iterator si;
1046   int j;
1047   vec<slp_instance> slp_instances;
1048   slp_instance instance;
1049   bool swapped;
1050
1051   if (!loop_vinfo)
1052     return;
1053
1054   loop = LOOP_VINFO_LOOP (loop_vinfo);
1055
1056   bbs = LOOP_VINFO_BBS (loop_vinfo);
1057   nbbs = clean_stmts ? loop->num_nodes : 0;
1058   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1059
1060   for (j = 0; j < nbbs; j++)
1061     {
1062       basic_block bb = bbs[j];
1063       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1064         free_stmt_vec_info (gsi_stmt (si));
1065
1066       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1067         {
1068           gimple stmt = gsi_stmt (si);
1069
1070           /* We may have broken canonical form by moving a constant
1071              into RHS1 of a commutative op.  Fix such occurrences.  */
1072           if (swapped && is_gimple_assign (stmt))
1073             {
1074               enum tree_code code = gimple_assign_rhs_code (stmt);
1075
1076               if ((code == PLUS_EXPR
1077                    || code == POINTER_PLUS_EXPR
1078                    || code == MULT_EXPR)
1079                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1080                 swap_ssa_operands (stmt,
1081                                    gimple_assign_rhs1_ptr (stmt),
1082                                    gimple_assign_rhs2_ptr (stmt));
1083             }
1084
1085           /* Free stmt_vec_info.  */
1086           free_stmt_vec_info (stmt);
1087           gsi_next (&si);
1088         }
1089     }
1090
1091   free (LOOP_VINFO_BBS (loop_vinfo));
1092   vect_destroy_datarefs (loop_vinfo, NULL);
1093   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1094   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1095   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1096   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1097   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1098   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1099     vect_free_slp_instance (instance);
1100
1101   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1102   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1103   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1104   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1105
1106   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1107   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1108
1109   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1110
1111   free (loop_vinfo);
1112   loop->aux = NULL;
1113 }
1114
1115
1116 /* Function vect_analyze_loop_1.
1117
1118    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1119    for it. The different analyses will record information in the
1120    loop_vec_info struct.  This is a subset of the analyses applied in
1121    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1122    that is now considered for (outer-loop) vectorization.  */
1123
1124 static loop_vec_info
1125 vect_analyze_loop_1 (struct loop *loop)
1126 {
1127   loop_vec_info loop_vinfo;
1128
1129   if (dump_enabled_p ())
1130     dump_printf_loc (MSG_NOTE, vect_location,
1131                      "===== analyze_loop_nest_1 =====\n");
1132
1133   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1134
1135   loop_vinfo = vect_analyze_loop_form (loop);
1136   if (!loop_vinfo)
1137     {
1138       if (dump_enabled_p ())
1139         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140                          "bad inner-loop form.\n");
1141       return NULL;
1142     }
1143
1144   return loop_vinfo;
1145 }
1146
1147
1148 /* Function vect_analyze_loop_form.
1149
1150    Verify that certain CFG restrictions hold, including:
1151    - the loop has a pre-header
1152    - the loop has a single entry and exit
1153    - the loop exit condition is simple enough, and the number of iterations
1154      can be analyzed (a countable loop).  */
1155
1156 loop_vec_info
1157 vect_analyze_loop_form (struct loop *loop)
1158 {
1159   loop_vec_info loop_vinfo;
1160   gcond *loop_cond;
1161   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1162   loop_vec_info inner_loop_vinfo = NULL;
1163
1164   if (dump_enabled_p ())
1165     dump_printf_loc (MSG_NOTE, vect_location,
1166                      "=== vect_analyze_loop_form ===\n");
1167
1168   /* Different restrictions apply when we are considering an inner-most loop,
1169      vs. an outer (nested) loop.
1170      (FORNOW. May want to relax some of these restrictions in the future).  */
1171
1172   if (!loop->inner)
1173     {
1174       /* Inner-most loop.  We currently require that the number of BBs is
1175          exactly 2 (the header and latch).  Vectorizable inner-most loops
1176          look like this:
1177
1178                         (pre-header)
1179                            |
1180                           header <--------+
1181                            | |            |
1182                            | +--> latch --+
1183                            |
1184                         (exit-bb)  */
1185
1186       if (loop->num_nodes != 2)
1187         {
1188           if (dump_enabled_p ())
1189             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1190                              "not vectorized: control flow in loop.\n");
1191           return NULL;
1192         }
1193
1194       if (empty_block_p (loop->header))
1195         {
1196           if (dump_enabled_p ())
1197             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1198                              "not vectorized: empty loop.\n");
1199           return NULL;
1200         }
1201     }
1202   else
1203     {
1204       struct loop *innerloop = loop->inner;
1205       edge entryedge;
1206
1207       /* Nested loop. We currently require that the loop is doubly-nested,
1208          contains a single inner loop, and the number of BBs is exactly 5.
1209          Vectorizable outer-loops look like this:
1210
1211                         (pre-header)
1212                            |
1213                           header <---+
1214                            |         |
1215                           inner-loop |
1216                            |         |
1217                           tail ------+
1218                            |
1219                         (exit-bb)
1220
1221          The inner-loop has the properties expected of inner-most loops
1222          as described above.  */
1223
1224       if ((loop->inner)->inner || (loop->inner)->next)
1225         {
1226           if (dump_enabled_p ())
1227             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228                              "not vectorized: multiple nested loops.\n");
1229           return NULL;
1230         }
1231
1232       /* Analyze the inner-loop.  */
1233       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1234       if (!inner_loop_vinfo)
1235         {
1236           if (dump_enabled_p ())
1237             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238                              "not vectorized: Bad inner loop.\n");
1239           return NULL;
1240         }
1241
1242       if (!expr_invariant_in_loop_p (loop,
1243                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1244         {
1245           if (dump_enabled_p ())
1246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247                              "not vectorized: inner-loop count not"
1248                              " invariant.\n");
1249           destroy_loop_vec_info (inner_loop_vinfo, true);
1250           return NULL;
1251         }
1252
1253       if (loop->num_nodes != 5)
1254         {
1255           if (dump_enabled_p ())
1256             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257                              "not vectorized: control flow in loop.\n");
1258           destroy_loop_vec_info (inner_loop_vinfo, true);
1259           return NULL;
1260         }
1261
1262       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1263       entryedge = EDGE_PRED (innerloop->header, 0);
1264       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1265         entryedge = EDGE_PRED (innerloop->header, 1);
1266
1267       if (entryedge->src != loop->header
1268           || !single_exit (innerloop)
1269           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1270         {
1271           if (dump_enabled_p ())
1272             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273                              "not vectorized: unsupported outerloop form.\n");
1274           destroy_loop_vec_info (inner_loop_vinfo, true);
1275           return NULL;
1276         }
1277
1278       if (dump_enabled_p ())
1279         dump_printf_loc (MSG_NOTE, vect_location,
1280                          "Considering outer-loop vectorization.\n");
1281     }
1282
1283   if (!single_exit (loop)
1284       || EDGE_COUNT (loop->header->preds) != 2)
1285     {
1286       if (dump_enabled_p ())
1287         {
1288           if (!single_exit (loop))
1289             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290                              "not vectorized: multiple exits.\n");
1291           else if (EDGE_COUNT (loop->header->preds) != 2)
1292             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293                              "not vectorized: too many incoming edges.\n");
1294         }
1295       if (inner_loop_vinfo)
1296         destroy_loop_vec_info (inner_loop_vinfo, true);
1297       return NULL;
1298     }
1299
1300   /* We assume that the loop exit condition is at the end of the loop. i.e,
1301      that the loop is represented as a do-while (with a proper if-guard
1302      before the loop if needed), where the loop header contains all the
1303      executable statements, and the latch is empty.  */
1304   if (!empty_block_p (loop->latch)
1305       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1306     {
1307       if (dump_enabled_p ())
1308         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1309                          "not vectorized: latch block not empty.\n");
1310       if (inner_loop_vinfo)
1311         destroy_loop_vec_info (inner_loop_vinfo, true);
1312       return NULL;
1313     }
1314
1315   /* Make sure there exists a single-predecessor exit bb:  */
1316   if (!single_pred_p (single_exit (loop)->dest))
1317     {
1318       edge e = single_exit (loop);
1319       if (!(e->flags & EDGE_ABNORMAL))
1320         {
1321           split_loop_exit_edge (e);
1322           if (dump_enabled_p ())
1323             dump_printf (MSG_NOTE, "split exit edge.\n");
1324         }
1325       else
1326         {
1327           if (dump_enabled_p ())
1328             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329                              "not vectorized: abnormal loop exit edge.\n");
1330           if (inner_loop_vinfo)
1331             destroy_loop_vec_info (inner_loop_vinfo, true);
1332           return NULL;
1333         }
1334     }
1335
1336   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1337                                     &number_of_iterationsm1);
1338   if (!loop_cond)
1339     {
1340       if (dump_enabled_p ())
1341         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1342                          "not vectorized: complicated exit condition.\n");
1343       if (inner_loop_vinfo)
1344         destroy_loop_vec_info (inner_loop_vinfo, true);
1345       return NULL;
1346     }
1347
1348   if (!number_of_iterations
1349       || chrec_contains_undetermined (number_of_iterations))
1350     {
1351       if (dump_enabled_p ())
1352         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353                          "not vectorized: number of iterations cannot be "
1354                          "computed.\n");
1355       if (inner_loop_vinfo)
1356         destroy_loop_vec_info (inner_loop_vinfo, true);
1357       return NULL;
1358     }
1359
1360   if (integer_zerop (number_of_iterations))
1361     {
1362       if (dump_enabled_p ())
1363         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1364                          "not vectorized: number of iterations = 0.\n");
1365       if (inner_loop_vinfo)
1366         destroy_loop_vec_info (inner_loop_vinfo, true);
1367       return NULL;
1368     }
1369
1370   loop_vinfo = new_loop_vec_info (loop);
1371   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1372   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1373   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1374
1375   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1376     {
1377       if (dump_enabled_p ())
1378         {
1379           dump_printf_loc (MSG_NOTE, vect_location,
1380                            "Symbolic number of iterations is ");
1381           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1382           dump_printf (MSG_NOTE, "\n");
1383         }
1384     }
1385
1386   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1387
1388   /* CHECKME: May want to keep it around it in the future.  */
1389   if (inner_loop_vinfo)
1390     destroy_loop_vec_info (inner_loop_vinfo, false);
1391
1392   gcc_assert (!loop->aux);
1393   loop->aux = loop_vinfo;
1394   return loop_vinfo;
1395 }
1396
1397 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1398    statements update the vectorization factor.  */
1399
1400 static void
1401 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1402 {
1403   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1404   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1405   int nbbs = loop->num_nodes;
1406   unsigned int vectorization_factor;
1407   int i;
1408
1409   if (dump_enabled_p ())
1410     dump_printf_loc (MSG_NOTE, vect_location,
1411                      "=== vect_update_vf_for_slp ===\n");
1412
1413   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1414   gcc_assert (vectorization_factor != 0);
1415
1416   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1417      vectorization factor of the loop is the unrolling factor required by
1418      the SLP instances.  If that unrolling factor is 1, we say, that we
1419      perform pure SLP on loop - cross iteration parallelism is not
1420      exploited.  */
1421   bool only_slp_in_loop = true;
1422   for (i = 0; i < nbbs; i++)
1423     {
1424       basic_block bb = bbs[i];
1425       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1426            gsi_next (&si))
1427         {
1428           gimple stmt = gsi_stmt (si);
1429           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1430           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1431               && STMT_VINFO_RELATED_STMT (stmt_info))
1432             {
1433               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1434               stmt_info = vinfo_for_stmt (stmt);
1435             }
1436           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1437                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1438               && !PURE_SLP_STMT (stmt_info))
1439             /* STMT needs both SLP and loop-based vectorization.  */
1440             only_slp_in_loop = false;
1441         }
1442     }
1443
1444   if (only_slp_in_loop)
1445     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1446   else
1447     vectorization_factor
1448       = least_common_multiple (vectorization_factor,
1449                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1450
1451   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1452   if (dump_enabled_p ())
1453     dump_printf_loc (MSG_NOTE, vect_location,
1454                      "Updating vectorization factor to %d\n",
1455                      vectorization_factor);
1456 }
1457
1458 /* Function vect_analyze_loop_operations.
1459
1460    Scan the loop stmts and make sure they are all vectorizable.  */
1461
1462 static bool
1463 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1464 {
1465   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1466   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1467   int nbbs = loop->num_nodes;
1468   unsigned int vectorization_factor;
1469   int i;
1470   stmt_vec_info stmt_info;
1471   bool need_to_vectorize = false;
1472   int min_profitable_iters;
1473   int min_scalar_loop_bound;
1474   unsigned int th;
1475   bool ok;
1476   HOST_WIDE_INT max_niter;
1477   HOST_WIDE_INT estimated_niter;
1478   int min_profitable_estimate;
1479
1480   if (dump_enabled_p ())
1481     dump_printf_loc (MSG_NOTE, vect_location,
1482                      "=== vect_analyze_loop_operations ===\n");
1483
1484   for (i = 0; i < nbbs; i++)
1485     {
1486       basic_block bb = bbs[i];
1487
1488       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1489            gsi_next (&si))
1490         {
1491           gphi *phi = si.phi ();
1492           ok = true;
1493
1494           stmt_info = vinfo_for_stmt (phi);
1495           if (dump_enabled_p ())
1496             {
1497               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1498               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1499               dump_printf (MSG_NOTE, "\n");
1500             }
1501
1502           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1503              (i.e., a phi in the tail of the outer-loop).  */
1504           if (! is_loop_header_bb_p (bb))
1505             {
1506               /* FORNOW: we currently don't support the case that these phis
1507                  are not used in the outerloop (unless it is double reduction,
1508                  i.e., this phi is vect_reduction_def), cause this case
1509                  requires to actually do something here.  */
1510               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1511                    || STMT_VINFO_LIVE_P (stmt_info))
1512                   && STMT_VINFO_DEF_TYPE (stmt_info)
1513                      != vect_double_reduction_def)
1514                 {
1515                   if (dump_enabled_p ())
1516                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1517                                      "Unsupported loop-closed phi in "
1518                                      "outer-loop.\n");
1519                   return false;
1520                 }
1521
1522               /* If PHI is used in the outer loop, we check that its operand
1523                  is defined in the inner loop.  */
1524               if (STMT_VINFO_RELEVANT_P (stmt_info))
1525                 {
1526                   tree phi_op;
1527                   gimple op_def_stmt;
1528
1529                   if (gimple_phi_num_args (phi) != 1)
1530                     return false;
1531
1532                   phi_op = PHI_ARG_DEF (phi, 0);
1533                   if (TREE_CODE (phi_op) != SSA_NAME)
1534                     return false;
1535
1536                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1537                   if (gimple_nop_p (op_def_stmt)
1538                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1539                       || !vinfo_for_stmt (op_def_stmt))
1540                     return false;
1541
1542                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1543                         != vect_used_in_outer
1544                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1545                            != vect_used_in_outer_by_reduction)
1546                     return false;
1547                 }
1548
1549               continue;
1550             }
1551
1552           gcc_assert (stmt_info);
1553
1554           if (STMT_VINFO_LIVE_P (stmt_info))
1555             {
1556               /* FORNOW: not yet supported.  */
1557               if (dump_enabled_p ())
1558                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1559                                  "not vectorized: value used after loop.\n");
1560               return false;
1561             }
1562
1563           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1564               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1565             {
1566               /* A scalar-dependence cycle that we don't support.  */
1567               if (dump_enabled_p ())
1568                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1569                                  "not vectorized: scalar dependence cycle.\n");
1570               return false;
1571             }
1572
1573           if (STMT_VINFO_RELEVANT_P (stmt_info))
1574             {
1575               need_to_vectorize = true;
1576               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1577                 ok = vectorizable_induction (phi, NULL, NULL);
1578             }
1579
1580           if (!ok)
1581             {
1582               if (dump_enabled_p ())
1583                 {
1584                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585                                    "not vectorized: relevant phi not "
1586                                    "supported: ");
1587                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1588                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1589                 }
1590               return false;
1591             }
1592         }
1593
1594       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1595            gsi_next (&si))
1596         {
1597           gimple stmt = gsi_stmt (si);
1598           if (!gimple_clobber_p (stmt)
1599               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1600             return false;
1601         }
1602     } /* bbs */
1603
1604   /* All operations in the loop are either irrelevant (deal with loop
1605      control, or dead), or only used outside the loop and can be moved
1606      out of the loop (e.g. invariants, inductions).  The loop can be
1607      optimized away by scalar optimizations.  We're better off not
1608      touching this loop.  */
1609   if (!need_to_vectorize)
1610     {
1611       if (dump_enabled_p ())
1612         dump_printf_loc (MSG_NOTE, vect_location,
1613                          "All the computation can be taken out of the loop.\n");
1614       if (dump_enabled_p ())
1615         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1616                          "not vectorized: redundant loop. no profit to "
1617                          "vectorize.\n");
1618       return false;
1619     }
1620
1621   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1622   gcc_assert (vectorization_factor != 0);
1623
1624   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1625     dump_printf_loc (MSG_NOTE, vect_location,
1626                      "vectorization_factor = %d, niters = "
1627                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1628                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1629
1630   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1631        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1632       || ((max_niter = max_stmt_executions_int (loop)) != -1
1633           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1634     {
1635       if (dump_enabled_p ())
1636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637                          "not vectorized: iteration count too small.\n");
1638       if (dump_enabled_p ())
1639         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640                          "not vectorized: iteration count smaller than "
1641                          "vectorization factor.\n");
1642       return false;
1643     }
1644
1645   /* Analyze cost.  Decide if worth while to vectorize.  */
1646
1647   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1648                                       &min_profitable_estimate);
1649   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1650
1651   if (min_profitable_iters < 0)
1652     {
1653       if (dump_enabled_p ())
1654         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655                          "not vectorized: vectorization not profitable.\n");
1656       if (dump_enabled_p ())
1657         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1658                          "not vectorized: vector version will never be "
1659                          "profitable.\n");
1660       return false;
1661     }
1662
1663   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1664                             * vectorization_factor) - 1);
1665
1666
1667   /* Use the cost model only if it is more conservative than user specified
1668      threshold.  */
1669
1670   th = (unsigned) min_scalar_loop_bound;
1671   if (min_profitable_iters
1672       && (!min_scalar_loop_bound
1673           || min_profitable_iters > min_scalar_loop_bound))
1674     th = (unsigned) min_profitable_iters;
1675
1676   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1677
1678   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1679       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1680     {
1681       if (dump_enabled_p ())
1682         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683                          "not vectorized: vectorization not profitable.\n");
1684       if (dump_enabled_p ())
1685         dump_printf_loc (MSG_NOTE, vect_location,
1686                          "not vectorized: iteration count smaller than user "
1687                          "specified loop bound parameter or minimum profitable "
1688                          "iterations (whichever is more conservative).\n");
1689       return false;
1690     }
1691
1692   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1693       && ((unsigned HOST_WIDE_INT) estimated_niter
1694           <= MAX (th, (unsigned)min_profitable_estimate)))
1695     {
1696       if (dump_enabled_p ())
1697         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698                          "not vectorized: estimated iteration count too "
1699                          "small.\n");
1700       if (dump_enabled_p ())
1701         dump_printf_loc (MSG_NOTE, vect_location,
1702                          "not vectorized: estimated iteration count smaller "
1703                          "than specified loop bound parameter or minimum "
1704                          "profitable iterations (whichever is more "
1705                          "conservative).\n");
1706       return false;
1707     }
1708
1709   return true;
1710 }
1711
1712
1713 /* Function vect_analyze_loop_2.
1714
1715    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1716    for it.  The different analyses will record information in the
1717    loop_vec_info struct.  */
1718 static bool
1719 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1720 {
1721   bool ok;
1722   int max_vf = MAX_VECTORIZATION_FACTOR;
1723   int min_vf = 2;
1724   unsigned int th;
1725   unsigned int n_stmts = 0;
1726
1727   /* Find all data references in the loop (which correspond to vdefs/vuses)
1728      and analyze their evolution in the loop.  Also adjust the minimal
1729      vectorization factor according to the loads and stores.
1730
1731      FORNOW: Handle only simple, array references, which
1732      alignment can be forced, and aligned pointer-references.  */
1733
1734   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1735   if (!ok)
1736     {
1737       if (dump_enabled_p ())
1738         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1739                          "bad data references.\n");
1740       return false;
1741     }
1742
1743   /* Classify all cross-iteration scalar data-flow cycles.
1744      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1745
1746   vect_analyze_scalar_cycles (loop_vinfo);
1747
1748   vect_pattern_recog (loop_vinfo, NULL);
1749
1750   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1751
1752   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1753      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1754
1755   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1756   if (!ok)
1757     {
1758       if (dump_enabled_p ())
1759         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1760                          "bad data access.\n");
1761       return false;
1762     }
1763
1764   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1765
1766   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1767   if (!ok)
1768     {
1769       if (dump_enabled_p ())
1770         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1771                          "unexpected pattern.\n");
1772       return false;
1773     }
1774
1775   /* Analyze data dependences between the data-refs in the loop
1776      and adjust the maximum vectorization factor according to
1777      the dependences.
1778      FORNOW: fail at the first data dependence that we encounter.  */
1779
1780   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1781   if (!ok
1782       || max_vf < min_vf)
1783     {
1784       if (dump_enabled_p ())
1785             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786                              "bad data dependence.\n");
1787       return false;
1788     }
1789
1790   ok = vect_determine_vectorization_factor (loop_vinfo);
1791   if (!ok)
1792     {
1793       if (dump_enabled_p ())
1794         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1795                          "can't determine vectorization factor.\n");
1796       return false;
1797     }
1798   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1799     {
1800       if (dump_enabled_p ())
1801         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1802                          "bad data dependence.\n");
1803       return false;
1804     }
1805
1806   /* Analyze the alignment of the data-refs in the loop.
1807      Fail if a data reference is found that cannot be vectorized.  */
1808
1809   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1810   if (!ok)
1811     {
1812       if (dump_enabled_p ())
1813         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1814                          "bad data alignment.\n");
1815       return false;
1816     }
1817
1818   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1819      It is important to call pruning after vect_analyze_data_ref_accesses,
1820      since we use grouping information gathered by interleaving analysis.  */
1821   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1822   if (!ok)
1823     {
1824       if (dump_enabled_p ())
1825         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826                          "number of versioning for alias "
1827                          "run-time tests exceeds %d "
1828                          "(--param vect-max-version-for-alias-checks)\n",
1829                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1830       return false;
1831     }
1832
1833   /* This pass will decide on using loop versioning and/or loop peeling in
1834      order to enhance the alignment of data references in the loop.  */
1835
1836   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1837   if (!ok)
1838     {
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841                          "bad data alignment.\n");
1842       return false;
1843     }
1844
1845   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1846   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1847   if (ok)
1848     {
1849       /* If there are any SLP instances mark them as pure_slp.  */
1850       if (vect_make_slp_decision (loop_vinfo))
1851         {
1852           /* Find stmts that need to be both vectorized and SLPed.  */
1853           vect_detect_hybrid_slp (loop_vinfo);
1854
1855           /* Update the vectorization factor based on the SLP decision.  */
1856           vect_update_vf_for_slp (loop_vinfo);
1857
1858           /* Analyze operations in the SLP instances.  Note this may
1859              remove unsupported SLP instances which makes the above
1860              SLP kind detection invalid.  */
1861           unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1862           vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1863                                        LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1864           if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1865             return false;
1866         }
1867     }
1868   else
1869     return false;
1870
1871   /* Scan all the remaining operations in the loop that are not subject
1872      to SLP and make sure they are vectorizable.  */
1873   ok = vect_analyze_loop_operations (loop_vinfo);
1874   if (!ok)
1875     {
1876       if (dump_enabled_p ())
1877         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878                          "bad operation or unsupported loop bound.\n");
1879       return false;
1880     }
1881
1882   /* Decide whether we need to create an epilogue loop to handle
1883      remaining scalar iterations.  */
1884   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1885         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1886        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1887
1888   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1889       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1890     {
1891       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1892                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1893           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1894         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1895     }
1896   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1897            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1898                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1899                /* In case of versioning, check if the maximum number of
1900                   iterations is greater than th.  If they are identical,
1901                   the epilogue is unnecessary.  */
1902                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1903                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1904                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1905                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1906     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1907
1908   /* If an epilogue loop is required make sure we can create one.  */
1909   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1910       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1911     {
1912       if (dump_enabled_p ())
1913         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1914       if (!vect_can_advance_ivs_p (loop_vinfo)
1915           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1916                                            single_exit (LOOP_VINFO_LOOP
1917                                                          (loop_vinfo))))
1918         {
1919           if (dump_enabled_p ())
1920             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1921                              "not vectorized: can't create required "
1922                              "epilog loop\n");
1923           return false;
1924         }
1925     }
1926
1927   return true;
1928 }
1929
1930 /* Function vect_analyze_loop.
1931
1932    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1933    for it.  The different analyses will record information in the
1934    loop_vec_info struct.  */
1935 loop_vec_info
1936 vect_analyze_loop (struct loop *loop)
1937 {
1938   loop_vec_info loop_vinfo;
1939   unsigned int vector_sizes;
1940
1941   /* Autodetect first vector size we try.  */
1942   current_vector_size = 0;
1943   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1944
1945   if (dump_enabled_p ())
1946     dump_printf_loc (MSG_NOTE, vect_location,
1947                      "===== analyze_loop_nest =====\n");
1948
1949   if (loop_outer (loop)
1950       && loop_vec_info_for_loop (loop_outer (loop))
1951       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1952     {
1953       if (dump_enabled_p ())
1954         dump_printf_loc (MSG_NOTE, vect_location,
1955                          "outer-loop already vectorized.\n");
1956       return NULL;
1957     }
1958
1959   while (1)
1960     {
1961       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1962       loop_vinfo = vect_analyze_loop_form (loop);
1963       if (!loop_vinfo)
1964         {
1965           if (dump_enabled_p ())
1966             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967                              "bad loop form.\n");
1968           return NULL;
1969         }
1970
1971       if (vect_analyze_loop_2 (loop_vinfo))
1972         {
1973           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1974
1975           return loop_vinfo;
1976         }
1977
1978       destroy_loop_vec_info (loop_vinfo, true);
1979
1980       vector_sizes &= ~current_vector_size;
1981       if (vector_sizes == 0
1982           || current_vector_size == 0)
1983         return NULL;
1984
1985       /* Try the next biggest vector size.  */
1986       current_vector_size = 1 << floor_log2 (vector_sizes);
1987       if (dump_enabled_p ())
1988         dump_printf_loc (MSG_NOTE, vect_location,
1989                          "***** Re-trying analysis with "
1990                          "vector size %d\n", current_vector_size);
1991     }
1992 }
1993
1994
1995 /* Function reduction_code_for_scalar_code
1996
1997    Input:
1998    CODE - tree_code of a reduction operations.
1999
2000    Output:
2001    REDUC_CODE - the corresponding tree-code to be used to reduce the
2002       vector of partial results into a single scalar result, or ERROR_MARK
2003       if the operation is a supported reduction operation, but does not have
2004       such a tree-code.
2005
2006    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2007
2008 static bool
2009 reduction_code_for_scalar_code (enum tree_code code,
2010                                 enum tree_code *reduc_code)
2011 {
2012   switch (code)
2013     {
2014       case MAX_EXPR:
2015         *reduc_code = REDUC_MAX_EXPR;
2016         return true;
2017
2018       case MIN_EXPR:
2019         *reduc_code = REDUC_MIN_EXPR;
2020         return true;
2021
2022       case PLUS_EXPR:
2023         *reduc_code = REDUC_PLUS_EXPR;
2024         return true;
2025
2026       case MULT_EXPR:
2027       case MINUS_EXPR:
2028       case BIT_IOR_EXPR:
2029       case BIT_XOR_EXPR:
2030       case BIT_AND_EXPR:
2031         *reduc_code = ERROR_MARK;
2032         return true;
2033
2034       default:
2035        return false;
2036     }
2037 }
2038
2039
2040 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2041    STMT is printed with a message MSG. */
2042
2043 static void
2044 report_vect_op (int msg_type, gimple stmt, const char *msg)
2045 {
2046   dump_printf_loc (msg_type, vect_location, "%s", msg);
2047   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2048   dump_printf (msg_type, "\n");
2049 }
2050
2051
2052 /* Detect SLP reduction of the form:
2053
2054    #a1 = phi <a5, a0>
2055    a2 = operation (a1)
2056    a3 = operation (a2)
2057    a4 = operation (a3)
2058    a5 = operation (a4)
2059
2060    #a = phi <a5>
2061
2062    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2063    FIRST_STMT is the first reduction stmt in the chain
2064    (a2 = operation (a1)).
2065
2066    Return TRUE if a reduction chain was detected.  */
2067
2068 static bool
2069 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2070 {
2071   struct loop *loop = (gimple_bb (phi))->loop_father;
2072   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2073   enum tree_code code;
2074   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2075   stmt_vec_info use_stmt_info, current_stmt_info;
2076   tree lhs;
2077   imm_use_iterator imm_iter;
2078   use_operand_p use_p;
2079   int nloop_uses, size = 0, n_out_of_loop_uses;
2080   bool found = false;
2081
2082   if (loop != vect_loop)
2083     return false;
2084
2085   lhs = PHI_RESULT (phi);
2086   code = gimple_assign_rhs_code (first_stmt);
2087   while (1)
2088     {
2089       nloop_uses = 0;
2090       n_out_of_loop_uses = 0;
2091       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2092         {
2093           gimple use_stmt = USE_STMT (use_p);
2094           if (is_gimple_debug (use_stmt))
2095             continue;
2096
2097           /* Check if we got back to the reduction phi.  */
2098           if (use_stmt == phi)
2099             {
2100               loop_use_stmt = use_stmt;
2101               found = true;
2102               break;
2103             }
2104
2105           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2106             {
2107               loop_use_stmt = use_stmt;
2108               nloop_uses++;
2109             }
2110            else
2111              n_out_of_loop_uses++;
2112
2113            /* There are can be either a single use in the loop or two uses in
2114               phi nodes.  */
2115            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2116              return false;
2117         }
2118
2119       if (found)
2120         break;
2121
2122       /* We reached a statement with no loop uses.  */
2123       if (nloop_uses == 0)
2124         return false;
2125
2126       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2127       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2128         return false;
2129
2130       if (!is_gimple_assign (loop_use_stmt)
2131           || code != gimple_assign_rhs_code (loop_use_stmt)
2132           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2133         return false;
2134
2135       /* Insert USE_STMT into reduction chain.  */
2136       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2137       if (current_stmt)
2138         {
2139           current_stmt_info = vinfo_for_stmt (current_stmt);
2140           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2141           GROUP_FIRST_ELEMENT (use_stmt_info)
2142             = GROUP_FIRST_ELEMENT (current_stmt_info);
2143         }
2144       else
2145         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2146
2147       lhs = gimple_assign_lhs (loop_use_stmt);
2148       current_stmt = loop_use_stmt;
2149       size++;
2150    }
2151
2152   if (!found || loop_use_stmt != phi || size < 2)
2153     return false;
2154
2155   /* Swap the operands, if needed, to make the reduction operand be the second
2156      operand.  */
2157   lhs = PHI_RESULT (phi);
2158   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2159   while (next_stmt)
2160     {
2161       if (gimple_assign_rhs2 (next_stmt) == lhs)
2162         {
2163           tree op = gimple_assign_rhs1 (next_stmt);
2164           gimple def_stmt = NULL;
2165
2166           if (TREE_CODE (op) == SSA_NAME)
2167             def_stmt = SSA_NAME_DEF_STMT (op);
2168
2169           /* Check that the other def is either defined in the loop
2170              ("vect_internal_def"), or it's an induction (defined by a
2171              loop-header phi-node).  */
2172           if (def_stmt
2173               && gimple_bb (def_stmt)
2174               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2175               && (is_gimple_assign (def_stmt)
2176                   || is_gimple_call (def_stmt)
2177                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2178                            == vect_induction_def
2179                   || (gimple_code (def_stmt) == GIMPLE_PHI
2180                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2181                                   == vect_internal_def
2182                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2183             {
2184               lhs = gimple_assign_lhs (next_stmt);
2185               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2186               continue;
2187             }
2188
2189           return false;
2190         }
2191       else
2192         {
2193           tree op = gimple_assign_rhs2 (next_stmt);
2194           gimple def_stmt = NULL;
2195
2196           if (TREE_CODE (op) == SSA_NAME)
2197             def_stmt = SSA_NAME_DEF_STMT (op);
2198
2199           /* Check that the other def is either defined in the loop
2200             ("vect_internal_def"), or it's an induction (defined by a
2201             loop-header phi-node).  */
2202           if (def_stmt
2203               && gimple_bb (def_stmt)
2204               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2205               && (is_gimple_assign (def_stmt)
2206                   || is_gimple_call (def_stmt)
2207                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2208                               == vect_induction_def
2209                   || (gimple_code (def_stmt) == GIMPLE_PHI
2210                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2211                                   == vect_internal_def
2212                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2213             {
2214               if (dump_enabled_p ())
2215                 {
2216                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2217                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2218                   dump_printf (MSG_NOTE, "\n");
2219                 }
2220
2221               swap_ssa_operands (next_stmt,
2222                                  gimple_assign_rhs1_ptr (next_stmt),
2223                                  gimple_assign_rhs2_ptr (next_stmt));
2224               update_stmt (next_stmt);
2225
2226               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2227                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2228             }
2229           else
2230             return false;
2231         }
2232
2233       lhs = gimple_assign_lhs (next_stmt);
2234       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2235     }
2236
2237   /* Save the chain for further analysis in SLP detection.  */
2238   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2239   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2240   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2241
2242   return true;
2243 }
2244
2245
2246 /* Function vect_is_simple_reduction_1
2247
2248    (1) Detect a cross-iteration def-use cycle that represents a simple
2249    reduction computation.  We look for the following pattern:
2250
2251    loop_header:
2252      a1 = phi < a0, a2 >
2253      a3 = ...
2254      a2 = operation (a3, a1)
2255
2256    or
2257
2258    a3 = ...
2259    loop_header:
2260      a1 = phi < a0, a2 >
2261      a2 = operation (a3, a1)
2262
2263    such that:
2264    1. operation is commutative and associative and it is safe to
2265       change the order of the computation (if CHECK_REDUCTION is true)
2266    2. no uses for a2 in the loop (a2 is used out of the loop)
2267    3. no uses of a1 in the loop besides the reduction operation
2268    4. no uses of a1 outside the loop.
2269
2270    Conditions 1,4 are tested here.
2271    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2272
2273    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2274    nested cycles, if CHECK_REDUCTION is false.
2275
2276    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2277    reductions:
2278
2279      a1 = phi < a0, a2 >
2280      inner loop (def of a3)
2281      a2 = phi < a3 >
2282
2283    If MODIFY is true it tries also to rework the code in-place to enable
2284    detection of more reduction patterns.  For the time being we rewrite
2285    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2286 */
2287
2288 static gimple
2289 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2290                             bool check_reduction, bool *double_reduc,
2291                             bool modify)
2292 {
2293   struct loop *loop = (gimple_bb (phi))->loop_father;
2294   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2295   edge latch_e = loop_latch_edge (loop);
2296   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2297   gimple def_stmt, def1 = NULL, def2 = NULL;
2298   enum tree_code orig_code, code;
2299   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2300   tree type;
2301   int nloop_uses;
2302   tree name;
2303   imm_use_iterator imm_iter;
2304   use_operand_p use_p;
2305   bool phi_def;
2306
2307   *double_reduc = false;
2308
2309   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2310      otherwise, we assume outer loop vectorization.  */
2311   gcc_assert ((check_reduction && loop == vect_loop)
2312               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2313
2314   name = PHI_RESULT (phi);
2315   /* ???  If there are no uses of the PHI result the inner loop reduction
2316      won't be detected as possibly double-reduction by vectorizable_reduction
2317      because that tries to walk the PHI arg from the preheader edge which
2318      can be constant.  See PR60382.  */
2319   if (has_zero_uses (name))
2320     return NULL;
2321   nloop_uses = 0;
2322   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2323     {
2324       gimple use_stmt = USE_STMT (use_p);
2325       if (is_gimple_debug (use_stmt))
2326         continue;
2327
2328       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2329         {
2330           if (dump_enabled_p ())
2331             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332                              "intermediate value used outside loop.\n");
2333
2334           return NULL;
2335         }
2336
2337       nloop_uses++;
2338       if (nloop_uses > 1)
2339         {
2340           if (dump_enabled_p ())
2341             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2342                              "reduction used in loop.\n");
2343           return NULL;
2344         }
2345     }
2346
2347   if (TREE_CODE (loop_arg) != SSA_NAME)
2348     {
2349       if (dump_enabled_p ())
2350         {
2351           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352                            "reduction: not ssa_name: ");
2353           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2354           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2355         }
2356       return NULL;
2357     }
2358
2359   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2360   if (!def_stmt)
2361     {
2362       if (dump_enabled_p ())
2363         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364                          "reduction: no def_stmt.\n");
2365       return NULL;
2366     }
2367
2368   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2369     {
2370       if (dump_enabled_p ())
2371         {
2372           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2373           dump_printf (MSG_NOTE, "\n");
2374         }
2375       return NULL;
2376     }
2377
2378   if (is_gimple_assign (def_stmt))
2379     {
2380       name = gimple_assign_lhs (def_stmt);
2381       phi_def = false;
2382     }
2383   else
2384     {
2385       name = PHI_RESULT (def_stmt);
2386       phi_def = true;
2387     }
2388
2389   nloop_uses = 0;
2390   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2391     {
2392       gimple use_stmt = USE_STMT (use_p);
2393       if (is_gimple_debug (use_stmt))
2394         continue;
2395       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2396         nloop_uses++;
2397       if (nloop_uses > 1)
2398         {
2399           if (dump_enabled_p ())
2400             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401                              "reduction used in loop.\n");
2402           return NULL;
2403         }
2404     }
2405
2406   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2407      defined in the inner loop.  */
2408   if (phi_def)
2409     {
2410       op1 = PHI_ARG_DEF (def_stmt, 0);
2411
2412       if (gimple_phi_num_args (def_stmt) != 1
2413           || TREE_CODE (op1) != SSA_NAME)
2414         {
2415           if (dump_enabled_p ())
2416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2417                              "unsupported phi node definition.\n");
2418
2419           return NULL;
2420         }
2421
2422       def1 = SSA_NAME_DEF_STMT (op1);
2423       if (gimple_bb (def1)
2424           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2425           && loop->inner
2426           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2427           && is_gimple_assign (def1))
2428         {
2429           if (dump_enabled_p ())
2430             report_vect_op (MSG_NOTE, def_stmt,
2431                             "detected double reduction: ");
2432
2433           *double_reduc = true;
2434           return def_stmt;
2435         }
2436
2437       return NULL;
2438     }
2439
2440   code = orig_code = gimple_assign_rhs_code (def_stmt);
2441
2442   /* We can handle "res -= x[i]", which is non-associative by
2443      simply rewriting this into "res += -x[i]".  Avoid changing
2444      gimple instruction for the first simple tests and only do this
2445      if we're allowed to change code at all.  */
2446   if (code == MINUS_EXPR
2447       && modify
2448       && (op1 = gimple_assign_rhs1 (def_stmt))
2449       && TREE_CODE (op1) == SSA_NAME
2450       && SSA_NAME_DEF_STMT (op1) == phi)
2451     code = PLUS_EXPR;
2452
2453   if (check_reduction
2454       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2455     {
2456       if (dump_enabled_p ())
2457         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2458                         "reduction: not commutative/associative: ");
2459       return NULL;
2460     }
2461
2462   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2463     {
2464       if (code != COND_EXPR)
2465         {
2466           if (dump_enabled_p ())
2467             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2468                             "reduction: not binary operation: ");
2469
2470           return NULL;
2471         }
2472
2473       op3 = gimple_assign_rhs1 (def_stmt);
2474       if (COMPARISON_CLASS_P (op3))
2475         {
2476           op4 = TREE_OPERAND (op3, 1);
2477           op3 = TREE_OPERAND (op3, 0);
2478         }
2479
2480       op1 = gimple_assign_rhs2 (def_stmt);
2481       op2 = gimple_assign_rhs3 (def_stmt);
2482
2483       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2484         {
2485           if (dump_enabled_p ())
2486             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2487                             "reduction: uses not ssa_names: ");
2488
2489           return NULL;
2490         }
2491     }
2492   else
2493     {
2494       op1 = gimple_assign_rhs1 (def_stmt);
2495       op2 = gimple_assign_rhs2 (def_stmt);
2496
2497       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2498         {
2499           if (dump_enabled_p ())
2500             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2501                             "reduction: uses not ssa_names: ");
2502
2503           return NULL;
2504         }
2505    }
2506
2507   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2508   if ((TREE_CODE (op1) == SSA_NAME
2509        && !types_compatible_p (type,TREE_TYPE (op1)))
2510       || (TREE_CODE (op2) == SSA_NAME
2511           && !types_compatible_p (type, TREE_TYPE (op2)))
2512       || (op3 && TREE_CODE (op3) == SSA_NAME
2513           && !types_compatible_p (type, TREE_TYPE (op3)))
2514       || (op4 && TREE_CODE (op4) == SSA_NAME
2515           && !types_compatible_p (type, TREE_TYPE (op4))))
2516     {
2517       if (dump_enabled_p ())
2518         {
2519           dump_printf_loc (MSG_NOTE, vect_location,
2520                            "reduction: multiple types: operation type: ");
2521           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2522           dump_printf (MSG_NOTE, ", operands types: ");
2523           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2524                              TREE_TYPE (op1));
2525           dump_printf (MSG_NOTE, ",");
2526           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2527                              TREE_TYPE (op2));
2528           if (op3)
2529             {
2530               dump_printf (MSG_NOTE, ",");
2531               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2532                                  TREE_TYPE (op3));
2533             }
2534
2535           if (op4)
2536             {
2537               dump_printf (MSG_NOTE, ",");
2538               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2539                                  TREE_TYPE (op4));
2540             }
2541           dump_printf (MSG_NOTE, "\n");
2542         }
2543
2544       return NULL;
2545     }
2546
2547   /* Check that it's ok to change the order of the computation.
2548      Generally, when vectorizing a reduction we change the order of the
2549      computation.  This may change the behavior of the program in some
2550      cases, so we need to check that this is ok.  One exception is when
2551      vectorizing an outer-loop: the inner-loop is executed sequentially,
2552      and therefore vectorizing reductions in the inner-loop during
2553      outer-loop vectorization is safe.  */
2554
2555   /* CHECKME: check for !flag_finite_math_only too?  */
2556   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2557       && check_reduction)
2558     {
2559       /* Changing the order of operations changes the semantics.  */
2560       if (dump_enabled_p ())
2561         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2562                         "reduction: unsafe fp math optimization: ");
2563       return NULL;
2564     }
2565   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2566            && check_reduction)
2567     {
2568       /* Changing the order of operations changes the semantics.  */
2569       if (dump_enabled_p ())
2570         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2571                         "reduction: unsafe int math optimization: ");
2572       return NULL;
2573     }
2574   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2575     {
2576       /* Changing the order of operations changes the semantics.  */
2577       if (dump_enabled_p ())
2578         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2579                         "reduction: unsafe fixed-point math optimization: ");
2580       return NULL;
2581     }
2582
2583   /* If we detected "res -= x[i]" earlier, rewrite it into
2584      "res += -x[i]" now.  If this turns out to be useless reassoc
2585      will clean it up again.  */
2586   if (orig_code == MINUS_EXPR)
2587     {
2588       tree rhs = gimple_assign_rhs2 (def_stmt);
2589       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2590       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2591       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2592       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2593                                                           loop_info, NULL));
2594       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2595       gimple_assign_set_rhs2 (def_stmt, negrhs);
2596       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2597       update_stmt (def_stmt);
2598     }
2599
2600   /* Reduction is safe. We're dealing with one of the following:
2601      1) integer arithmetic and no trapv
2602      2) floating point arithmetic, and special flags permit this optimization
2603      3) nested cycle (i.e., outer loop vectorization).  */
2604   if (TREE_CODE (op1) == SSA_NAME)
2605     def1 = SSA_NAME_DEF_STMT (op1);
2606
2607   if (TREE_CODE (op2) == SSA_NAME)
2608     def2 = SSA_NAME_DEF_STMT (op2);
2609
2610   if (code != COND_EXPR
2611       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2612     {
2613       if (dump_enabled_p ())
2614         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2615       return NULL;
2616     }
2617
2618   /* Check that one def is the reduction def, defined by PHI,
2619      the other def is either defined in the loop ("vect_internal_def"),
2620      or it's an induction (defined by a loop-header phi-node).  */
2621
2622   if (def2 && def2 == phi
2623       && (code == COND_EXPR
2624           || !def1 || gimple_nop_p (def1)
2625           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2626           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2627               && (is_gimple_assign (def1)
2628                   || is_gimple_call (def1)
2629                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2630                       == vect_induction_def
2631                   || (gimple_code (def1) == GIMPLE_PHI
2632                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2633                           == vect_internal_def
2634                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2635     {
2636       if (dump_enabled_p ())
2637         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2638       return def_stmt;
2639     }
2640
2641   if (def1 && def1 == phi
2642       && (code == COND_EXPR
2643           || !def2 || gimple_nop_p (def2)
2644           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2645           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2646               && (is_gimple_assign (def2)
2647                   || is_gimple_call (def2)
2648                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2649                       == vect_induction_def
2650                   || (gimple_code (def2) == GIMPLE_PHI
2651                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2652                           == vect_internal_def
2653                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2654     {
2655       if (check_reduction)
2656         {
2657           /* Swap operands (just for simplicity - so that the rest of the code
2658              can assume that the reduction variable is always the last (second)
2659              argument).  */
2660           if (dump_enabled_p ())
2661             report_vect_op (MSG_NOTE, def_stmt,
2662                             "detected reduction: need to swap operands: ");
2663
2664           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2665                              gimple_assign_rhs2_ptr (def_stmt));
2666
2667           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2668             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2669         }
2670       else
2671         {
2672           if (dump_enabled_p ())
2673             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2674         }
2675
2676       return def_stmt;
2677     }
2678
2679   /* Try to find SLP reduction chain.  */
2680   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2681     {
2682       if (dump_enabled_p ())
2683         report_vect_op (MSG_NOTE, def_stmt,
2684                         "reduction: detected reduction chain: ");
2685
2686       return def_stmt;
2687     }
2688
2689   if (dump_enabled_p ())
2690     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2691                     "reduction: unknown pattern: ");
2692
2693   return NULL;
2694 }
2695
2696 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2697    in-place.  Arguments as there.  */
2698
2699 static gimple
2700 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2701                           bool check_reduction, bool *double_reduc)
2702 {
2703   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2704                                      double_reduc, false);
2705 }
2706
2707 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2708    in-place if it enables detection of more reductions.  Arguments
2709    as there.  */
2710
2711 gimple
2712 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2713                           bool check_reduction, bool *double_reduc)
2714 {
2715   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2716                                      double_reduc, true);
2717 }
2718
2719 /* Calculate the cost of one scalar iteration of the loop.  */
2720 int
2721 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2722                                        stmt_vector_for_cost *scalar_cost_vec)
2723 {
2724   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2725   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2726   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2727   int innerloop_iters, i;
2728
2729   /* Count statements in scalar loop.  Using this as scalar cost for a single
2730      iteration for now.
2731
2732      TODO: Add outer loop support.
2733
2734      TODO: Consider assigning different costs to different scalar
2735      statements.  */
2736
2737   /* FORNOW.  */
2738   innerloop_iters = 1;
2739   if (loop->inner)
2740     innerloop_iters = 50; /* FIXME */
2741
2742   for (i = 0; i < nbbs; i++)
2743     {
2744       gimple_stmt_iterator si;
2745       basic_block bb = bbs[i];
2746
2747       if (bb->loop_father == loop->inner)
2748         factor = innerloop_iters;
2749       else
2750         factor = 1;
2751
2752       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2753         {
2754           gimple stmt = gsi_stmt (si);
2755           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2756
2757           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2758             continue;
2759
2760           /* Skip stmts that are not vectorized inside the loop.  */
2761           if (stmt_info
2762               && !STMT_VINFO_RELEVANT_P (stmt_info)
2763               && (!STMT_VINFO_LIVE_P (stmt_info)
2764                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2765               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2766             continue;
2767
2768           vect_cost_for_stmt kind;
2769           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2770             {
2771               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2772                kind = scalar_load;
2773              else
2774                kind = scalar_store;
2775             }
2776           else
2777             kind = scalar_stmt;
2778
2779           scalar_single_iter_cost
2780             += record_stmt_cost (scalar_cost_vec, factor, kind,
2781                                  NULL, 0, vect_prologue);
2782         }
2783     }
2784   return scalar_single_iter_cost;
2785 }
2786
2787 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2788 int
2789 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2790                              int *peel_iters_epilogue,
2791                              stmt_vector_for_cost *scalar_cost_vec,
2792                              stmt_vector_for_cost *prologue_cost_vec,
2793                              stmt_vector_for_cost *epilogue_cost_vec)
2794 {
2795   int retval = 0;
2796   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2797
2798   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2799     {
2800       *peel_iters_epilogue = vf/2;
2801       if (dump_enabled_p ())
2802         dump_printf_loc (MSG_NOTE, vect_location,
2803                          "cost model: epilogue peel iters set to vf/2 "
2804                          "because loop iterations are unknown .\n");
2805
2806       /* If peeled iterations are known but number of scalar loop
2807          iterations are unknown, count a taken branch per peeled loop.  */
2808       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2809                                  NULL, 0, vect_prologue);
2810       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2811                                  NULL, 0, vect_epilogue);
2812     }
2813   else
2814     {
2815       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2816       peel_iters_prologue = niters < peel_iters_prologue ?
2817                             niters : peel_iters_prologue;
2818       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2819       /* If we need to peel for gaps, but no peeling is required, we have to
2820          peel VF iterations.  */
2821       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2822         *peel_iters_epilogue = vf;
2823     }
2824
2825   stmt_info_for_cost *si;
2826   int j;
2827   if (peel_iters_prologue)
2828     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2829       retval += record_stmt_cost (prologue_cost_vec,
2830                                   si->count * peel_iters_prologue,
2831                                   si->kind, NULL, si->misalign,
2832                                   vect_prologue);
2833   if (*peel_iters_epilogue)
2834     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2835       retval += record_stmt_cost (epilogue_cost_vec,
2836                                   si->count * *peel_iters_epilogue,
2837                                   si->kind, NULL, si->misalign,
2838                                   vect_epilogue);
2839
2840   return retval;
2841 }
2842
2843 /* Function vect_estimate_min_profitable_iters
2844
2845    Return the number of iterations required for the vector version of the
2846    loop to be profitable relative to the cost of the scalar version of the
2847    loop.  */
2848
2849 static void
2850 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2851                                     int *ret_min_profitable_niters,
2852                                     int *ret_min_profitable_estimate)
2853 {
2854   int min_profitable_iters;
2855   int min_profitable_estimate;
2856   int peel_iters_prologue;
2857   int peel_iters_epilogue;
2858   unsigned vec_inside_cost = 0;
2859   int vec_outside_cost = 0;
2860   unsigned vec_prologue_cost = 0;
2861   unsigned vec_epilogue_cost = 0;
2862   int scalar_single_iter_cost = 0;
2863   int scalar_outside_cost = 0;
2864   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2865   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2866   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2867
2868   /* Cost model disabled.  */
2869   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2870     {
2871       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2872       *ret_min_profitable_niters = 0;
2873       *ret_min_profitable_estimate = 0;
2874       return;
2875     }
2876
2877   /* Requires loop versioning tests to handle misalignment.  */
2878   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2879     {
2880       /*  FIXME: Make cost depend on complexity of individual check.  */
2881       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2882       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2883                             vect_prologue);
2884       dump_printf (MSG_NOTE,
2885                    "cost model: Adding cost of checks for loop "
2886                    "versioning to treat misalignment.\n");
2887     }
2888
2889   /* Requires loop versioning with alias checks.  */
2890   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2891     {
2892       /*  FIXME: Make cost depend on complexity of individual check.  */
2893       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2894       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2895                             vect_prologue);
2896       dump_printf (MSG_NOTE,
2897                    "cost model: Adding cost of checks for loop "
2898                    "versioning aliasing.\n");
2899     }
2900
2901   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2902       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2903     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2904                           vect_prologue);
2905
2906   /* Count statements in scalar loop.  Using this as scalar cost for a single
2907      iteration for now.
2908
2909      TODO: Add outer loop support.
2910
2911      TODO: Consider assigning different costs to different scalar
2912      statements.  */
2913
2914   auto_vec<stmt_info_for_cost> scalar_cost_vec;
2915   scalar_single_iter_cost
2916      = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2917
2918   /* Add additional cost for the peeled instructions in prologue and epilogue
2919      loop.
2920
2921      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2922      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2923
2924      TODO: Build an expression that represents peel_iters for prologue and
2925      epilogue to be used in a run-time test.  */
2926
2927   if (npeel  < 0)
2928     {
2929       peel_iters_prologue = vf/2;
2930       dump_printf (MSG_NOTE, "cost model: "
2931                    "prologue peel iters set to vf/2.\n");
2932
2933       /* If peeling for alignment is unknown, loop bound of main loop becomes
2934          unknown.  */
2935       peel_iters_epilogue = vf/2;
2936       dump_printf (MSG_NOTE, "cost model: "
2937                    "epilogue peel iters set to vf/2 because "
2938                    "peeling for alignment is unknown.\n");
2939
2940       /* If peeled iterations are unknown, count a taken branch and a not taken
2941          branch per peeled loop. Even if scalar loop iterations are known,
2942          vector iterations are not known since peeled prologue iterations are
2943          not known. Hence guards remain the same.  */
2944       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2945                             NULL, 0, vect_prologue);
2946       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2947                             NULL, 0, vect_prologue);
2948       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2949                             NULL, 0, vect_epilogue);
2950       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2951                             NULL, 0, vect_epilogue);
2952       stmt_info_for_cost *si;
2953       int j;
2954       FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2955         {
2956           struct _stmt_vec_info *stmt_info
2957             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2958           (void) add_stmt_cost (target_cost_data,
2959                                 si->count * peel_iters_prologue,
2960                                 si->kind, stmt_info, si->misalign,
2961                                 vect_prologue);
2962           (void) add_stmt_cost (target_cost_data,
2963                                 si->count * peel_iters_epilogue,
2964                                 si->kind, stmt_info, si->misalign,
2965                                 vect_epilogue);
2966         }
2967     }
2968   else
2969     {
2970       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2971       stmt_info_for_cost *si;
2972       int j;
2973       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2974
2975       prologue_cost_vec.create (2);
2976       epilogue_cost_vec.create (2);
2977       peel_iters_prologue = npeel;
2978
2979       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2980                                           &peel_iters_epilogue,
2981                                           &scalar_cost_vec,
2982                                           &prologue_cost_vec,
2983                                           &epilogue_cost_vec);
2984
2985       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2986         {
2987           struct _stmt_vec_info *stmt_info
2988             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2989           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2990                                 si->misalign, vect_prologue);
2991         }
2992
2993       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2994         {
2995           struct _stmt_vec_info *stmt_info
2996             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2997           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2998                                 si->misalign, vect_epilogue);
2999         }
3000
3001       prologue_cost_vec.release ();
3002       epilogue_cost_vec.release ();
3003     }
3004
3005   /* FORNOW: The scalar outside cost is incremented in one of the
3006      following ways:
3007
3008      1. The vectorizer checks for alignment and aliasing and generates
3009      a condition that allows dynamic vectorization.  A cost model
3010      check is ANDED with the versioning condition.  Hence scalar code
3011      path now has the added cost of the versioning check.
3012
3013        if (cost > th & versioning_check)
3014          jmp to vector code
3015
3016      Hence run-time scalar is incremented by not-taken branch cost.
3017
3018      2. The vectorizer then checks if a prologue is required.  If the
3019      cost model check was not done before during versioning, it has to
3020      be done before the prologue check.
3021
3022        if (cost <= th)
3023          prologue = scalar_iters
3024        if (prologue == 0)
3025          jmp to vector code
3026        else
3027          execute prologue
3028        if (prologue == num_iters)
3029          go to exit
3030
3031      Hence the run-time scalar cost is incremented by a taken branch,
3032      plus a not-taken branch, plus a taken branch cost.
3033
3034      3. The vectorizer then checks if an epilogue is required.  If the
3035      cost model check was not done before during prologue check, it
3036      has to be done with the epilogue check.
3037
3038        if (prologue == 0)
3039          jmp to vector code
3040        else
3041          execute prologue
3042        if (prologue == num_iters)
3043          go to exit
3044        vector code:
3045          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3046            jmp to epilogue
3047
3048      Hence the run-time scalar cost should be incremented by 2 taken
3049      branches.
3050
3051      TODO: The back end may reorder the BBS's differently and reverse
3052      conditions/branch directions.  Change the estimates below to
3053      something more reasonable.  */
3054
3055   /* If the number of iterations is known and we do not do versioning, we can
3056      decide whether to vectorize at compile time.  Hence the scalar version
3057      do not carry cost model guard costs.  */
3058   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3059       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3060       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3061     {
3062       /* Cost model check occurs at versioning.  */
3063       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3064           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3065         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3066       else
3067         {
3068           /* Cost model check occurs at prologue generation.  */
3069           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3070             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3071               + vect_get_stmt_cost (cond_branch_not_taken);
3072           /* Cost model check occurs at epilogue generation.  */
3073           else
3074             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3075         }
3076     }
3077
3078   /* Complete the target-specific cost calculations.  */
3079   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3080                &vec_inside_cost, &vec_epilogue_cost);
3081
3082   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3083
3084   if (dump_enabled_p ())
3085     {
3086       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3087       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3088                    vec_inside_cost);
3089       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3090                    vec_prologue_cost);
3091       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3092                    vec_epilogue_cost);
3093       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3094                    scalar_single_iter_cost);
3095       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3096                    scalar_outside_cost);
3097       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3098                    vec_outside_cost);
3099       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3100                    peel_iters_prologue);
3101       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3102                    peel_iters_epilogue);
3103     }
3104
3105   /* Calculate number of iterations required to make the vector version
3106      profitable, relative to the loop bodies only.  The following condition
3107      must hold true:
3108      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3109      where
3110      SIC = scalar iteration cost, VIC = vector iteration cost,
3111      VOC = vector outside cost, VF = vectorization factor,
3112      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3113      SOC = scalar outside cost for run time cost model check.  */
3114
3115   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3116     {
3117       if (vec_outside_cost <= 0)
3118         min_profitable_iters = 1;
3119       else
3120         {
3121           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3122                                   - vec_inside_cost * peel_iters_prologue
3123                                   - vec_inside_cost * peel_iters_epilogue)
3124                                  / ((scalar_single_iter_cost * vf)
3125                                     - vec_inside_cost);
3126
3127           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3128               <= (((int) vec_inside_cost * min_profitable_iters)
3129                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3130             min_profitable_iters++;
3131         }
3132     }
3133   /* vector version will never be profitable.  */
3134   else
3135     {
3136       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3137         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3138                     "did not happen for a simd loop");
3139
3140       if (dump_enabled_p ())
3141         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142                          "cost model: the vector iteration cost = %d "
3143                          "divided by the scalar iteration cost = %d "
3144                          "is greater or equal to the vectorization factor = %d"
3145                          ".\n",
3146                          vec_inside_cost, scalar_single_iter_cost, vf);
3147       *ret_min_profitable_niters = -1;
3148       *ret_min_profitable_estimate = -1;
3149       return;
3150     }
3151
3152   dump_printf (MSG_NOTE,
3153                "  Calculated minimum iters for profitability: %d\n",
3154                min_profitable_iters);
3155
3156   min_profitable_iters =
3157         min_profitable_iters < vf ? vf : min_profitable_iters;
3158
3159   /* Because the condition we create is:
3160      if (niters <= min_profitable_iters)
3161        then skip the vectorized loop.  */
3162   min_profitable_iters--;
3163
3164   if (dump_enabled_p ())
3165     dump_printf_loc (MSG_NOTE, vect_location,
3166                      "  Runtime profitability threshold = %d\n",
3167                      min_profitable_iters);
3168
3169   *ret_min_profitable_niters = min_profitable_iters;
3170
3171   /* Calculate number of iterations required to make the vector version
3172      profitable, relative to the loop bodies only.
3173
3174      Non-vectorized variant is SIC * niters and it must win over vector
3175      variant on the expected loop trip count.  The following condition must hold true:
3176      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3177
3178   if (vec_outside_cost <= 0)
3179     min_profitable_estimate = 1;
3180   else
3181     {
3182       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3183                                  - vec_inside_cost * peel_iters_prologue
3184                                  - vec_inside_cost * peel_iters_epilogue)
3185                                  / ((scalar_single_iter_cost * vf)
3186                                    - vec_inside_cost);
3187     }
3188   min_profitable_estimate --;
3189   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3190   if (dump_enabled_p ())
3191     dump_printf_loc (MSG_NOTE, vect_location,
3192                      "  Static estimate profitability threshold = %d\n",
3193                       min_profitable_iters);
3194
3195   *ret_min_profitable_estimate = min_profitable_estimate;
3196 }
3197
3198 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3199    vector elements (not bits) for a vector of mode MODE.  */
3200 static void
3201 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3202                               unsigned char *sel)
3203 {
3204   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3205
3206   for (i = 0; i < nelt; i++)
3207     sel[i] = (i + offset) & (2*nelt - 1);
3208 }
3209
3210 /* Checks whether the target supports whole-vector shifts for vectors of mode
3211    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3212    it supports vec_perm_const with masks for all necessary shift amounts.  */
3213 static bool
3214 have_whole_vector_shift (enum machine_mode mode)
3215 {
3216   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3217     return true;
3218
3219   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3220     return false;
3221
3222   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3223   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3224
3225   for (i = nelt/2; i >= 1; i/=2)
3226     {
3227       calc_vec_perm_mask_for_shift (mode, i, sel);
3228       if (!can_vec_perm_p (mode, false, sel))
3229         return false;
3230     }
3231   return true;
3232 }
3233
3234 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3235
3236 static tree
3237 get_reduction_op (gimple stmt, int reduc_index)
3238 {
3239   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3240     {
3241     case GIMPLE_SINGLE_RHS:
3242       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3243                   == ternary_op);
3244       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3245     case GIMPLE_UNARY_RHS:
3246       return gimple_assign_rhs1 (stmt);
3247     case GIMPLE_BINARY_RHS:
3248       return (reduc_index
3249               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3250     case GIMPLE_TERNARY_RHS:
3251       return gimple_op (stmt, reduc_index + 1);
3252     default:
3253       gcc_unreachable ();
3254     }
3255 }
3256
3257 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3258    functions. Design better to avoid maintenance issues.  */
3259
3260 /* Function vect_model_reduction_cost.
3261
3262    Models cost for a reduction operation, including the vector ops
3263    generated within the strip-mine loop, the initial definition before
3264    the loop, and the epilogue code that must be generated.  */
3265
3266 static bool
3267 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3268                            int ncopies, int reduc_index)
3269 {
3270   int prologue_cost = 0, epilogue_cost = 0;
3271   enum tree_code code;
3272   optab optab;
3273   tree vectype;
3274   gimple stmt, orig_stmt;
3275   tree reduction_op;
3276   machine_mode mode;
3277   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3278   struct loop *loop = NULL;
3279   void *target_cost_data;
3280
3281   if (loop_vinfo)
3282     {
3283       loop = LOOP_VINFO_LOOP (loop_vinfo);
3284       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3285     }
3286   else
3287     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3288
3289   /* Cost of reduction op inside loop.  */
3290   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3291                                         stmt_info, 0, vect_body);
3292   stmt = STMT_VINFO_STMT (stmt_info);
3293
3294   reduction_op = get_reduction_op (stmt, reduc_index);
3295
3296   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3297   if (!vectype)
3298     {
3299       if (dump_enabled_p ())
3300         {
3301           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3302                            "unsupported data-type ");
3303           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3304                              TREE_TYPE (reduction_op));
3305           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3306         }
3307       return false;
3308    }
3309
3310   mode = TYPE_MODE (vectype);
3311   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3312
3313   if (!orig_stmt)
3314     orig_stmt = STMT_VINFO_STMT (stmt_info);
3315
3316   code = gimple_assign_rhs_code (orig_stmt);
3317
3318   /* Add in cost for initial definition.  */
3319   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3320                                   stmt_info, 0, vect_prologue);
3321
3322   /* Determine cost of epilogue code.
3323
3324      We have a reduction operator that will reduce the vector in one statement.
3325      Also requires scalar extract.  */
3326
3327   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3328     {
3329       if (reduc_code != ERROR_MARK)
3330         {
3331           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3332                                           stmt_info, 0, vect_epilogue);
3333           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3334                                           stmt_info, 0, vect_epilogue);
3335         }
3336       else
3337         {
3338           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3339           tree bitsize =
3340             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3341           int element_bitsize = tree_to_uhwi (bitsize);
3342           int nelements = vec_size_in_bits / element_bitsize;
3343
3344           optab = optab_for_tree_code (code, vectype, optab_default);
3345
3346           /* We have a whole vector shift available.  */
3347           if (VECTOR_MODE_P (mode)
3348               && optab_handler (optab, mode) != CODE_FOR_nothing
3349               && have_whole_vector_shift (mode))
3350             {
3351               /* Final reduction via vector shifts and the reduction operator.
3352                  Also requires scalar extract.  */
3353               epilogue_cost += add_stmt_cost (target_cost_data,
3354                                               exact_log2 (nelements) * 2,
3355                                               vector_stmt, stmt_info, 0,
3356                                               vect_epilogue);
3357               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3358                                               vec_to_scalar, stmt_info, 0,
3359                                               vect_epilogue);
3360             }
3361           else
3362             /* Use extracts and reduction op for final reduction.  For N
3363                elements, we have N extracts and N-1 reduction ops.  */
3364             epilogue_cost += add_stmt_cost (target_cost_data,
3365                                             nelements + nelements - 1,
3366                                             vector_stmt, stmt_info, 0,
3367                                             vect_epilogue);
3368         }
3369     }
3370
3371   if (dump_enabled_p ())
3372     dump_printf (MSG_NOTE,
3373                  "vect_model_reduction_cost: inside_cost = %d, "
3374                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3375                  prologue_cost, epilogue_cost);
3376
3377   return true;
3378 }
3379
3380
3381 /* Function vect_model_induction_cost.
3382
3383    Models cost for induction operations.  */
3384
3385 static void
3386 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3387 {
3388   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3389   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3390   unsigned inside_cost, prologue_cost;
3391
3392   /* loop cost for vec_loop.  */
3393   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3394                                stmt_info, 0, vect_body);
3395
3396   /* prologue cost for vec_init and vec_step.  */
3397   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3398                                  stmt_info, 0, vect_prologue);
3399
3400   if (dump_enabled_p ())
3401     dump_printf_loc (MSG_NOTE, vect_location,
3402                      "vect_model_induction_cost: inside_cost = %d, "
3403                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3404 }
3405
3406
3407 /* Function get_initial_def_for_induction
3408
3409    Input:
3410    IV_PHI - the loop-header phi of the scalar induction variable.  Its
3411    preheader argument is the initial value X; its evolution part is S.
3412
3413    Output:
3414    Create the vector induction (vec_init, vec_step, a vector phi and its
3415    update stmt) and return the vector phi result, view-converted to the
3416    vector type of IV_PHI's result if needed.  For 4 units, vec_init is
3417    [X, X + S, X + 2*S, X + 3*S].  */
3418
3419 static tree
3420 get_initial_def_for_induction (gimple iv_phi)
3421 {
3422   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3423   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3424   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3425   tree vectype;
3426   int nunits;
3427   edge pe = loop_preheader_edge (loop);
3428   struct loop *iv_loop;
3429   basic_block new_bb;
3430   tree new_vec, vec_init, vec_step, t;
3431   tree new_var;
3432   tree new_name;
3433   gimple init_stmt, new_stmt;
3434   gphi *induction_phi;
3435   tree induc_def, vec_def, vec_dest;
3436   tree init_expr, step_expr;
3437   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3438   int i;
3439   int ncopies;
3440   tree expr;
       /* Obtained with the same vinfo_for_stmt (iv_phi) call as STMT_VINFO
          above.  */
3441   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3442   bool nested_in_vect_loop = false;
3443   gimple_seq stmts = NULL;
3444   imm_use_iterator imm_iter;
3445   use_operand_p use_p;
3446   gimple exit_phi;
3447   edge latch_e;
3448   tree loop_arg;
3449   gimple_stmt_iterator si;
3450   basic_block bb = gimple_bb (iv_phi);
3451   tree stepvectype;
3452   tree resvectype;
3453
3454   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3455   if (nested_in_vect_loop_p (loop, iv_phi))
3456     {
3457       nested_in_vect_loop = true;
3458       iv_loop = loop->inner;
3459     }
3460   else
3461     iv_loop = loop;
3462   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3463
       /* LOOP_ARG is the value IV_PHI receives over the latch edge.  It is
          only used further down to look for an exit phi in the nested
          case.  */
3464   latch_e = loop_latch_edge (iv_loop);
3465   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3466
3467   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3468   gcc_assert (step_expr != NULL_TREE);
3469
       /* From here on PE is the preheader edge of IV_LOOP; in the nested
          case that is the inner loop's preheader rather than the edge
          computed at the declaration of PE.  */
3470   pe = loop_preheader_edge (iv_loop);
3471   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3472                                      loop_preheader_edge (iv_loop));
3473
       /* VECTYPE is derived from the type of the initial value and
          RESVECTYPE from the type of the phi result; a conversion is
          emitted below wherever the two differ.  */
3474   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3475   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3476   gcc_assert (vectype);
3477   nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Number of vector stmts needed to cover VF scalar iterations.  */
3478   ncopies = vf / nunits;
3479
       /* NOTE(review): PHI_INFO has already been passed to
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART above, so this check
          presumably cannot catch a null PHI_INFO before it is used.  */
3480   gcc_assert (phi_info);
3481   gcc_assert (ncopies >= 1);
3482
3483   /* Convert the step to the desired type.  */
3484   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3485                                                   step_expr),
3486                                     &stmts, true, NULL_TREE);
3487   if (stmts)
3488     {
3489       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3490       gcc_assert (!new_bb);
3491     }
3492
3493   /* Find the first insertion point in the BB.  */
3494   si = gsi_after_labels (bb);
3495
3496   /* Create the vector that holds the initial_value of the induction.  */
3497   if (nested_in_vect_loop)
3498     {
3499       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3500          been created during vectorization of previous stmts.  We obtain it
3501          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3502       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3503       /* If the initial value is not of proper type, convert it.  */
3504       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3505         {
3506           new_stmt
3507             = gimple_build_assign (vect_get_new_vect_var (vectype,
3508                                                           vect_simple_var,
3509                                                           "vec_iv_"),
3510                                    VIEW_CONVERT_EXPR,
3511                                    build1 (VIEW_CONVERT_EXPR, vectype,
3512                                            vec_init));
3513           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3514           gimple_assign_set_lhs (new_stmt, vec_init);
3515           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3516                                                  new_stmt);
3517           gcc_assert (!new_bb);
3518           set_vinfo_for_stmt (new_stmt,
3519                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3520         }
3521     }
3522   else
3523     {
3524       vec<constructor_elt, va_gc> *v;
3525
3526       /* iv_loop is the loop to be vectorized. Create:
3527          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3528       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3529                                        vect_scalar_var, "var_");
3530       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3531                                                      init_expr),
3532                                        &stmts, false, new_var);
3533       if (stmts)
3534         {
3535           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3536           gcc_assert (!new_bb);
3537         }
3538
3539       vec_alloc (v, nunits);
           /* CONSTANT_P stays true only while every element collected so
              far is a gimple min-invariant; it selects between a constant
              vector and a CONSTRUCTOR below.  */
3540       bool constant_p = is_gimple_min_invariant (new_name);
3541       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3542       for (i = 1; i < nunits; i++)
3543         {
3544           /* Create: new_name_i = new_name + step_expr  */
3545           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3546                                   new_name, step_expr);
3547           if (!is_gimple_min_invariant (new_name))
3548             {
                   /* The sum did not fold to an invariant: compute it in a
                      new stmt on the preheader edge.  */
3549               init_stmt = gimple_build_assign (new_var, new_name);
3550               new_name = make_ssa_name (new_var, init_stmt);
3551               gimple_assign_set_lhs (init_stmt, new_name);
3552               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3553               gcc_assert (!new_bb);
3554               if (dump_enabled_p ())
3555                 {
3556                   dump_printf_loc (MSG_NOTE, vect_location,
3557                                    "created new init_stmt: ");
3558                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3559                   dump_printf (MSG_NOTE, "\n");
3560                 }
3561               constant_p = false;
3562             }
3563           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3564         }
3565       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3566       if (constant_p)
3567         new_vec = build_vector_from_ctor (vectype, v);
3568       else
3569         new_vec = build_constructor (vectype, v);
3570       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3571     }
3572
3573
3574   /* Create the vector that holds the step of the induction.  */
3575   if (nested_in_vect_loop)
3576     /* iv_loop is nested in the loop to be vectorized. Generate:
3577        vec_step = [S, S, S, S]  */
3578     new_name = step_expr;
3579   else
3580     {
3581       /* iv_loop is the loop to be vectorized. Generate:
3582           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3583       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3584         {
               /* For a floating-point step, build VF as an integer constant
                  and convert it to the step type.  */
3585           expr = build_int_cst (integer_type_node, vf);
3586           expr = fold_convert (TREE_TYPE (step_expr), expr);
3587         }
3588       else
3589         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3590       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3591                               expr, step_expr);
3592       if (TREE_CODE (step_expr) == SSA_NAME)
3593         new_name = vect_init_vector (iv_phi, new_name,
3594                                      TREE_TYPE (step_expr), NULL);
3595     }
3596
3597   t = unshare_expr (new_name);
3598   gcc_assert (CONSTANT_CLASS_P (new_name)
3599               || TREE_CODE (new_name) == SSA_NAME);
3600   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3601   gcc_assert (stepvectype);
3602   new_vec = build_vector_from_val (stepvectype, t);
3603   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3604
3605
3606   /* Create the following def-use cycle:
3607      loop prolog:
3608          vec_init = ...
3609          vec_step = ...
3610      loop:
3611          vec_iv = PHI <vec_init, vec_loop>
3612          ...
3613          STMT
3614          ...
3615          vec_loop = vec_iv + vec_step;  */
3616
3617   /* Create the induction-phi that defines the induction-operand.  */
3618   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3619   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3620   set_vinfo_for_stmt (induction_phi,
3621                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3622   induc_def = PHI_RESULT (induction_phi);
3623
3624   /* Create the iv update inside the loop  */
3625   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3626   vec_def = make_ssa_name (vec_dest, new_stmt);
3627   gimple_assign_set_lhs (new_stmt, vec_def);
       /* SI was set to the position after the labels of IV_PHI's block.  */
3628   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3629   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3630                                                    NULL));
3631
3632   /* Set the arguments of the phi node:  */
3633   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3634   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3635                UNKNOWN_LOCATION);
3636
3637
3638   /* In case that vectorization factor (VF) is bigger than the number
3639      of elements that we can fit in a vectype (nunits), we have to generate
3640      more than one vector stmt - i.e - we need to "unroll" the
3641      vector stmt by a factor VF/nunits.  For more details see documentation
3642      in vectorizable_operation.  */
3643
3644   if (ncopies > 1)
3645     {
3646       stmt_vec_info prev_stmt_vinfo;
3647       /* FORNOW. This restriction should be relaxed.  */
3648       gcc_assert (!nested_in_vect_loop);
3649
3650       /* Create the vector that holds the step of the induction.  */
           /* Unlike the VF*S step built above, this one is
              [nunits*S, ..., nunits*S]: the distance between two
              consecutive copies.  */
3651       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3652         {
3653           expr = build_int_cst (integer_type_node, nunits);
3654           expr = fold_convert (TREE_TYPE (step_expr), expr);
3655         }
3656       else
3657         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3658       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3659                               expr, step_expr);
3660       if (TREE_CODE (step_expr) == SSA_NAME)
3661         new_name = vect_init_vector (iv_phi, new_name,
3662                                      TREE_TYPE (step_expr), NULL);
3663       t = unshare_expr (new_name);
3664       gcc_assert (CONSTANT_CLASS_P (new_name)
3665                   || TREE_CODE (new_name) == SSA_NAME);
3666       new_vec = build_vector_from_val (stepvectype, t);
3667       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3668
           /* The first copy is the phi result itself; each further copy
              adds the per-copy step to the previous one.  */
3669       vec_def = induc_def;
3670       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3671       for (i = 1; i < ncopies; i++)
3672         {
3673           /* vec_i = vec_prev + vec_step  */
3674           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3675                                           vec_def, vec_step);
3676           vec_def = make_ssa_name (vec_dest, new_stmt);
3677           gimple_assign_set_lhs (new_stmt, vec_def);
3678
3679           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3680           if (!useless_type_conversion_p (resvectype, vectype))
3681             {
                   /* Emit a view-convert of this copy to RESVECTYPE.
                      NEW_STMT now names the conversion, while VEC_DEF
                      (the unconverted sum) still feeds the next copy.  */
3682               new_stmt
3683                 = gimple_build_assign
3684                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3685                                                 "vec_iv_"),
3686                          VIEW_CONVERT_EXPR,
3687                          build1 (VIEW_CONVERT_EXPR, resvectype,
3688                                  gimple_assign_lhs (new_stmt)));
3689               gimple_assign_set_lhs (new_stmt,
3690                                      make_ssa_name
3691                                        (gimple_assign_lhs (new_stmt), new_stmt));
3692               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3693             }
3694           set_vinfo_for_stmt (new_stmt,
3695                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
               /* Chain the copies: the previous copy's RELATED_STMT points
                  to this one, starting from the induction phi.  */
3696           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3697           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3698         }
3699     }
3700
3701   if (nested_in_vect_loop)
3702     {
3703       /* Find the loop-closed exit-phi of the induction, and record
3704          the final vector of induction results:  */
3705       exit_phi = NULL;
3706       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3707         {
3708           gimple use_stmt = USE_STMT (use_p);
3709           if (is_gimple_debug (use_stmt))
3710             continue;
3711
3712           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3713             {
3714               exit_phi = use_stmt;
3715               break;
3716             }
3717         }
3718       if (exit_phi)
3719         {
               /* This declaration shadows the function-level STMT_VINFO.  */
3720           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3721           /* FORNOW. Currently not supporting the case that an inner-loop induction
3722              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3723           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3724                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3725
               /* NEW_STMT is still the iv update stmt created above: the
                  NCOPIES > 1 path asserts that it is never taken for a
                  nested induction.  */
3726           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3727           if (dump_enabled_p ())
3728             {
3729               dump_printf_loc (MSG_NOTE, vect_location,
3730                                "vector of inductions after inner-loop:");
3731               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3732               dump_printf (MSG_NOTE, "\n");
3733             }
3734         }
3735     }
3736
3737
3738   if (dump_enabled_p ())
3739     {
3740       dump_printf_loc (MSG_NOTE, vect_location,
3741                        "transform induction: created def-use cycle: ");
3742       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3743       dump_printf (MSG_NOTE, "\n");
3744       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3745                         SSA_NAME_DEF_STMT (vec_def), 0);
3746       dump_printf (MSG_NOTE, "\n");
3747     }
3748
3749   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
       /* If the vector type of the phi result differs from VECTYPE, return
          a view-convert of the vector phi result instead.  The conversion
          is inserted after the labels of BB and takes over the phi's
          RELATED_STMT link.  */
3750   if (!useless_type_conversion_p (resvectype, vectype))
3751     {
3752       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3753                                                              vect_simple_var,
3754                                                              "vec_iv_"),
3755                                       VIEW_CONVERT_EXPR,
3756                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3757                                               induc_def));
3758       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3759       gimple_assign_set_lhs (new_stmt, induc_def);
3760       si = gsi_after_labels (bb);
3761       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3762       set_vinfo_for_stmt (new_stmt,
3763                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3764       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3765         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3766     }
3767
3768   return induc_def;
3769 }
3770
3771
3772 /* Function get_initial_def_for_reduction
3773
3774    Input:
3775    STMT - a stmt that performs a reduction operation in the loop.
3776    INIT_VAL - the initial value of the reduction variable
3777
3778    Output:
3779    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3780         of the reduction (used for adjusting the epilog - see below).
3781    Return a vector variable, initialized according to the operation that STMT
3782         performs. This vector will be used as the initial value of the
3783         vector of partial results.
3784
3785    Option1 (adjust in epilog): Initialize the vector as follows:
3786      add/bit or/xor:    [0,0,...,0,0]
3787      mult/bit and:      [1,1,...,1,1]
3788      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3789    and when necessary (e.g. add/mult case) let the caller know
3790    that it needs to adjust the result by init_val.
3791
3792    Option2: Initialize the vector as follows:
3793      add/bit or/xor:    [init_val,0,0,...,0]
3794      mult/bit and:      [init_val,1,1,...,1]
3795      min/max/cond_expr: [init_val,init_val,...,init_val]
3796    and no adjustments are needed.
3797
3798    For example, for the following code:
3799
3800    s = init_val;
3801    for (i=0;i<n;i++)
3802      s = s + a[i];
3803
3804    STMT is 's = s + a[i]', and the reduction variable is 's'.
3805    For a vector of 4 units, we want to return either [0,0,0,init_val],
3806    or [0,0,0,0] and let the caller know that it needs to adjust
3807    the result at the end by 'init_val'.
3808
3809    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3810    initialization vector is simpler (same element in all entries), if
3811    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3812
3813    A cost model should help decide between these two schemes.  */
3814
3815 tree
3816 get_initial_def_for_reduction (gimple stmt, tree init_val,
3817                                tree *adjustment_def)
3818 {
3819   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3820   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3821   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3822   tree scalar_type = TREE_TYPE (init_val);
3823   tree vectype = get_vectype_for_scalar_type (scalar_type);
3824   int nunits;
3825   enum tree_code code = gimple_assign_rhs_code (stmt);
3826   tree def_for_init;
3827   tree init_def;
3828   tree *elts;
3829   int i;
3830   bool nested_in_vect_loop = false;
3831   tree init_value;
3832   REAL_VALUE_TYPE real_init_val = dconst0;
3833   int int_init_val = 0;
3834   gimple def_stmt = NULL;
3835
3836   gcc_assert (vectype);
3837   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3838
3839   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3840               || SCALAR_FLOAT_TYPE_P (scalar_type));
3841
3842   if (nested_in_vect_loop_p (loop, stmt))
3843     nested_in_vect_loop = true;
3844   else
3845     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3846
3847   /* In case of double reduction we only create a vector variable to be put
3848      in the reduction phi node.  The actual statement creation is done in
3849      vect_create_epilog_for_reduction.  */
3850   if (adjustment_def && nested_in_vect_loop
3851       && TREE_CODE (init_val) == SSA_NAME
3852       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3853       && gimple_code (def_stmt) == GIMPLE_PHI
3854       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3855       && vinfo_for_stmt (def_stmt)
3856       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3857           == vect_double_reduction_def)
3858     {
3859       *adjustment_def = NULL;
3860       return vect_create_destination_var (init_val, vectype);
3861     }
3862
3863   if (TREE_CONSTANT (init_val))
3864     {
3865       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3866         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3867       else
3868         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3869     }
3870   else
3871     init_value = init_val;
3872
3873   switch (code)
3874     {
3875       case WIDEN_SUM_EXPR:
3876       case DOT_PROD_EXPR:
3877       case SAD_EXPR:
3878       case PLUS_EXPR:
3879       case MINUS_EXPR:
3880       case BIT_IOR_EXPR:
3881       case BIT_XOR_EXPR:
3882       case MULT_EXPR:
3883       case BIT_AND_EXPR:
3884         /* ADJUSMENT_DEF is NULL when called from
3885            vect_create_epilog_for_reduction to vectorize double reduction.  */
3886         if (adjustment_def)
3887           {
3888             if (nested_in_vect_loop)
3889               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3890                                                               NULL);
3891             else
3892               *adjustment_def = init_val;
3893           }
3894
3895         if (code == MULT_EXPR)
3896           {
3897             real_init_val = dconst1;
3898             int_init_val = 1;
3899           }
3900
3901         if (code == BIT_AND_EXPR)
3902           int_init_val = -1;
3903
3904         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3905           def_for_init = build_real (scalar_type, real_init_val);
3906         else
3907           def_for_init = build_int_cst (scalar_type, int_init_val);
3908
3909         /* Create a vector of '0' or '1' except the first element.  */
3910         elts = XALLOCAVEC (tree, nunits);
3911         for (i = nunits - 2; i >= 0; --i)
3912           elts[i + 1] = def_for_init;
3913
3914         /* Option1: the first element is '0' or '1' as well.  */
3915         if (adjustment_def)
3916           {
3917             elts[0] = def_for_init;
3918             init_def = build_vector (vectype, elts);
3919             break;
3920           }
3921
3922         /* Option2: the first element is INIT_VAL.  */
3923         elts[0] = init_val;
3924         if (TREE_CONSTANT (init_val))
3925           init_def = build_vector (vectype, elts);
3926         else
3927           {
3928             vec<constructor_elt, va_gc> *v;
3929             vec_alloc (v, nunits);
3930             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3931             for (i = 1; i < nunits; ++i)
3932               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3933             init_def = build_constructor (vectype, v);
3934           }
3935
3936         break;
3937
3938       case MIN_EXPR:
3939       case MAX_EXPR:
3940       case COND_EXPR:
3941         if (adjustment_def)
3942           {
3943             *adjustment_def = NULL_TREE;
3944             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3945             break;
3946           }
3947
3948         init_def = build_vector_from_val (vectype, init_value);
3949         break;
3950
3951       default:
3952         gcc_unreachable ();
3953     }
3954
3955   return init_def;
3956 }
3957
3958 /* Function vect_create_epilog_for_reduction
3959
3960    Create code at the loop-epilog to finalize the result of a reduction
3961    computation.
3962
3963    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3964      reduction statements.
3965    STMT is the scalar reduction stmt that is being vectorized.
3966    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3967      number of elements that we can fit in a vectype (nunits).  In this case
3968      we have to generate more than one vector stmt - i.e - we need to "unroll"
3969      the vector stmt by a factor VF/nunits.  For more details see documentation
3970      in vectorizable_operation.
3971    REDUC_CODE is the tree-code for the epilog reduction.
3972    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3973      computation.
3974    REDUC_INDEX is the index of the operand in the right hand side of the
3975      statement that is defined by REDUCTION_PHI.
3976    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3977    SLP_NODE is an SLP node containing a group of reduction statements. The
3978      first one in this group is STMT.
3979
3980    This function:
3981    1. Creates the reduction def-use cycles: sets the arguments for
3982       REDUCTION_PHIS:
3983       The loop-entry argument is the vectorized initial-value of the reduction.
3984       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3985       sums.
3986    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3987       by applying the operation specified by REDUC_CODE if available, or by
3988       other means (whole-vector shifts or a scalar loop).
3989       The function also creates a new phi node at the loop exit to preserve
3990       loop-closed form, as illustrated below.
3991
3992      The flow at the entry to this function:
3993
3994         loop:
3995           vec_def = phi <null, null>            # REDUCTION_PHI
3996           VECT_DEF = vector_stmt                # vectorized form of STMT
3997           s_loop = scalar_stmt                  # (scalar) STMT
3998         loop_exit:
3999           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4000           use <s_out0>
4001           use <s_out0>
4002
4003      The above is transformed by this function into:
4004
4005         loop:
4006           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4007           VECT_DEF = vector_stmt                # vectorized form of STMT
4008           s_loop = scalar_stmt                  # (scalar) STMT
4009         loop_exit:
4010           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4011           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4012           v_out2 = reduce <v_out1>
4013           s_out3 = extract_field <v_out2, 0>
4014           s_out4 = adjust_result <s_out3>
4015           use <s_out4>
4016           use <s_out4>
4017 */
4018
4019 static void
4020 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4021                                   int ncopies, enum tree_code reduc_code,
4022                                   vec<gimple> reduction_phis,
4023                                   int reduc_index, bool double_reduc,
4024                                   slp_tree slp_node)
4025 {
4026   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4027   stmt_vec_info prev_phi_info;
4028   tree vectype;
4029   machine_mode mode;
4030   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4031   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4032   basic_block exit_bb;
4033   tree scalar_dest;
4034   tree scalar_type;
4035   gimple new_phi = NULL, phi;
4036   gimple_stmt_iterator exit_gsi;
4037   tree vec_dest;
4038   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4039   gimple epilog_stmt = NULL;
4040   enum tree_code code = gimple_assign_rhs_code (stmt);
4041   gimple exit_phi;
4042   tree bitsize;
4043   tree adjustment_def = NULL;
4044   tree vec_initial_def = NULL;
4045   tree reduction_op, expr, def;
4046   tree orig_name, scalar_result;
4047   imm_use_iterator imm_iter, phi_imm_iter;
4048   use_operand_p use_p, phi_use_p;
4049   gimple use_stmt, orig_stmt, reduction_phi = NULL;
4050   bool nested_in_vect_loop = false;
4051   auto_vec<gimple> new_phis;
4052   auto_vec<gimple> inner_phis;
4053   enum vect_def_type dt = vect_unknown_def_type;
4054   int j, i;
4055   auto_vec<tree> scalar_results;
4056   unsigned int group_size = 1, k, ratio;
4057   auto_vec<tree> vec_initial_defs;
4058   auto_vec<gimple> phis;
4059   bool slp_reduc = false;
4060   tree new_phi_result;
4061   gimple inner_phi = NULL;
4062
4063   if (slp_node)
4064     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4065
4066   if (nested_in_vect_loop_p (loop, stmt))
4067     {
4068       outer_loop = loop;
4069       loop = loop->inner;
4070       nested_in_vect_loop = true;
4071       gcc_assert (!slp_node);
4072     }
4073
4074   reduction_op = get_reduction_op (stmt, reduc_index);
4075
4076   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4077   gcc_assert (vectype);
4078   mode = TYPE_MODE (vectype);
4079
4080   /* 1. Create the reduction def-use cycle:
4081      Set the arguments of REDUCTION_PHIS, i.e., transform
4082
4083         loop:
4084           vec_def = phi <null, null>            # REDUCTION_PHI
4085           VECT_DEF = vector_stmt                # vectorized form of STMT
4086           ...
4087
4088      into:
4089
4090         loop:
4091           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4092           VECT_DEF = vector_stmt                # vectorized form of STMT
4093           ...
4094
4095      (in case of SLP, do it for all the phis). */
4096
4097   /* Get the loop-entry arguments.  */
4098   if (slp_node)
4099     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4100                        NULL, slp_node, reduc_index);
4101   else
4102     {
4103       vec_initial_defs.create (1);
4104      /* For the case of reduction, vect_get_vec_def_for_operand returns
4105         the scalar def before the loop, that defines the initial value
4106         of the reduction variable.  */
4107       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4108                                                       &adjustment_def);
4109       vec_initial_defs.quick_push (vec_initial_def);
4110     }
4111
4112   /* Set phi nodes arguments.  */
4113   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4114     {
4115       tree vec_init_def, def;
4116       gimple_seq stmts;
4117       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4118                                            true, NULL_TREE);
4119       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4120       def = vect_defs[i];
4121       for (j = 0; j < ncopies; j++)
4122         {
4123           /* Set the loop-entry arg of the reduction-phi.  */
4124           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4125                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4126
4127           /* Set the loop-latch arg for the reduction-phi.  */
4128           if (j > 0)
4129             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4130
4131           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4132                        UNKNOWN_LOCATION);
4133
4134           if (dump_enabled_p ())
4135             {
4136               dump_printf_loc (MSG_NOTE, vect_location,
4137                                "transform reduction: created def-use cycle: ");
4138               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4139               dump_printf (MSG_NOTE, "\n");
4140               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4141               dump_printf (MSG_NOTE, "\n");
4142             }
4143
4144           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4145         }
4146     }
4147
4148   /* 2. Create epilog code.
4149         The reduction epilog code operates across the elements of the vector
4150         of partial results computed by the vectorized loop.
4151         The reduction epilog code consists of:
4152
4153         step 1: compute the scalar result in a vector (v_out2)
4154         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4155         step 3: adjust the scalar result (s_out3) if needed.
4156
4157         Step 1 can be accomplished using one of the following three schemes:
4158           (scheme 1) using reduc_code, if available.
4159           (scheme 2) using whole-vector shifts, if available.
4160           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4161                      combined.
4162
4163           The overall epilog code looks like this:
4164
4165           s_out0 = phi <s_loop>         # original EXIT_PHI
4166           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4167           v_out2 = reduce <v_out1>              # step 1
4168           s_out3 = extract_field <v_out2, 0>    # step 2
4169           s_out4 = adjust_result <s_out3>       # step 3
4170
4171           (step 3 is optional, and steps 1 and 2 may be combined).
4172           Lastly, the uses of s_out0 are replaced by s_out4.  */
4173
4174
4175   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4176          v_out1 = phi <VECT_DEF>
4177          Store them in NEW_PHIS.  */
4178
4179   exit_bb = single_exit (loop)->dest;
4180   prev_phi_info = NULL;
4181   new_phis.create (vect_defs.length ());
4182   FOR_EACH_VEC_ELT (vect_defs, i, def)
4183     {
4184       for (j = 0; j < ncopies; j++)
4185         {
4186           tree new_def = copy_ssa_name (def);
4187           phi = create_phi_node (new_def, exit_bb);
4188           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4189           if (j == 0)
4190             new_phis.quick_push (phi);
4191           else
4192             {
4193               def = vect_get_vec_def_for_stmt_copy (dt, def);
4194               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4195             }
4196
4197           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4198           prev_phi_info = vinfo_for_stmt (phi);
4199         }
4200     }
4201
4202   /* The epilogue is created for the outer-loop, i.e., for the loop being
4203      vectorized.  Create exit phis for the outer loop.  */
4204   if (double_reduc)
4205     {
4206       loop = outer_loop;
4207       exit_bb = single_exit (loop)->dest;
4208       inner_phis.create (vect_defs.length ());
4209       FOR_EACH_VEC_ELT (new_phis, i, phi)
4210         {
4211           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4212           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4213           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4214                            PHI_RESULT (phi));
4215           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4216                                                             loop_vinfo, NULL));
4217           inner_phis.quick_push (phi);
4218           new_phis[i] = outer_phi;
4219           prev_phi_info = vinfo_for_stmt (outer_phi);
4220           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4221             {
4222               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4223               new_result = copy_ssa_name (PHI_RESULT (phi));
4224               outer_phi = create_phi_node (new_result, exit_bb);
4225               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4226                                PHI_RESULT (phi));
4227               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4228                                                         loop_vinfo, NULL));
4229               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4230               prev_phi_info = vinfo_for_stmt (outer_phi);
4231             }
4232         }
4233     }
4234
4235   exit_gsi = gsi_after_labels (exit_bb);
4236
4237   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4238          (i.e. when reduc_code is not available) and in the final adjustment
4239          code (if needed).  Also get the original scalar reduction variable as
4240          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4241          represents a reduction pattern), the tree-code and scalar-def are
4242          taken from the original stmt that the pattern-stmt (STMT) replaces.
4243          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4244          are taken from STMT.  */
4245
4246   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4247   if (!orig_stmt)
4248     {
4249       /* Regular reduction  */
4250       orig_stmt = stmt;
4251     }
4252   else
4253     {
4254       /* Reduction pattern  */
4255       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4256       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4257       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4258     }
4259
4260   code = gimple_assign_rhs_code (orig_stmt);
4261   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4262      partial results are added and not subtracted.  */
4263   if (code == MINUS_EXPR)
4264     code = PLUS_EXPR;
4265
4266   scalar_dest = gimple_assign_lhs (orig_stmt);
4267   scalar_type = TREE_TYPE (scalar_dest);
4268   scalar_results.create (group_size);
4269   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4270   bitsize = TYPE_SIZE (scalar_type);
4271
4272   /* In case this is a reduction in an inner-loop while vectorizing an outer
4273      loop - we don't need to extract a single scalar result at the end of the
4274      inner-loop (unless it is double reduction, i.e., the use of reduction is
4275      outside the outer-loop).  The final vector of partial results will be used
4276      in the vectorized outer-loop, or reduced to a scalar result at the end of
4277      the outer-loop.  */
4278   if (nested_in_vect_loop && !double_reduc)
4279     goto vect_finalize_reduction;
4280
4281   /* SLP reduction without reduction chain, e.g.,
4282      # a1 = phi <a2, a0>
4283      # b1 = phi <b2, b0>
4284      a2 = operation (a1)
4285      b2 = operation (b1)  */
4286   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4287
4288   /* In case of reduction chain, e.g.,
4289      # a1 = phi <a3, a0>
4290      a2 = operation (a1)
4291      a3 = operation (a2),
4292
4293      we may end up with more than one vector result.  Here we reduce them to
4294      one vector.  */
4295   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4296     {
4297       tree first_vect = PHI_RESULT (new_phis[0]);
4298       tree tmp;
4299       gassign *new_vec_stmt = NULL;
4300
4301       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4302       for (k = 1; k < new_phis.length (); k++)
4303         {
4304           gimple next_phi = new_phis[k];
4305           tree second_vect = PHI_RESULT (next_phi);
4306
4307           tmp = build2 (code, vectype,  first_vect, second_vect);
4308           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4309           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4310           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4311           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4312         }
4313
4314       new_phi_result = first_vect;
4315       if (new_vec_stmt)
4316         {
4317           new_phis.truncate (0);
4318           new_phis.safe_push (new_vec_stmt);
4319         }
4320     }
4321   else
4322     new_phi_result = PHI_RESULT (new_phis[0]);
4323
4324   /* 2.3 Create the reduction code, using one of the three schemes described
4325          above. In SLP we simply need to extract all the elements from the
4326          vector (without reducing them), so we use scalar shifts.  */
4327   if (reduc_code != ERROR_MARK && !slp_reduc)
4328     {
4329       tree tmp;
4330       tree vec_elem_type;
4331
4332       /*** Case 1:  Create:
4333            v_out2 = reduc_expr <v_out1>  */
4334
4335       if (dump_enabled_p ())
4336         dump_printf_loc (MSG_NOTE, vect_location,
4337                          "Reduce using direct vector reduction.\n");
4338
4339       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4340       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4341         {
4342           tree tmp_dest =
4343               vect_create_destination_var (scalar_dest, vec_elem_type);
4344           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4345           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4346           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4347           gimple_assign_set_lhs (epilog_stmt, new_temp);
4348           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4349
4350           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4351         }
4352       else
4353         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4354       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4355       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4356       gimple_assign_set_lhs (epilog_stmt, new_temp);
4357       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4358       scalar_results.safe_push (new_temp);
4359     }
4360   else
4361     {
4362       bool reduce_with_shift = have_whole_vector_shift (mode);
4363       int element_bitsize = tree_to_uhwi (bitsize);
4364       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4365       tree vec_temp;
4366
4367       /* Regardless of whether we have a whole vector shift, if we're
4368          emulating the operation via tree-vect-generic, we don't want
4369          to use it.  Only the first round of the reduction is likely
4370          to still be profitable via emulation.  */
4371       /* ??? It might be better to emit a reduction tree code here, so that
4372          tree-vect-generic can expand the first round via bit tricks.  */
4373       if (!VECTOR_MODE_P (mode))
4374         reduce_with_shift = false;
4375       else
4376         {
4377           optab optab = optab_for_tree_code (code, vectype, optab_default);
4378           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4379             reduce_with_shift = false;
4380         }
4381
4382       if (reduce_with_shift && !slp_reduc)
4383         {
4384           int nelements = vec_size_in_bits / element_bitsize;
4385           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4386
4387           int elt_offset;
4388
4389           tree zero_vec = build_zero_cst (vectype);
4390           /*** Case 2: Create:
4391              for (offset = nelements/2; offset >= 1; offset/=2)
4392                 {
4393                   Create:  va' = vec_shift <va, offset>
4394                   Create:  va = vop <va, va'>
4395                 }  */
4396
4397           tree rhs;
4398
4399           if (dump_enabled_p ())
4400             dump_printf_loc (MSG_NOTE, vect_location,
4401                              "Reduce using vector shifts\n");
4402
4403           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4404           new_temp = new_phi_result;
4405           for (elt_offset = nelements / 2;
4406                elt_offset >= 1;
4407                elt_offset /= 2)
4408             {
4409               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4410               tree mask = vect_gen_perm_mask_any (vectype, sel);
4411               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4412                                                  new_temp, zero_vec, mask);
4413               new_name = make_ssa_name (vec_dest, epilog_stmt);
4414               gimple_assign_set_lhs (epilog_stmt, new_name);
4415               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4416
4417               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4418                                                  new_temp);
4419               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4420               gimple_assign_set_lhs (epilog_stmt, new_temp);
4421               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4422             }
4423
4424           /* 2.4  Extract the final scalar result.  Create:
4425              s_out3 = extract_field <v_out2, bitpos>  */
4426
4427           if (dump_enabled_p ())
4428             dump_printf_loc (MSG_NOTE, vect_location,
4429                              "extract scalar result\n");
4430
4431           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4432                         bitsize, bitsize_zero_node);
4433           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4434           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4435           gimple_assign_set_lhs (epilog_stmt, new_temp);
4436           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4437           scalar_results.safe_push (new_temp);
4438         }
4439       else
4440         {
4441           /*** Case 3: Create:
4442              s = extract_field <v_out2, 0>
4443              for (offset = element_size;
4444                   offset < vector_size;
4445                   offset += element_size;)
4446                {
4447                  Create:  s' = extract_field <v_out2, offset>
4448                  Create:  s = op <s, s'>  // For non SLP cases
4449                }  */
4450
4451           if (dump_enabled_p ())
4452             dump_printf_loc (MSG_NOTE, vect_location,
4453                              "Reduce using scalar code.\n");
4454
4455           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4456           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4457             {
4458               int bit_offset;
4459               if (gimple_code (new_phi) == GIMPLE_PHI)
4460                 vec_temp = PHI_RESULT (new_phi);
4461               else
4462                 vec_temp = gimple_assign_lhs (new_phi);
4463               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4464                             bitsize_zero_node);
4465               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4466               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4467               gimple_assign_set_lhs (epilog_stmt, new_temp);
4468               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4469
4470               /* In SLP we don't need to apply reduction operation, so we just
4471                  collect s' values in SCALAR_RESULTS.  */
4472               if (slp_reduc)
4473                 scalar_results.safe_push (new_temp);
4474
4475               for (bit_offset = element_bitsize;
4476                    bit_offset < vec_size_in_bits;
4477                    bit_offset += element_bitsize)
4478                 {
4479                   tree bitpos = bitsize_int (bit_offset);
4480                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4481                                      bitsize, bitpos);
4482
4483                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4484                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4485                   gimple_assign_set_lhs (epilog_stmt, new_name);
4486                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4487
4488                   if (slp_reduc)
4489                     {
4490                       /* In SLP we don't need to apply reduction operation, so
4491                          we just collect s' values in SCALAR_RESULTS.  */
4492                       new_temp = new_name;
4493                       scalar_results.safe_push (new_name);
4494                     }
4495                   else
4496                     {
4497                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4498                                                          new_name, new_temp);
4499                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4500                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4501                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4502                     }
4503                 }
4504             }
4505
4506           /* The only case where we need to reduce scalar results in SLP, is
4507              unrolling.  If the size of SCALAR_RESULTS is greater than
4508              GROUP_SIZE, we reduce them combining elements modulo
4509              GROUP_SIZE.  */
4510           if (slp_reduc)
4511             {
4512               tree res, first_res, new_res;
4513               gimple new_stmt;
4514
4515               /* Reduce multiple scalar results in case of SLP unrolling.  */
4516               for (j = group_size; scalar_results.iterate (j, &res);
4517                    j++)
4518                 {
4519                   first_res = scalar_results[j % group_size];
4520                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4521                                                   first_res, res);
4522                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4523                   gimple_assign_set_lhs (new_stmt, new_res);
4524                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4525                   scalar_results[j % group_size] = new_res;
4526                 }
4527             }
4528           else
4529             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4530             scalar_results.safe_push (new_temp);
4531         }
4532     }
4533
4534 vect_finalize_reduction:
4535
4536   if (double_reduc)
4537     loop = loop->inner;
4538
4539   /* 2.5 Adjust the final result by the initial value of the reduction
4540          variable. (When such adjustment is not needed, then
4541          'adjustment_def' is zero).  For example, if code is PLUS we create:
4542          new_temp = loop_exit_def + adjustment_def  */
4543
4544   if (adjustment_def)
4545     {
4546       gcc_assert (!slp_reduc);
4547       if (nested_in_vect_loop)
4548         {
4549           new_phi = new_phis[0];
4550           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4551           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4552           new_dest = vect_create_destination_var (scalar_dest, vectype);
4553         }
4554       else
4555         {
4556           new_temp = scalar_results[0];
4557           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4558           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4559           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4560         }
4561
4562       epilog_stmt = gimple_build_assign (new_dest, expr);
4563       new_temp = make_ssa_name (new_dest, epilog_stmt);
4564       gimple_assign_set_lhs (epilog_stmt, new_temp);
4565       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4566       if (nested_in_vect_loop)
4567         {
4568           set_vinfo_for_stmt (epilog_stmt,
4569                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4570                                                  NULL));
4571           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4572                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4573
4574           if (!double_reduc)
4575             scalar_results.quick_push (new_temp);
4576           else
4577             scalar_results[0] = new_temp;
4578         }
4579       else
4580         scalar_results[0] = new_temp;
4581
4582       new_phis[0] = epilog_stmt;
4583     }
4584
4585   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4586           phis with new adjusted scalar results, i.e., replace use <s_out0>
4587           with use <s_out4>.
4588
4589      Transform:
4590         loop_exit:
4591           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4592           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4593           v_out2 = reduce <v_out1>
4594           s_out3 = extract_field <v_out2, 0>
4595           s_out4 = adjust_result <s_out3>
4596           use <s_out0>
4597           use <s_out0>
4598
4599      into:
4600
4601         loop_exit:
4602           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4603           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4604           v_out2 = reduce <v_out1>
4605           s_out3 = extract_field <v_out2, 0>
4606           s_out4 = adjust_result <s_out3>
4607           use <s_out4>
4608           use <s_out4> */
4609
4610
4611   /* In SLP reduction chain we reduce vector results into one vector if
4612      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4613      the last stmt in the reduction chain, since we are looking for the loop
4614      exit phi node.  */
4615   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4616     {
4617       gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4618       /* Handle reduction patterns.  */
4619       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4620         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4621
4622       scalar_dest = gimple_assign_lhs (dest_stmt);
4623       group_size = 1;
4624     }
4625
4626   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4627      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4628      need to match SCALAR_RESULTS with corresponding statements.  The first
4629      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4630      the first vector stmt, etc.
4631      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4632   if (group_size > new_phis.length ())
4633     {
4634       ratio = group_size / new_phis.length ();
4635       gcc_assert (!(group_size % new_phis.length ()));
4636     }
4637   else
4638     ratio = 1;
4639
4640   for (k = 0; k < group_size; k++)
4641     {
4642       if (k % ratio == 0)
4643         {
4644           epilog_stmt = new_phis[k / ratio];
4645           reduction_phi = reduction_phis[k / ratio];
4646           if (double_reduc)
4647             inner_phi = inner_phis[k / ratio];
4648         }
4649
4650       if (slp_reduc)
4651         {
4652           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4653
4654           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4655           /* SLP statements can't participate in patterns.  */
4656           gcc_assert (!orig_stmt);
4657           scalar_dest = gimple_assign_lhs (current_stmt);
4658         }
4659
4660       phis.create (3);
4661       /* Find the loop-closed-use at the loop exit of the original scalar
4662          result.  (The reduction result is expected to have two immediate uses -
4663          one at the latch block, and one at the loop exit).  */
4664       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4665         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4666             && !is_gimple_debug (USE_STMT (use_p)))
4667           phis.safe_push (USE_STMT (use_p));
4668
4669       /* While we expect to have found an exit_phi because of loop-closed-ssa
4670          form we can end up without one if the scalar cycle is dead.  */
4671
4672       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4673         {
4674           if (outer_loop)
4675             {
4676               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4677               gphi *vect_phi;
4678
4679               /* FORNOW. Currently not supporting the case that an inner-loop
4680                  reduction is not used in the outer-loop (but only outside the
4681                  outer-loop), unless it is double reduction.  */
4682               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4683                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4684                           || double_reduc);
4685
4686               if (double_reduc)
4687                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4688               else
4689                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4690               if (!double_reduc
4691                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4692                       != vect_double_reduction_def)
4693                 continue;
4694
4695               /* Handle double reduction:
4696
4697                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4698                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4699                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4700                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4701
4702                  At that point the regular reduction (stmt2 and stmt3) is
4703                  already vectorized, as well as the exit phi node, stmt4.
4704                  Here we vectorize the phi node of double reduction, stmt1, and
4705                  update all relevant statements.  */
4706
4707               /* Go through all the uses of s2 to find double reduction phi
4708                  node, i.e., stmt1 above.  */
4709               orig_name = PHI_RESULT (exit_phi);
4710               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4711                 {
4712                   stmt_vec_info use_stmt_vinfo;
4713                   stmt_vec_info new_phi_vinfo;
4714                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4715                   basic_block bb = gimple_bb (use_stmt);
4716                   gimple use;
4717
4718                   /* Check that USE_STMT is really double reduction phi
4719                      node.  */
4720                   if (gimple_code (use_stmt) != GIMPLE_PHI
4721                       || gimple_phi_num_args (use_stmt) != 2
4722                       || bb->loop_father != outer_loop)
4723                     continue;
4724                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4725                   if (!use_stmt_vinfo
4726                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4727                           != vect_double_reduction_def)
4728                     continue;
4729
4730                   /* Create vector phi node for double reduction:
4731                      vs1 = phi <vs0, vs2>
4732                      vs1 was created previously in this function by a call to
4733                        vect_get_vec_def_for_operand and is stored in
4734                        vec_initial_def;
4735                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4736                      vs0 is created here.  */
4737
4738                   /* Create vector phi node.  */
4739                   vect_phi = create_phi_node (vec_initial_def, bb);
4740                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4741                                     loop_vec_info_for_loop (outer_loop), NULL);
4742                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4743
4744                   /* Create vs0 - initial def of the double reduction phi.  */
4745                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4746                                              loop_preheader_edge (outer_loop));
4747                   init_def = get_initial_def_for_reduction (stmt,
4748                                                           preheader_arg, NULL);
4749                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4750                                                     vectype, NULL);
4751
4752                   /* Update phi node arguments with vs0 and vs2.  */
4753                   add_phi_arg (vect_phi, vect_phi_init,
4754                                loop_preheader_edge (outer_loop),
4755                                UNKNOWN_LOCATION);
4756                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4757                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4758                   if (dump_enabled_p ())
4759                     {
4760                       dump_printf_loc (MSG_NOTE, vect_location,
4761                                        "created double reduction phi node: ");
4762                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4763                       dump_printf (MSG_NOTE, "\n");
4764                     }
4765
4766                   vect_phi_res = PHI_RESULT (vect_phi);
4767
4768                   /* Replace the use, i.e., set the correct vs1 in the regular
4769                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4770                      loop is redundant.  */
4771                   use = reduction_phi;
4772                   for (j = 0; j < ncopies; j++)
4773                     {
4774                       edge pr_edge = loop_preheader_edge (loop);
4775                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4776                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4777                     }
4778                 }
4779             }
4780         }
4781
4782       phis.release ();
4783       if (nested_in_vect_loop)
4784         {
4785           if (double_reduc)
4786             loop = outer_loop;
4787           else
4788             continue;
4789         }
4790
4791       phis.create (3);
4792       /* Find the loop-closed-use at the loop exit of the original scalar
4793          result.  (The reduction result is expected to have two immediate uses,
4794          one at the latch block, and one at the loop exit).  For double
4795          reductions we are looking for exit phis of the outer loop.  */
4796       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4797         {
4798           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4799             {
4800               if (!is_gimple_debug (USE_STMT (use_p)))
4801                 phis.safe_push (USE_STMT (use_p));
4802             }
4803           else
4804             {
4805               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4806                 {
4807                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4808
4809                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4810                     {
4811                       if (!flow_bb_inside_loop_p (loop,
4812                                              gimple_bb (USE_STMT (phi_use_p)))
4813                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4814                         phis.safe_push (USE_STMT (phi_use_p));
4815                     }
4816                 }
4817             }
4818         }
4819
4820       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4821         {
4822           /* Replace the uses:  */
4823           orig_name = PHI_RESULT (exit_phi);
4824           scalar_result = scalar_results[k];
4825           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4826             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4827               SET_USE (use_p, scalar_result);
4828         }
4829
4830       phis.release ();
4831     }
4832 }
4833
4834
4835 /* Function vectorizable_reduction.
4836
4837    Check if STMT performs a reduction operation that can be vectorized.
4838    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4839    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4840    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4841
4842    This function also handles reduction idioms (patterns) that have been
4843    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4844    of this form:
4845      X = pattern_expr (arg0, arg1, ..., X)
4846    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4847    sequence that had been detected and replaced by the pattern-stmt (STMT).
4848
4849    In some cases of reduction patterns, the type of the reduction variable X is
4850    different than the type of the other arguments of STMT.
4851    In such cases, the vectype that is used when transforming STMT into a vector
4852    stmt is different than the vectype that is used to determine the
4853    vectorization factor, because it consists of a different number of elements
4854    than the actual number of elements that are being operated upon in parallel.
4855
4856    For example, consider an accumulation of shorts into an int accumulator.
4857    On some targets it's possible to vectorize this pattern operating on 8
4858    shorts at a time (hence, the vectype for purposes of determining the
4859    vectorization factor should be V8HI); on the other hand, the vectype that
4860    is used to create the vector form is actually V4SI (the type of the result).
4861
4862    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4863    indicates what is the actual level of parallelism (V8HI in the example), so
4864    that the right vectorization factor would be derived.  This vectype
4865    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4866    be used to create the vectorized stmt.  The right vectype for the vectorized
4867    stmt is obtained from the type of the result X:
4868         get_vectype_for_scalar_type (TREE_TYPE (X))
4869
4870    This means that, contrary to "regular" reductions (or "regular" stmts in
4871    general), the following equation:
4872       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4873    does *NOT* necessarily hold for reduction patterns.  */
4874
4875 bool
4876 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4877                         gimple *vec_stmt, slp_tree slp_node)
4878 {
4879   tree vec_dest;
4880   tree scalar_dest;
4881   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4882   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4883   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4884   tree vectype_in = NULL_TREE;
4885   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4886   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4887   enum tree_code code, orig_code, epilog_reduc_code;
4888   machine_mode vec_mode;
4889   int op_type;
4890   optab optab, reduc_optab;
4891   tree new_temp = NULL_TREE;
4892   tree def;
4893   gimple def_stmt;
4894   enum vect_def_type dt;
4895   gphi *new_phi = NULL;
4896   tree scalar_type;
4897   bool is_simple_use;
4898   gimple orig_stmt;
4899   stmt_vec_info orig_stmt_info;
4900   tree expr = NULL_TREE;
4901   int i;
4902   int ncopies;
4903   int epilog_copies;
4904   stmt_vec_info prev_stmt_info, prev_phi_info;
4905   bool single_defuse_cycle = false;
4906   tree reduc_def = NULL_TREE;
4907   gimple new_stmt = NULL;
4908   int j;
4909   tree ops[3];
4910   bool nested_cycle = false, found_nested_cycle_def = false;
4911   gimple reduc_def_stmt = NULL;
4912   bool double_reduc = false, dummy;
4913   basic_block def_bb;
4914   struct loop * def_stmt_loop, *outer_loop = NULL;
4915   tree def_arg;
4916   gimple def_arg_stmt;
4917   auto_vec<tree> vec_oprnds0;
4918   auto_vec<tree> vec_oprnds1;
4919   auto_vec<tree> vect_defs;
4920   auto_vec<gimple> phis;
4921   int vec_num;
4922   tree def0, def1, tem, op0, op1 = NULL_TREE;
4923   bool first_p = true;
4924
4925   /* In case of reduction chain we switch to the first stmt in the chain, but
4926      we don't update STMT_INFO, since only the last stmt is marked as reduction
4927      and has reduction properties.  */
4928   if (GROUP_FIRST_ELEMENT (stmt_info)
4929       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4930     {
4931       stmt = GROUP_FIRST_ELEMENT (stmt_info);
4932       first_p = false;
4933     }
4934
4935   if (nested_in_vect_loop_p (loop, stmt))
4936     {
4937       outer_loop = loop;
4938       loop = loop->inner;
4939       nested_cycle = true;
4940     }
4941
4942   /* 1. Is vectorizable reduction?  */
4943   /* Not supportable if the reduction variable is used in the loop, unless
4944      it's a reduction chain.  */
4945   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4946       && !GROUP_FIRST_ELEMENT (stmt_info))
4947     return false;
4948
4949   /* Reductions that are not used even in an enclosing outer-loop,
4950      are expected to be "live" (used out of the loop).  */
4951   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4952       && !STMT_VINFO_LIVE_P (stmt_info))
4953     return false;
4954
4955   /* Make sure it was already recognized as a reduction computation.  */
4956   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4957       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4958     return false;
4959
4960   /* 2. Has this been recognized as a reduction pattern?
4961
4962      Check if STMT represents a pattern that has been recognized
4963      in earlier analysis stages.  For stmts that represent a pattern,
4964      the STMT_VINFO_RELATED_STMT field records the last stmt in
4965      the original sequence that constitutes the pattern.  */
4966
4967   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4968   if (orig_stmt)
4969     {
4970       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4971       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4972       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4973     }
4974
4975   /* 3. Check the operands of the operation.  The first operands are defined
4976         inside the loop body. The last operand is the reduction variable,
4977         which is defined by the loop-header-phi.  */
4978
4979   gcc_assert (is_gimple_assign (stmt));
4980
4981   /* Flatten RHS.  */
4982   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4983     {
4984     case GIMPLE_SINGLE_RHS:
4985       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4986       if (op_type == ternary_op)
4987         {
4988           tree rhs = gimple_assign_rhs1 (stmt);
4989           ops[0] = TREE_OPERAND (rhs, 0);
4990           ops[1] = TREE_OPERAND (rhs, 1);
4991           ops[2] = TREE_OPERAND (rhs, 2);
4992           code = TREE_CODE (rhs);
4993         }
4994       else
4995         return false;
4996       break;
4997
4998     case GIMPLE_BINARY_RHS:
4999       code = gimple_assign_rhs_code (stmt);
5000       op_type = TREE_CODE_LENGTH (code);
5001       gcc_assert (op_type == binary_op);
5002       ops[0] = gimple_assign_rhs1 (stmt);
5003       ops[1] = gimple_assign_rhs2 (stmt);
5004       break;
5005
5006     case GIMPLE_TERNARY_RHS:
5007       code = gimple_assign_rhs_code (stmt);
5008       op_type = TREE_CODE_LENGTH (code);
5009       gcc_assert (op_type == ternary_op);
5010       ops[0] = gimple_assign_rhs1 (stmt);
5011       ops[1] = gimple_assign_rhs2 (stmt);
5012       ops[2] = gimple_assign_rhs3 (stmt);
5013       break;
5014
5015     case GIMPLE_UNARY_RHS:
5016       return false;
5017
5018     default:
5019       gcc_unreachable ();
5020     }
5021   /* The default is that the reduction variable is the last in statement.  */
5022   int reduc_index = op_type - 1;
5023
5024   if (code == COND_EXPR && slp_node)
5025     return false;
5026
5027   scalar_dest = gimple_assign_lhs (stmt);
5028   scalar_type = TREE_TYPE (scalar_dest);
5029   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5030       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5031     return false;
5032
5033   /* Do not try to vectorize bit-precision reductions.  */
5034   if ((TYPE_PRECISION (scalar_type)
5035        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5036     return false;
5037
5038   /* All uses but the last are expected to be defined in the loop.
5039      The last use is the reduction variable.  In case of nested cycle this
5040      assumption is not true: we use reduc_index to record the index of the
5041      reduction variable.  */
5042   for (i = 0; i < op_type - 1; i++)
5043     {
5044       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5045       if (i == 0 && code == COND_EXPR)
5046         continue;
5047
5048       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5049                                             &def_stmt, &def, &dt, &tem);
5050       if (!vectype_in)
5051         vectype_in = tem;
5052       gcc_assert (is_simple_use);
5053
5054       if (dt != vect_internal_def
5055           && dt != vect_external_def
5056           && dt != vect_constant_def
5057           && dt != vect_induction_def
5058           && !(dt == vect_nested_cycle && nested_cycle))
5059         return false;
5060
5061       if (dt == vect_nested_cycle)
5062         {
5063           found_nested_cycle_def = true;
5064           reduc_def_stmt = def_stmt;
5065           reduc_index = i;
5066         }
5067     }
5068
5069   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5070                                         &def_stmt, &def, &dt, &tem);
5071   if (!vectype_in)
5072     vectype_in = tem;
5073   gcc_assert (is_simple_use);
5074   if (!found_nested_cycle_def)
5075     reduc_def_stmt = def_stmt;
5076
5077   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5078     return false;
5079
5080   if (!(dt == vect_reduction_def
5081         || dt == vect_nested_cycle
5082         || ((dt == vect_internal_def || dt == vect_external_def
5083              || dt == vect_constant_def || dt == vect_induction_def)
5084             && nested_cycle && found_nested_cycle_def)))
5085     {
5086       /* For pattern recognized stmts, orig_stmt might be a reduction,
5087          but some helper statements for the pattern might not, or
5088          might be COND_EXPRs with reduction uses in the condition.  */
5089       gcc_assert (orig_stmt);
5090       return false;
5091     }
5092
5093   gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5094                                          !nested_cycle, &dummy);
5095   if (orig_stmt)
5096     gcc_assert (tmp == orig_stmt
5097                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5098   else
5099     /* We changed STMT to be the first stmt in reduction chain, hence we
5100        check that in this case the first element in the chain is STMT.  */
5101     gcc_assert (stmt == tmp
5102                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5103
5104   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5105     return false;
5106
5107   if (slp_node || PURE_SLP_STMT (stmt_info))
5108     ncopies = 1;
5109   else
5110     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5111                / TYPE_VECTOR_SUBPARTS (vectype_in));
5112
5113   gcc_assert (ncopies >= 1);
5114
5115   vec_mode = TYPE_MODE (vectype_in);
5116
5117   if (code == COND_EXPR)
5118     {
5119       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5120         {
5121           if (dump_enabled_p ())
5122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5123                              "unsupported condition in reduction\n");
5124
5125           return false;
5126         }
5127     }
5128   else
5129     {
5130       /* 4. Supportable by target?  */
5131
5132       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5133           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5134         {
5135           /* Shifts and rotates are only supported by vectorizable_shifts,
5136              not vectorizable_reduction.  */
5137           if (dump_enabled_p ())
5138             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5139                              "unsupported shift or rotation.\n");
5140           return false;
5141         }
5142
5143       /* 4.1. check support for the operation in the loop  */
5144       optab = optab_for_tree_code (code, vectype_in, optab_default);
5145       if (!optab)
5146         {
5147           if (dump_enabled_p ())
5148             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149                              "no optab.\n");
5150
5151           return false;
5152         }
5153
5154       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5155         {
5156           if (dump_enabled_p ())
5157             dump_printf (MSG_NOTE, "op not supported by target.\n");
5158
5159           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5160               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5161                   < vect_min_worthwhile_factor (code))
5162             return false;
5163
5164           if (dump_enabled_p ())
5165             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5166         }
5167
5168       /* Worthwhile without SIMD support?  */
5169       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5170           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5171              < vect_min_worthwhile_factor (code))
5172         {
5173           if (dump_enabled_p ())
5174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5175                              "not worthwhile without SIMD support.\n");
5176
5177           return false;
5178         }
5179     }
5180
5181   /* 4.2. Check support for the epilog operation.
5182
5183           If STMT represents a reduction pattern, then the type of the
5184           reduction variable may be different than the type of the rest
5185           of the arguments.  For example, consider the case of accumulation
5186           of shorts into an int accumulator; The original code:
5187                         S1: int_a = (int) short_a;
5188           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5189
5190           was replaced with:
5191                         STMT: int_acc = widen_sum <short_a, int_acc>
5192
5193           This means that:
5194           1. The tree-code that is used to create the vector operation in the
5195              epilog code (that reduces the partial results) is not the
5196              tree-code of STMT, but is rather the tree-code of the original
5197              stmt from the pattern that STMT is replacing.  I.e, in the example
5198              above we want to use 'widen_sum' in the loop, but 'plus' in the
5199              epilog.
5200           2. The type (mode) we use to check available target support
5201              for the vector operation to be created in the *epilog*, is
5202              determined by the type of the reduction variable (in the example
5203              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5204              However the type (mode) we use to check available target support
5205              for the vector operation to be created *inside the loop*, is
5206              determined by the type of the other arguments to STMT (in the
5207              example we'd check this: optab_handler (widen_sum_optab,
5208              vect_short_mode)).
5209
5210           This is contrary to "regular" reductions, in which the types of all
5211           the arguments are the same as the type of the reduction variable.
5212           For "regular" reductions we can therefore use the same vector type
5213           (and also the same tree-code) when generating the epilog code and
5214           when generating the code inside the loop.  */
5215
5216   if (orig_stmt)
5217     {
5218       /* This is a reduction pattern: get the vectype from the type of the
5219          reduction variable, and get the tree-code from orig_stmt.  */
5220       orig_code = gimple_assign_rhs_code (orig_stmt);
5221       gcc_assert (vectype_out);
5222       vec_mode = TYPE_MODE (vectype_out);
5223     }
5224   else
5225     {
5226       /* Regular reduction: the same vectype and tree-code as used for
5227          the vector code inside the loop can be used for the epilog code. */
5228       orig_code = code;
5229     }
5230
5231   if (nested_cycle)
5232     {
5233       def_bb = gimple_bb (reduc_def_stmt);
5234       def_stmt_loop = def_bb->loop_father;
5235       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5236                                        loop_preheader_edge (def_stmt_loop));
5237       if (TREE_CODE (def_arg) == SSA_NAME
5238           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5239           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5240           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5241           && vinfo_for_stmt (def_arg_stmt)
5242           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5243               == vect_double_reduction_def)
5244         double_reduc = true;
5245     }
5246
5247   epilog_reduc_code = ERROR_MARK;
5248   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5249     {
5250       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5251                                          optab_default);
5252       if (!reduc_optab)
5253         {
5254           if (dump_enabled_p ())
5255             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5256                              "no optab for reduction.\n");
5257
5258           epilog_reduc_code = ERROR_MARK;
5259         }
5260       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5261         {
5262           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5263           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5264             {
5265               if (dump_enabled_p ())
5266                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5267                                  "reduc op not supported by target.\n");
5268
5269               epilog_reduc_code = ERROR_MARK;
5270             }
5271         }
5272     }
5273   else
5274     {
5275       if (!nested_cycle || double_reduc)
5276         {
5277           if (dump_enabled_p ())
5278             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5279                              "no reduc code for scalar code.\n");
5280
5281           return false;
5282         }
5283     }
5284
5285   if (double_reduc && ncopies > 1)
5286     {
5287       if (dump_enabled_p ())
5288         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5289                          "multiple types in double reduction\n");
5290
5291       return false;
5292     }
5293
5294   /* In case of widening multiplication by a constant, we update the type
5295      of the constant to be the type of the other operand.  We check that the
5296      constant fits the type in the pattern recognition pass.  */
5297   if (code == DOT_PROD_EXPR
5298       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5299     {
5300       if (TREE_CODE (ops[0]) == INTEGER_CST)
5301         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5302       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5303         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5304       else
5305         {
5306           if (dump_enabled_p ())
5307             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5308                              "invalid types in dot-prod\n");
5309
5310           return false;
5311         }
5312     }
5313
5314   if (!vec_stmt) /* transformation not required.  */
5315     {
5316       if (first_p
5317           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5318                                          reduc_index))
5319         return false;
5320       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5321       return true;
5322     }
5323
5324   /** Transform.  **/
5325
5326   if (dump_enabled_p ())
5327     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5328
5329   /* FORNOW: Multiple types are not supported for condition.  */
5330   if (code == COND_EXPR)
5331     gcc_assert (ncopies == 1);
5332
5333   /* Create the destination vector  */
5334   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5335
5336   /* In case the vectorization factor (VF) is bigger than the number
5337      of elements that we can fit in a vectype (nunits), we have to generate
5338      more than one vector stmt - i.e - we need to "unroll" the
5339      vector stmt by a factor VF/nunits.  For more details see documentation
5340      in vectorizable_operation.  */
5341
5342   /* If the reduction is used in an outer loop we need to generate
5343      VF intermediate results, like so (e.g. for ncopies=2):
5344         r0 = phi (init, r0)
5345         r1 = phi (init, r1)
5346         r0 = x0 + r0;
5347         r1 = x1 + r1;
5348     (i.e. we generate VF results in 2 registers).
5349     In this case we have a separate def-use cycle for each copy, and therefore
5350     for each copy we get the vector def for the reduction variable from the
5351     respective phi node created for this copy.
5352
5353     Otherwise (the reduction is unused in the loop nest), we can combine
5354     together intermediate results, like so (e.g. for ncopies=2):
5355         r = phi (init, r)
5356         r = x0 + r;
5357         r = x1 + r;
5358    (i.e. we generate VF/2 results in a single register).
5359    In this case for each copy we get the vector def for the reduction variable
5360    from the vectorized reduction operation generated in the previous iteration.
5361   */
5362
5363   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5364     {
5365       single_defuse_cycle = true;
5366       epilog_copies = 1;
5367     }
5368   else
5369     epilog_copies = ncopies;
5370
5371   prev_stmt_info = NULL;
5372   prev_phi_info = NULL;
5373   if (slp_node)
5374     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5375   else
5376     {
5377       vec_num = 1;
5378       vec_oprnds0.create (1);
5379       if (op_type == ternary_op)
5380         vec_oprnds1.create (1);
5381     }
5382
5383   phis.create (vec_num);
5384   vect_defs.create (vec_num);
5385   if (!slp_node)
5386     vect_defs.quick_push (NULL_TREE);
5387
5388   for (j = 0; j < ncopies; j++)
5389     {
5390       if (j == 0 || !single_defuse_cycle)
5391         {
5392           for (i = 0; i < vec_num; i++)
5393             {
5394               /* Create the reduction-phi that defines the reduction
5395                  operand.  */
5396               new_phi = create_phi_node (vec_dest, loop->header);
5397               set_vinfo_for_stmt (new_phi,
5398                                   new_stmt_vec_info (new_phi, loop_vinfo,
5399                                                      NULL));
5400                if (j == 0 || slp_node)
5401                  phis.quick_push (new_phi);
5402             }
5403         }
5404
5405       if (code == COND_EXPR)
5406         {
5407           gcc_assert (!slp_node);
5408           vectorizable_condition (stmt, gsi, vec_stmt,
5409                                   PHI_RESULT (phis[0]),
5410                                   reduc_index, NULL);
5411           /* Multiple types are not supported for condition.  */
5412           break;
5413         }
5414
5415       /* Handle uses.  */
5416       if (j == 0)
5417         {
5418           op0 = ops[!reduc_index];
5419           if (op_type == ternary_op)
5420             {
5421               if (reduc_index == 0)
5422                 op1 = ops[2];
5423               else
5424                 op1 = ops[1];
5425             }
5426
5427           if (slp_node)
5428             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5429                                slp_node, -1);
5430           else
5431             {
5432               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5433                                                             stmt, NULL);
5434               vec_oprnds0.quick_push (loop_vec_def0);
5435               if (op_type == ternary_op)
5436                {
5437                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5438                                                                NULL);
5439                  vec_oprnds1.quick_push (loop_vec_def1);
5440                }
5441             }
5442         }
5443       else
5444         {
5445           if (!slp_node)
5446             {
5447               enum vect_def_type dt;
5448               gimple dummy_stmt;
5449               tree dummy;
5450
5451               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5452                                   &dummy_stmt, &dummy, &dt);
5453               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5454                                                               loop_vec_def0);
5455               vec_oprnds0[0] = loop_vec_def0;
5456               if (op_type == ternary_op)
5457                 {
5458                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5459                                       &dummy, &dt);
5460                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5461                                                                 loop_vec_def1);
5462                   vec_oprnds1[0] = loop_vec_def1;
5463                 }
5464             }
5465
5466           if (single_defuse_cycle)
5467             reduc_def = gimple_assign_lhs (new_stmt);
5468
5469           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5470         }
5471
5472       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5473         {
5474           if (slp_node)
5475             reduc_def = PHI_RESULT (phis[i]);
5476           else
5477             {
5478               if (!single_defuse_cycle || j == 0)
5479                 reduc_def = PHI_RESULT (new_phi);
5480             }
5481
5482           def1 = ((op_type == ternary_op)
5483                   ? vec_oprnds1[i] : NULL);
5484           if (op_type == binary_op)
5485             {
5486               if (reduc_index == 0)
5487                 expr = build2 (code, vectype_out, reduc_def, def0);
5488               else
5489                 expr = build2 (code, vectype_out, def0, reduc_def);
5490             }
5491           else
5492             {
5493               if (reduc_index == 0)
5494                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5495               else
5496                 {
5497                   if (reduc_index == 1)
5498                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5499                   else
5500                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5501                 }
5502             }
5503
5504           new_stmt = gimple_build_assign (vec_dest, expr);
5505           new_temp = make_ssa_name (vec_dest, new_stmt);
5506           gimple_assign_set_lhs (new_stmt, new_temp);
5507           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5508
5509           if (slp_node)
5510             {
5511               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5512               vect_defs.quick_push (new_temp);
5513             }
5514           else
5515             vect_defs[0] = new_temp;
5516         }
5517
5518       if (slp_node)
5519         continue;
5520
5521       if (j == 0)
5522         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5523       else
5524         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5525
5526       prev_stmt_info = vinfo_for_stmt (new_stmt);
5527       prev_phi_info = vinfo_for_stmt (new_phi);
5528     }
5529
5530   /* Finalize the reduction-phi (set its arguments) and create the
5531      epilog reduction code.  */
5532   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5533     {
5534       new_temp = gimple_assign_lhs (*vec_stmt);
5535       vect_defs[0] = new_temp;
5536     }
5537
5538   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5539                                     epilog_reduc_code, phis, reduc_index,
5540                                     double_reduc, slp_node);
5541
5542   return true;
5543 }
5544
5545 /* Function vect_min_worthwhile_factor.
5546
5547    For a loop where we could vectorize the operation indicated by CODE,
5548    return the minimum vectorization factor that makes it worthwhile
5549    to use generic vectors.

        CODE is the tree code of the scalar operation.  The result is 4 for
        PLUS_EXPR, MINUS_EXPR and NEGATE_EXPR, 2 for the bitwise codes
        BIT_AND_EXPR, BIT_IOR_EXPR, BIT_XOR_EXPR and BIT_NOT_EXPR, and
        INT_MAX for every other code (presumably meaning that generic
        vectors are never worthwhile for it; callers are not visible
        here).  */
5550 int
5551 vect_min_worthwhile_factor (enum tree_code code)
5552 {
5553   switch (code)
5554     {
5555     case PLUS_EXPR:
5556     case MINUS_EXPR:
5557     case NEGATE_EXPR:
5558       return 4;
5559
5560     case BIT_AND_EXPR:
5561     case BIT_IOR_EXPR:
5562     case BIT_XOR_EXPR:
5563     case BIT_NOT_EXPR:
5564       return 2;
5565
5566     default:
5567       return INT_MAX;
5568     }
5569 }
5570
5571
5572 /* Function vectorizable_induction
5573
5574    Check if PHI performs an induction computation that can be vectorized.
5575    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5576    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5577    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        GSI is unused.  When VEC_STMT is NULL only the analysis is done: the
        stmt is marked as induc_vec_info_type and its cost is recorded via
        vect_model_induction_cost.  Otherwise *VEC_STMT is set to the
        statement defining the vector returned by
        get_initial_def_for_induction.

        FALSE is returned when PHI is nested in the vectorized loop and either
        needs more than one copy, or its latch value is used outside the inner
        loop by a statement that is not relevant or is live; when PHI itself
        is not relevant; when STMT_SLP_TYPE is set for it (SLP is not
        supported); or when it is not a GIMPLE_PHI.  */
5578
5579 bool
5580 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5581                         gimple *vec_stmt)
5582 {
5583   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5584   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5585   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5586   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5587   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5588   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5589   tree vec_def;
5590
5591   gcc_assert (ncopies >= 1);
5592   /* FORNOW. These restrictions should be relaxed.  */
5593   if (nested_in_vect_loop_p (loop, phi))
5594     {
5595       imm_use_iterator imm_iter;
5596       use_operand_p use_p;
5597       gimple exit_phi;
5598       edge latch_e;
5599       tree loop_arg;
5600
5601       if (ncopies > 1)
5602         {
5603           if (dump_enabled_p ())
5604             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5605                              "multiple types in nested loop.\n");
5606           return false;
5607         }
5608
5609       exit_phi = NULL;
5610       latch_e = loop_latch_edge (loop->inner);
5611       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5612       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5613         {
5614           gimple use_stmt = USE_STMT (use_p);
5615           if (is_gimple_debug (use_stmt))
5616             continue;
5617
5618           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5619             {
5620               exit_phi = use_stmt;
5621               break;
5622             }
5623         }
5624       if (exit_phi)
5625         {
5626           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5627           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5628                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5629             {
5630               if (dump_enabled_p ())
5631                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5632                                  "inner-loop induction only used outside "
5633                                  "of the outer vectorized loop.\n");
5634               return false;
5635             }
5636         }
5637     }
5638
5639   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5640     return false;
5641
5642   /* FORNOW: SLP not supported.  */
5643   if (STMT_SLP_TYPE (stmt_info))
5644     return false;
5645
5646   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5647
5648   if (gimple_code (phi) != GIMPLE_PHI)
5649     return false;
5650
5651   if (!vec_stmt) /* Analysis only: transformation not required.  */
5652     {
5653       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5654       if (dump_enabled_p ())
5655         dump_printf_loc (MSG_NOTE, vect_location,
5656                          "=== vectorizable_induction ===\n");
5657       vect_model_induction_cost (stmt_info, ncopies);
5658       return true;
5659     }
5660
5661   /** Transform.  **/
5662
5663   if (dump_enabled_p ())
5664     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5665
5666   vec_def = get_initial_def_for_induction (phi);
5667   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5668   return true;
5669 }
5670
5671 /* Function vectorizable_live_operation.
5672
5673    STMT computes a value that is used outside the loop.  Check if
5674    it can be supported.

        GSI is unused.  STMT must be marked live (asserted).  Return FALSE
        for reduction definitions.

        A non-assignment is supported only if it is an IFN_GOMP_SIMD_LANE
        internal call with a result, whose first argument is an SSA name whose
        SSA_NAME_VAR is the simduid of the loop, and whose result is used by a
        PHI in the destination block of the loop's single exit.  For the first
        such PHI found, when VEC_STMT is non-NULL, the PHI argument for the
        exit edge is replaced by the constant VF - 1 of type
        unsigned_type_node, and TRUE is returned.

        An assignment is supported only if its LHS is an SSA name, it is not
        nested in the vectorized loop, and every operand is an external or
        constant def; nothing is transformed in that case and *VEC_STMT is not
        written.  Return TRUE if supported, FALSE otherwise.  */
5675
5676 bool
5677 vectorizable_live_operation (gimple stmt,
5678                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5679                              gimple *vec_stmt)
5680 {
5681   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5682   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5683   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5684   int i;
5685   int op_type;
5686   tree op;
5687   tree def;
5688   gimple def_stmt;
5689   enum vect_def_type dt;
5690   enum tree_code code;
5691   enum gimple_rhs_class rhs_class;
5692
5693   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5694
5695   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5696     return false;
5697
5698   if (!is_gimple_assign (stmt))
5699     {
5700       if (gimple_call_internal_p (stmt)
5701           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5702           && gimple_call_lhs (stmt)
5703           && loop->simduid
5704           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5705           && loop->simduid
5706              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5707         {
5708           edge e = single_exit (loop);
5709           basic_block merge_bb = e->dest;
5710           imm_use_iterator imm_iter;
5711           use_operand_p use_p;
5712           tree lhs = gimple_call_lhs (stmt);
5713
5714           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5715             {
5716               gimple use_stmt = USE_STMT (use_p);
5717               if (gimple_code (use_stmt) == GIMPLE_PHI
5718                   && gimple_bb (use_stmt) == merge_bb)
5719                 {
5720                   if (vec_stmt)
5721                     {
5722                       tree vfm1
5723                         = build_int_cst (unsigned_type_node,
5724                                          loop_vinfo->vectorization_factor - 1);
5725                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5726                     }
5727                   return true;
5728                 }
5729             }
5730         }
5731
5732       return false;
5733     }
5734
5735   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5736     return false;
5737
5738   /* FORNOW. CHECKME. */
5739   if (nested_in_vect_loop_p (loop, stmt))
5740     return false;
5741
5742   code = gimple_assign_rhs_code (stmt);
5743   op_type = TREE_CODE_LENGTH (code);
5744   rhs_class = get_gimple_rhs_class (code);
5745   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5746   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5747
5748   /* FORNOW: support only if all uses are invariant.  This means
5749      that the scalar operations can remain in place, unvectorized.
5750      The original last scalar value that they compute will be used.

          NOTE(review): a NULL operand skips vect_is_simple_use, so the DT
          test at the end of the loop body then sees whatever an earlier
          iteration left in DT (nothing has set it when i == 0) -- confirm
          that OP cannot be NULL in that position.  */
5751
5752   for (i = 0; i < op_type; i++)
5753     {
5754       if (rhs_class == GIMPLE_SINGLE_RHS)
5755         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5756       else
5757         op = gimple_op (stmt, i + 1);
5758       if (op
5759           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5760                                   &dt))
5761         {
5762           if (dump_enabled_p ())
5763             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764                              "use not simple.\n");
5765           return false;
5766         }
5767
5768       if (dt != vect_external_def && dt != vect_constant_def)
5769         return false;
5770     }
5771
5772   /* No transformation is required for the cases we currently support.  */
5773   return true;
5774 }
5775
5776 /* Kill any debug uses outside LOOP of SSA names defined in STMT.
        STMT may be a PHI or an ordinary statement.  Each debug bind statement
        outside LOOP that uses one of its defs has its value reset and is
        then updated; any other kind of debug statement found there hits
        gcc_unreachable.  Uses inside LOOP and non-debug uses are left
        alone.  */
5777
5778 static void
5779 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5780 {
5781   ssa_op_iter op_iter;
5782   imm_use_iterator imm_iter;
5783   def_operand_p def_p;
5784   gimple ustmt;
5785
5786   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5787     {
5788       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5789         {
5790           basic_block bb;
5791
5792           if (!is_gimple_debug (ustmt))
5793             continue;
5794
5795           bb = gimple_bb (ustmt);
5796
5797           if (!flow_bb_inside_loop_p (loop, bb))
5798             {
5799               if (gimple_debug_bind_p (ustmt))
5800                 {
5801                   if (dump_enabled_p ())
5802                     dump_printf_loc (MSG_NOTE, vect_location,
5803                                      "killing debug use\n");
5804
5805                   gimple_debug_bind_reset_value (ustmt);
5806                   update_stmt (ustmt);
5807                 }
5808               else
5809                 gcc_unreachable ();
5810             }
5811         }
5812     }
5813 }
5814
5815
5816 /* This function builds ni_name = number of iterations.  Statements
5817    are emitted on the loop preheader edge.

        The iteration count is taken from LOOP_VINFO_NITERS of LOOP_VINFO and
        passed through unshare_expr.  If it is an INTEGER_CST it is returned
        as is and nothing is emitted.  Otherwise it is gimplified with
        force_gimple_operand using a temporary named "niters", any resulting
        statements are inserted on the preheader edge, and the resulting
        operand is returned.  */
5818
5819 static tree
5820 vect_build_loop_niters (loop_vec_info loop_vinfo)
5821 {
5822   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5823   if (TREE_CODE (ni) == INTEGER_CST)
5824     return ni;
5825   else
5826     {
5827       tree ni_name, var;
5828       gimple_seq stmts = NULL;
5829       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5830
5831       var = create_tmp_var (TREE_TYPE (ni), "niters");
5832       ni_name = force_gimple_operand (ni, &stmts, false, var);
5833       if (stmts)
5834         gsi_insert_seq_on_edge_immediate (pe, stmts);
5835
5836       return ni_name;
5837     }
5838 }
5839
5840
5841 /* This function generates the following statements:
5842
5843    ni_name = number of iterations loop executes
5844    ratio = ni_name / vf
5845    ratio_mult_vf_name = ratio * vf
5846
5847    and places them on the loop preheader edge.

        NI_NAME is supplied by the caller rather than built here.  RATIO is
        computed as ((ni - vf) >> log2 (vf)) + 1, where ni is NI_NAME minus
        one when LOOP_VINFO_PEELING_FOR_GAPS is set and NI_NAME otherwise; the
        result is stored in *RATIO_NAME_PTR.  If RATIO_MULT_VF_NAME_PTR is
        non-NULL, ratio << log2 (vf) is stored through it; otherwise that
        value is not built.  Statements are inserted only for values that are
        not already gimple values.  log2 (vf) comes from exact_log2, so VF is
        presumably a power of two -- TODO confirm.  */
5848
5849 static void
5850 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5851                                  tree ni_name,
5852                                  tree *ratio_mult_vf_name_ptr,
5853                                  tree *ratio_name_ptr)
5854 {
5855   tree ni_minus_gap_name;
5856   tree var;
5857   tree ratio_name;
5858   tree ratio_mult_vf_name;
5859   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5860   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5861   tree log_vf;
5862
5863   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5864
5865   /* If epilogue loop is required because of data accesses with gaps, we
5866      subtract one iteration from the total number of iterations here for
5867      correct calculation of RATIO.  */
5868   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5869     {
5870       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5871                                        ni_name,
5872                                        build_one_cst (TREE_TYPE (ni_name)));
5873       if (!is_gimple_val (ni_minus_gap_name))
5874         {
5875           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5876           gimple stmts = NULL;
5877           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5878                                                     true, var);
5879           gsi_insert_seq_on_edge_immediate (pe, stmts);
5880         }
5881     }
5882   else
5883     ni_minus_gap_name = ni_name;
5884
5885   /* Create: ratio = ni >> log2(vf) */
5886   /* ???  As we have ni == number of latch executions + 1, ni could
5887      have overflown to zero.  So avoid computing ratio based on ni
5888      but compute it using the fact that we know ratio will be at least
5889      one, thus via (ni - vf) >> log2(vf) + 1.  */
5890   ratio_name
5891     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5892                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5893                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5894                                              ni_minus_gap_name,
5895                                              build_int_cst
5896                                                (TREE_TYPE (ni_name), vf)),
5897                                 log_vf),
5898                    build_int_cst (TREE_TYPE (ni_name), 1));
5899   if (!is_gimple_val (ratio_name))
5900     {
5901       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5902       gimple stmts = NULL;
5903       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5904       gsi_insert_seq_on_edge_immediate (pe, stmts);
5905     }
5906   *ratio_name_ptr = ratio_name;
5907
5908   /* Create: ratio_mult_vf = ratio << log2 (vf).  Skipped when the caller
          passed a NULL RATIO_MULT_VF_NAME_PTR.  */
5909
5910   if (ratio_mult_vf_name_ptr)
5911     {
5912       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5913                                         ratio_name, log_vf);
5914       if (!is_gimple_val (ratio_mult_vf_name))
5915         {
5916           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5917           gimple stmts = NULL;
5918           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5919                                                      true, var);
5920           gsi_insert_seq_on_edge_immediate (pe, stmts);
5921         }
5922       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5923     }
5924
5925   return;
5926 }
5927
5928
5929 /* Function vect_transform_loop.
5930
5931    The analysis phase has determined that the loop is vectorizable.
5932    Vectorize the loop - create vectorized stmts to replace the scalar
5933    stmts in the loop, and update the loop exit condition.  */
5934
5935 void
5936 vect_transform_loop (loop_vec_info loop_vinfo)
5937 {
5938   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5939   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5940   int nbbs = loop->num_nodes;
5941   int i;
5942   tree ratio = NULL;
5943   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5944   bool grouped_store;
5945   bool slp_scheduled = false;
5946   gimple stmt, pattern_stmt;
5947   gimple_seq pattern_def_seq = NULL;
5948   gimple_stmt_iterator pattern_def_si = gsi_none ();
5949   bool transform_pattern_stmt = false;
5950   bool check_profitability = false;
5951   int th;
5952   /* Record number of iterations before we started tampering with the profile. */
5953   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5954
5955   if (dump_enabled_p ())
5956     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5957
5958   /* If profile is imprecise, we have a chance to fix it up.  */
5959   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5960     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5961
5962   /* Use the more conservative vectorization threshold.  If the number
5963      of iterations is constant assume the cost check has been performed
5964      by our caller.  If the threshold makes all loops profitable that
5965      run at least the vectorization factor number of times checking
5966      is pointless, too.  */
5967   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5968   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5969       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5970     {
5971       if (dump_enabled_p ())
5972         dump_printf_loc (MSG_NOTE, vect_location,
5973                          "Profitability threshold is %d loop iterations.\n",
5974                          th);
5975       check_profitability = true;
5976     }
5977
5978   /* Version the loop first, if required, so the profitability check
5979      comes first.  */
5980
5981   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5982       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5983     {
5984       vect_loop_versioning (loop_vinfo, th, check_profitability);
5985       check_profitability = false;
5986     }
5987
5988   tree ni_name = vect_build_loop_niters (loop_vinfo);
5989   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5990
5991   /* Peel the loop if there are data refs with unknown alignment.
5992      Only one data ref with unknown store is allowed.  */
5993
5994   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5995     {
5996       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5997                                      th, check_profitability);
5998       check_profitability = false;
5999       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6000          be re-computed.  */
6001       ni_name = NULL_TREE;
6002     }
6003
6004   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6005      compile time constant), or it is a constant that doesn't divide by the
6006      vectorization factor, then an epilog loop needs to be created.
6007      We therefore duplicate the loop: the original loop will be vectorized,
6008      and will compute the first (n/VF) iterations.  The second copy of the loop
6009      will remain scalar and will compute the remaining (n%VF) iterations.
6010      (VF is the vectorization factor).  */
6011
6012   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6013       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6014     {
6015       tree ratio_mult_vf;
6016       if (!ni_name)
6017         ni_name = vect_build_loop_niters (loop_vinfo);
6018       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6019                                        &ratio);
6020       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6021                                       th, check_profitability);
6022     }
6023   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6024     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6025                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6026   else
6027     {
6028       if (!ni_name)
6029         ni_name = vect_build_loop_niters (loop_vinfo);
6030       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6031     }
6032
6033   /* 1) Make sure the loop header has exactly two entries
6034      2) Make sure we have a preheader basic block.  */
6035
6036   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6037
6038   split_edge (loop_preheader_edge (loop));
6039
6040   /* FORNOW: the vectorizer supports only loops whose body consists
6041      of one basic block (header + empty latch).  When the vectorizer
6042      supports more involved loop forms, the order in which the BBs are
6043      traversed needs to be reconsidered.  */
6044
6045   for (i = 0; i < nbbs; i++)
6046     {
6047       basic_block bb = bbs[i];
6048       stmt_vec_info stmt_info;
6049
6050       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6051            gsi_next (&si))
6052         {
6053           gphi *phi = si.phi ();
6054           if (dump_enabled_p ())
6055             {
6056               dump_printf_loc (MSG_NOTE, vect_location,
6057                                "------>vectorizing phi: ");
6058               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6059               dump_printf (MSG_NOTE, "\n");
6060             }
6061           stmt_info = vinfo_for_stmt (phi);
6062           if (!stmt_info)
6063             continue;
6064
6065           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6066             vect_loop_kill_debug_uses (loop, phi);
6067
6068           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6069               && !STMT_VINFO_LIVE_P (stmt_info))
6070             continue;
6071
6072           if (STMT_VINFO_VECTYPE (stmt_info)
6073               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6074                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6075               && dump_enabled_p ())
6076             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6077
6078           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6079             {
6080               if (dump_enabled_p ())
6081                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6082               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6083             }
6084         }
6085
6086       pattern_stmt = NULL;
6087       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6088            !gsi_end_p (si) || transform_pattern_stmt;)
6089         {
6090           bool is_store;
6091
6092           if (transform_pattern_stmt)
6093             stmt = pattern_stmt;
6094           else
6095             {
6096               stmt = gsi_stmt (si);
6097               /* During vectorization remove existing clobber stmts.  */
6098               if (gimple_clobber_p (stmt))
6099                 {
6100                   unlink_stmt_vdef (stmt);
6101                   gsi_remove (&si, true);
6102                   release_defs (stmt);
6103                   continue;
6104                 }
6105             }
6106
6107           if (dump_enabled_p ())
6108             {
6109               dump_printf_loc (MSG_NOTE, vect_location,
6110                                "------>vectorizing statement: ");
6111               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6112               dump_printf (MSG_NOTE, "\n");
6113             }
6114
6115           stmt_info = vinfo_for_stmt (stmt);
6116
6117           /* vector stmts created in the outer-loop during vectorization of
6118              stmts in an inner-loop may not have a stmt_info, and do not
6119              need to be vectorized.  */
6120           if (!stmt_info)
6121             {
6122               gsi_next (&si);
6123               continue;
6124             }
6125
6126           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6127             vect_loop_kill_debug_uses (loop, stmt);
6128
6129           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6130               && !STMT_VINFO_LIVE_P (stmt_info))
6131             {
6132               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6133                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6134                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6135                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6136                 {
6137                   stmt = pattern_stmt;
6138                   stmt_info = vinfo_for_stmt (stmt);
6139                 }
6140               else
6141                 {
6142                   gsi_next (&si);
6143                   continue;
6144                 }
6145             }
6146           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6147                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6148                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6149                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6150             transform_pattern_stmt = true;
6151
6152           /* If pattern statement has def stmts, vectorize them too.  */
6153           if (is_pattern_stmt_p (stmt_info))
6154             {
6155               if (pattern_def_seq == NULL)
6156                 {
6157                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6158                   pattern_def_si = gsi_start (pattern_def_seq);
6159                 }
6160               else if (!gsi_end_p (pattern_def_si))
6161                 gsi_next (&pattern_def_si);
6162               if (pattern_def_seq != NULL)
6163                 {
6164                   gimple pattern_def_stmt = NULL;
6165                   stmt_vec_info pattern_def_stmt_info = NULL;
6166
6167                   while (!gsi_end_p (pattern_def_si))
6168                     {
6169                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6170                       pattern_def_stmt_info
6171                         = vinfo_for_stmt (pattern_def_stmt);
6172                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6173                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6174                         break;
6175                       gsi_next (&pattern_def_si);
6176                     }
6177
6178                   if (!gsi_end_p (pattern_def_si))
6179                     {
6180                       if (dump_enabled_p ())
6181                         {
6182                           dump_printf_loc (MSG_NOTE, vect_location,
6183                                            "==> vectorizing pattern def "
6184                                            "stmt: ");
6185                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6186                                             pattern_def_stmt, 0);
6187                           dump_printf (MSG_NOTE, "\n");
6188                         }
6189
6190                       stmt = pattern_def_stmt;
6191                       stmt_info = pattern_def_stmt_info;
6192                     }
6193                   else
6194                     {
6195                       pattern_def_si = gsi_none ();
6196                       transform_pattern_stmt = false;
6197                     }
6198                 }
6199               else
6200                 transform_pattern_stmt = false;
6201             }
6202
6203           if (STMT_VINFO_VECTYPE (stmt_info))
6204             {
6205               unsigned int nunits
6206                 = (unsigned int)
6207                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6208               if (!STMT_SLP_TYPE (stmt_info)
6209                   && nunits != (unsigned int) vectorization_factor
6210                   && dump_enabled_p ())
6211                   /* For SLP VF is set according to unrolling factor, and not
6212                      to vector size, hence for SLP this print is not valid.  */
6213                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6214             }
6215
6216           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6217              reached.  */
6218           if (STMT_SLP_TYPE (stmt_info))
6219             {
6220               if (!slp_scheduled)
6221                 {
6222                   slp_scheduled = true;
6223
6224                   if (dump_enabled_p ())
6225                     dump_printf_loc (MSG_NOTE, vect_location,
6226                                      "=== scheduling SLP instances ===\n");
6227
6228                   vect_schedule_slp (loop_vinfo, NULL);
6229                 }
6230
6231               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6232               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6233                 {
6234                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6235                     {
6236                       pattern_def_seq = NULL;
6237                       gsi_next (&si);
6238                     }
6239                   continue;
6240                 }
6241             }
6242
6243           /* -------- vectorize statement ------------ */
6244           if (dump_enabled_p ())
6245             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6246
6247           grouped_store = false;
6248           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6249           if (is_store)
6250             {
6251               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6252                 {
6253                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6254                      interleaving chain was completed - free all the stores in
6255                      the chain.  */
6256                   gsi_next (&si);
6257                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6258                 }
6259               else
6260                 {
6261                   /* Free the attached stmt_vec_info and remove the stmt.  */
6262                   gimple store = gsi_stmt (si);
6263                   free_stmt_vec_info (store);
6264                   unlink_stmt_vdef (store);
6265                   gsi_remove (&si, true);
6266                   release_defs (store);
6267                 }
6268
6269               /* Stores can only appear at the end of pattern statements.  */
6270               gcc_assert (!transform_pattern_stmt);
6271               pattern_def_seq = NULL;
6272             }
6273           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6274             {
6275               pattern_def_seq = NULL;
6276               gsi_next (&si);
6277             }
6278         }                       /* stmts in BB */
6279     }                           /* BBs in loop */
6280
6281   slpeel_make_loop_iterate_ntimes (loop, ratio);
6282
6283   /* Reduce loop iterations by the vectorization factor.  */
6284   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6285                       expected_iterations / vectorization_factor);
6286   loop->nb_iterations_upper_bound
6287     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6288   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6289       && loop->nb_iterations_upper_bound != 0)
6290     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6291   if (loop->any_estimate)
6292     {
6293       loop->nb_iterations_estimate
6294         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6295        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6296            && loop->nb_iterations_estimate != 0)
6297          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6298     }
6299
6300   if (dump_enabled_p ())
6301     {
6302       dump_printf_loc (MSG_NOTE, vect_location,
6303                        "LOOP VECTORIZED\n");
6304       if (loop->inner)
6305         dump_printf_loc (MSG_NOTE, vect_location,
6306                          "OUTER LOOP VECTORIZED\n");
6307       dump_printf (MSG_NOTE, "\n");
6308     }
6309 }