gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
90 /* Loop Vectorization Pass.
92 This pass tries to vectorize loops.
94 For example, the vectorizer transforms the following simple loop:
96 short a[N]; short b[N]; short c[N]; int i;
98 for (i=0; i<N; i++){
99 a[i] = b[i] + c[i];
102 as if it were manually vectorized by rewriting the source code into:
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107 v8hi va, vb, vc;
109 for (i=0; i<N/8; i++){
110 vb = pb[i];
111 vc = pc[i];
112 va = vb + vc;
113 pa[i] = va;
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
128 Analysis phase:
129 ===============
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
150 For example, say stmt S1 was vectorized into stmt VS1:
152 VS1: vb = px[i];
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154 S2: a = b;
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
161 VS1: vb = px[i];
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163 VS2: va = vb;
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
166 Operands that are not SSA_NAMEs are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
169 Target modeling:
170 =================
171 Currently the only target-specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different sizes of vectors will, for now, need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
177 Since we only vectorize operations whose vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt (see the sketch after this comment).
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
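/* Illustrative sketch only (not part of the original file): roughly what
   the optab check described in the comment above looks like for a vector
   addition.  The helper name is made up for this example; the real checks
   of this kind live in the vectorizable_* routines in tree-vect-stmts.c.  */

static bool
example_target_supports_vector_plus (tree vectype)
{
  /* Map the tree code to an optab and query its handler for the vector
     mode; CODE_FOR_nothing means the target has no matching instruction,
     so the stmt cannot be vectorized with this vector type.  */
  optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
  if (!op)
    return false;
  return optab_handler (op, TYPE_MODE (vectype)) != CODE_FOR_nothing;
}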
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
190 /* Function vect_determine_vectorization_factor
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4-byte elements,
195 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
196 elements can fit in a single vector register.
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
201 in the loop.
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
204 original loop:
205 for (i=0; i<N; i++){
206 a[i] = b[i] + c[i];
209 vectorized loop:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220 int nbbs = loop->num_nodes;
221 unsigned int vectorization_factor = 0;
222 tree scalar_type;
223 gphi *phi;
224 tree vectype;
225 unsigned int nunits;
226 stmt_vec_info stmt_info;
227 int i;
228 HOST_WIDE_INT dummy;
229 gimple stmt, pattern_stmt = NULL;
230 gimple_seq pattern_def_seq = NULL;
231 gimple_stmt_iterator pattern_def_si = gsi_none ();
232 bool analyze_pattern_stmt = false;
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "=== vect_determine_vectorization_factor ===\n");
238 for (i = 0; i < nbbs; i++)
240 basic_block bb = bbs[i];
242 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 gsi_next (&si))
245 phi = si.phi ();
246 stmt_info = vinfo_for_stmt (phi);
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 dump_printf (MSG_NOTE, "\n");
254 gcc_assert (stmt_info);
256 if (STMT_VINFO_RELEVANT_P (stmt_info))
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259 scalar_type = TREE_TYPE (PHI_RESULT (phi));
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266 dump_printf (MSG_NOTE, "\n");
269 vectype = get_vectype_for_scalar_type (scalar_type);
270 if (!vectype)
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "not vectorized: unsupported "
276 "data-type ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 scalar_type);
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
281 return false;
283 STMT_VINFO_VECTYPE (stmt_info) = vectype;
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289 dump_printf (MSG_NOTE, "\n");
292 nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295 nunits);
297 if (!vectorization_factor
298 || (nunits > vectorization_factor))
299 vectorization_factor = nunits;
303 for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 !gsi_end_p (si) || analyze_pattern_stmt;)
306 tree vf_vectype;
308 if (analyze_pattern_stmt)
309 stmt = pattern_stmt;
310 else
311 stmt = gsi_stmt (si);
313 stmt_info = vinfo_for_stmt (stmt);
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320 dump_printf (MSG_NOTE, "\n");
323 gcc_assert (stmt_info);
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 && !STMT_VINFO_LIVE_P (stmt_info))
328 || gimple_clobber_p (stmt))
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
335 stmt = pattern_stmt;
336 stmt_info = vinfo_for_stmt (pattern_stmt);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342 dump_printf (MSG_NOTE, "\n");
345 else
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349 gsi_next (&si);
350 continue;
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357 analyze_pattern_stmt = true;
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info))
362 if (pattern_def_seq == NULL)
364 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 pattern_def_si = gsi_start (pattern_def_seq);
367 else if (!gsi_end_p (pattern_def_si))
368 gsi_next (&pattern_def_si);
369 if (pattern_def_seq != NULL)
371 gimple pattern_def_stmt = NULL;
372 stmt_vec_info pattern_def_stmt_info = NULL;
374 while (!gsi_end_p (pattern_def_si))
376 pattern_def_stmt = gsi_stmt (pattern_def_si);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 break;
382 gsi_next (&pattern_def_si);
385 if (!gsi_end_p (pattern_def_si))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392 pattern_def_stmt, 0);
393 dump_printf (MSG_NOTE, "\n");
396 stmt = pattern_def_stmt;
397 stmt_info = pattern_def_stmt_info;
399 else
401 pattern_def_si = gsi_none ();
402 analyze_pattern_stmt = false;
405 else
406 analyze_pattern_stmt = false;
409 if (gimple_get_lhs (stmt) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt)
412 || !gimple_call_internal_p (stmt)
413 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
415 if (is_gimple_call (stmt))
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and the vectorization factor
419 they really need can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
423 pattern_def_seq = NULL;
424 gsi_next (&si);
426 continue;
428 if (dump_enabled_p ())
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 return false;
451 if (STMT_VINFO_VECTYPE (stmt_info))
453 /* The only case when a vectype has already been set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
456 idiom). */
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 || is_pattern_stmt_p (stmt_info)
459 || !gsi_end_p (pattern_def_si));
460 vectype = STMT_VINFO_VECTYPE (stmt_info);
462 else
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 if (is_gimple_call (stmt)
466 && gimple_call_internal_p (stmt)
467 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 else
470 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476 dump_printf (MSG_NOTE, "\n");
478 vectype = get_vectype_for_scalar_type (scalar_type);
479 if (!vectype)
481 if (dump_enabled_p ())
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 "not vectorized: unsupported "
485 "data-type ");
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487 scalar_type);
488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
490 return false;
493 STMT_VINFO_VECTYPE (stmt_info) = vectype;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499 dump_printf (MSG_NOTE, "\n");
503 /* The vectorization factor is determined according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 return true;
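/* Worked example for the function above (illustrative only): on a target
   with 16-byte vectors, a loop whose statements operate on 'short'
   (2-byte) elements gets a V8HI vector type, so nunits is 8 and the loop
   is strip-mined by VF = 8.  The helper below is hypothetical and only
   shows the relation between a scalar type and nunits.  */

static unsigned int
example_nunits_for_scalar_type (tree scalar_type)
{
  /* get_vectype_for_scalar_type picks the vector type for the current
     vector size; its number of elements is what feeds the VF above.  */
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  return vectype ? TYPE_VECTOR_SUBPARTS (vectype) : 0;
}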
586 /* Function vect_is_simple_iv_evolution.
588 FORNOW: A simple evolution of an induction variable in the loop is
589 considered a polynomial evolution. */
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593 tree * step)
595 tree init_expr;
596 tree step_expr;
597 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598 basic_block bb;
600 /* When there is no evolution in this loop, the evolution function
601 is not "simple". */
602 if (evolution_part == NULL_TREE)
603 return false;
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part))
608 return false;
610 step_expr = evolution_part;
611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617 dump_printf (MSG_NOTE, ", init: ");
618 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619 dump_printf (MSG_NOTE, "\n");
622 *init = init_expr;
623 *step = step_expr;
625 if (TREE_CODE (step_expr) != INTEGER_CST
626 && (TREE_CODE (step_expr) != SSA_NAME
627 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 || !flag_associative_math)))
632 && (TREE_CODE (step_expr) != REAL_CST
633 || !flag_associative_math))
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637 "step unknown.\n");
638 return false;
641 return true;
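/* Example for the function above (illustrative): for a loop such as

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function of 'p' is the chrec {p_0, +, 4}_loop, so the
   evolution part is the INTEGER_CST 4 (the step) and the initial
   condition is p_0; such an evolution is "simple".  A step that itself
   varies inside the loop (a chrec of degree >= 2) is rejected.  */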
644 /* Function vect_analyze_scalar_cycles_1.
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
649 enclosing LOOP). */
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
654 basic_block bb = loop->header;
655 tree init, step;
656 auto_vec<gimple, 64> worklist;
657 gphi_iterator gsi;
658 bool double_reduc;
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "=== vect_analyze_scalar_cycles ===\n");
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified, therefore, this order must not be
666 changed. */
667 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
669 gphi *phi = gsi.phi ();
670 tree access_fn = NULL;
671 tree def = PHI_RESULT (phi);
672 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678 dump_printf (MSG_NOTE, "\n");
681 /* Skip virtual phi's. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def))
684 continue;
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
688 /* Analyze the evolution function. */
689 access_fn = analyze_scalar_evolution (loop, def);
690 if (access_fn)
692 STRIP_NOPS (access_fn);
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698 dump_printf (MSG_NOTE, "\n");
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 = evolution_part_in_loop_num (access_fn, loop->num);
704 if (!access_fn
705 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 && TREE_CODE (step) != INTEGER_CST))
709 worklist.safe_push (phi);
710 continue;
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist.length () > 0)
724 gimple phi = worklist.pop ();
725 tree def = PHI_RESULT (phi);
726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727 gimple reduc_stmt;
728 bool nested_cycle;
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734 dump_printf (MSG_NOTE, "\n");
737 gcc_assert (!virtual_operand_p (def)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
740 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 &double_reduc);
743 if (reduc_stmt)
745 if (double_reduc)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected double reduction.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_double_reduction_def;
755 else
757 if (nested_cycle)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE, vect_location,
761 "Detected vectorizable nested cycle.\n");
763 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765 vect_nested_cycle;
767 else
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "Detected reduction.\n");
773 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775 vect_reduction_def;
776 /* Store the reduction cycles for possible vectorization in
777 loop-aware SLP. */
778 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
782 else
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 "Unknown def-use cycle pattern.\n");
790 /* Function vect_analyze_scalar_cycles.
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also for its
796 inner-loop, if it exists.
797 Examples for scalar cycles:
799 Example1: reduction:
801 loop1:
802 for (i=0; i<N; i++)
803 sum += a[i];
805 Example2: induction:
807 loop2:
808 for (i=0; i<N; i++)
809 a[i] = i; */
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
823 vectorizing them.
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
827 if (loop->inner)
828 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
832 /* Function vect_get_loop_niters.
834 Determine how many iterations the loop is executed and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
838 Return the loop exit condition. */
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 tree *number_of_iterationsm1)
845 tree niters;
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE, vect_location,
849 "=== get_loop_niters ===\n");
851 niters = number_of_latch_executions (loop);
852 *number_of_iterationsm1 = niters;
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters && !chrec_contains_undetermined (niters))
859 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 build_int_cst (TREE_TYPE (niters), 1));
861 *number_of_iterations = niters;
863 return get_loop_exit_condition (loop);
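/* Example for the function above (illustrative): for the do-while form
   the vectorizer requires,

     i = 0;
     do { a[i] = b[i]; i++; } while (i < n);      with n > 0,

   the latch edge is taken n - 1 times, so NUMBER_OF_ITERATIONSM1 is
   n - 1 and NUMBER_OF_ITERATIONS (the number of header executions)
   is n.  */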
867 /* Function bb_in_loop_p
869 Used as predicate for dfs order traversal of the loop bbs. */
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
874 const struct loop *const loop = (const struct loop *)data;
875 if (flow_bb_inside_loop_p (loop, bb))
876 return true;
877 return false;
881 /* Function new_loop_vec_info.
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
889 loop_vec_info res;
890 basic_block *bbs;
891 gimple_stmt_iterator si;
892 unsigned int i, nbbs;
894 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895 LOOP_VINFO_LOOP (res) = loop;
897 bbs = get_loop_body (loop);
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i = 0; i < loop->num_nodes; i++)
902 basic_block bb = bbs[i];
904 /* BBs in a nested inner-loop will have been already processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb->loop_father != loop)
913 /* Inner-loop bb. */
914 gcc_assert (loop->inner && bb->loop_father == loop->inner);
915 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
917 gimple phi = gsi_stmt (si);
918 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919 loop_vec_info inner_loop_vinfo =
920 STMT_VINFO_LOOP_VINFO (stmt_info);
921 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
924 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
926 gimple stmt = gsi_stmt (si);
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 loop_vec_info inner_loop_vinfo =
929 STMT_VINFO_LOOP_VINFO (stmt_info);
930 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
934 else
936 /* bb in current nest. */
937 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
939 gimple phi = gsi_stmt (si);
940 gimple_set_uid (phi, 0);
941 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
944 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
946 gimple stmt = gsi_stmt (si);
947 gimple_set_uid (stmt, 0);
948 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would be the same
956 as reversed postorder traversal, so we are safe. */
958 free (bbs);
959 bbs = XCNEWVEC (basic_block, loop->num_nodes);
960 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961 bbs, loop->num_nodes, loop);
962 gcc_assert (nbbs == loop->num_nodes);
964 LOOP_VINFO_BBS (res) = bbs;
965 LOOP_VINFO_NITERSM1 (res) = NULL;
966 LOOP_VINFO_NITERS (res) = NULL;
967 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972 LOOP_VINFO_VECT_FACTOR (res) = 0;
973 LOOP_VINFO_LOOP_NEST (res).create (3);
974 LOOP_VINFO_DATAREFS (res).create (10);
975 LOOP_VINFO_DDRS (res).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981 LOOP_VINFO_GROUPED_STORES (res).create (10);
982 LOOP_VINFO_REDUCTIONS (res).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
991 return res;
995 /* Function destroy_loop_vec_info.
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1003 struct loop *loop;
1004 basic_block *bbs;
1005 int nbbs;
1006 gimple_stmt_iterator si;
1007 int j;
1008 vec<slp_instance> slp_instances;
1009 slp_instance instance;
1010 bool swapped;
1012 if (!loop_vinfo)
1013 return;
1015 loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 bbs = LOOP_VINFO_BBS (loop_vinfo);
1018 nbbs = clean_stmts ? loop->num_nodes : 0;
1019 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1021 for (j = 0; j < nbbs; j++)
1023 basic_block bb = bbs[j];
1024 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025 free_stmt_vec_info (gsi_stmt (si));
1027 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1029 gimple stmt = gsi_stmt (si);
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
1033 if (swapped && is_gimple_assign (stmt))
1035 enum tree_code code = gimple_assign_rhs_code (stmt);
1037 if ((code == PLUS_EXPR
1038 || code == POINTER_PLUS_EXPR
1039 || code == MULT_EXPR)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 swap_ssa_operands (stmt,
1042 gimple_assign_rhs1_ptr (stmt),
1043 gimple_assign_rhs2_ptr (stmt));
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt);
1048 gsi_next (&si);
1052 free (LOOP_VINFO_BBS (loop_vinfo));
1053 vect_destroy_datarefs (loop_vinfo, NULL);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060 vect_free_slp_instance (instance);
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1072 free (loop_vinfo);
1073 loop->aux = NULL;
1077 /* Function vect_analyze_loop_1.
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "===== analyze_loop_nest_1 =====\n");
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1096 loop_vinfo = vect_analyze_loop_form (loop);
1097 if (!loop_vinfo)
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "bad inner-loop form.\n");
1102 return NULL;
1105 return loop_vinfo;
1109 /* Function vect_analyze_loop_form.
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1120 loop_vec_info loop_vinfo;
1121 gcond *loop_cond;
1122 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123 loop_vec_info inner_loop_vinfo = NULL;
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "=== vect_analyze_loop_form ===\n");
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1133 if (!loop->inner)
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
1137 look like this:
1139 (pre-header)
1141 header <--------+
1142 | | |
1143 | +--> latch --+
1145 (exit-bb) */
1147 if (loop->num_nodes != 2)
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "not vectorized: control flow in loop.\n");
1152 return NULL;
1155 if (empty_block_p (loop->header))
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "not vectorized: empty loop.\n");
1160 return NULL;
1163 else
1165 struct loop *innerloop = loop->inner;
1166 edge entryedge;
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1172 (pre-header)
1174 header <---+
1176 inner-loop |
1178 tail ------+
1180 (exit-bb)
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1185 if ((loop->inner)->inner || (loop->inner)->next)
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: multiple nested loops.\n");
1190 return NULL;
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195 if (!inner_loop_vinfo)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: Bad inner loop.\n");
1200 return NULL;
1203 if (!expr_invariant_in_loop_p (loop,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: inner-loop count not"
1209 " invariant.\n");
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (loop->num_nodes != 5)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1223 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224 entryedge = EDGE_PRED (innerloop->header, 0);
1225 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 entryedge = EDGE_PRED (innerloop->header, 1);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo, true);
1236 return NULL;
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Considering outer-loop vectorization.\n");
1244 if (!single_exit (loop)
1245 || EDGE_COUNT (loop->header->preds) != 2)
1247 if (dump_enabled_p ())
1249 if (!single_exit (loop))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop->header->preds) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "not vectorized: too many incoming edges.\n");
1256 if (inner_loop_vinfo)
1257 destroy_loop_vec_info (inner_loop_vinfo, true);
1258 return NULL;
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo)
1272 destroy_loop_vec_info (inner_loop_vinfo, true);
1273 return NULL;
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop)->dest))
1279 edge e = single_exit (loop);
1280 if (!(e->flags & EDGE_ABNORMAL))
1282 split_loop_exit_edge (e);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE, "split exit edge.\n");
1286 else
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo)
1292 destroy_loop_vec_info (inner_loop_vinfo, true);
1293 return NULL;
1297 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 &number_of_iterationsm1);
1299 if (!loop_cond)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo)
1305 destroy_loop_vec_info (inner_loop_vinfo, true);
1306 return NULL;
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations))
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "not vectorized: number of iterations cannot be "
1315 "computed.\n");
1316 if (inner_loop_vinfo)
1317 destroy_loop_vec_info (inner_loop_vinfo, true);
1318 return NULL;
1321 if (integer_zerop (number_of_iterations))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo)
1327 destroy_loop_vec_info (inner_loop_vinfo, true);
1328 return NULL;
1331 loop_vinfo = new_loop_vec_info (loop);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1338 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_NOTE, vect_location,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343 dump_printf (MSG_NOTE, "\n");
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1349 /* CHECKME: May want to keep it around it in the future. */
1350 if (inner_loop_vinfo)
1351 destroy_loop_vec_info (inner_loop_vinfo, false);
1353 gcc_assert (!loop->aux);
1354 loop->aux = loop_vinfo;
1355 return loop_vinfo;
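/* Example for the checks above (illustrative): a counted loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   has a computable number of iterations (n) and an analyzable exit
   condition, whereas a pointer-chasing loop like

     while (p)
       p = p->next;

   has no computable iteration count and is rejected with "number of
   iterations cannot be computed".  */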
1359 /* Function vect_analyze_loop_operations.
1361 Scan the loop stmts and make sure they are all vectorizable. */
1363 static bool
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368 int nbbs = loop->num_nodes;
1369 unsigned int vectorization_factor = 0;
1370 int i;
1371 stmt_vec_info stmt_info;
1372 bool need_to_vectorize = false;
1373 int min_profitable_iters;
1374 int min_scalar_loop_bound;
1375 unsigned int th;
1376 bool only_slp_in_loop = true, ok;
1377 HOST_WIDE_INT max_niter;
1378 HOST_WIDE_INT estimated_niter;
1379 int min_profitable_estimate;
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location,
1383 "=== vect_analyze_loop_operations ===\n");
1385 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387 if (slp)
1389 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390 vectorization factor of the loop is the unrolling factor required by
1391 the SLP instances. If that unrolling factor is 1, we say, that we
1392 perform pure SLP on loop - cross iteration parallelism is not
1393 exploited. */
1394 for (i = 0; i < nbbs; i++)
1396 basic_block bb = bbs[i];
1397 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 gsi_next (&si))
1400 gimple stmt = gsi_stmt (si);
1401 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1403 && STMT_VINFO_RELATED_STMT (stmt_info))
1405 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1406 stmt_info = vinfo_for_stmt (stmt);
1408 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1409 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1410 && !PURE_SLP_STMT (stmt_info))
1411 /* STMT needs both SLP and loop-based vectorization. */
1412 only_slp_in_loop = false;
1416 if (only_slp_in_loop)
1417 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1418 else
1419 vectorization_factor = least_common_multiple (vectorization_factor,
1420 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1422 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE, vect_location,
1425 "Updating vectorization factor to %d\n",
1426 vectorization_factor);
1429 for (i = 0; i < nbbs; i++)
1431 basic_block bb = bbs[i];
1433 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1434 gsi_next (&si))
1436 gphi *phi = si.phi ();
1437 ok = true;
1439 stmt_info = vinfo_for_stmt (phi);
1440 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1443 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1444 dump_printf (MSG_NOTE, "\n");
1447 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1448 (i.e., a phi in the tail of the outer-loop). */
1449 if (! is_loop_header_bb_p (bb))
1451 /* FORNOW: we currently don't support the case that these phis
1452 are not used in the outer-loop (unless it is a double reduction,
1453 i.e., this phi is vect_reduction_def), because this case
1454 would require us to actually do something here.
1455 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1456 || STMT_VINFO_LIVE_P (stmt_info))
1457 && STMT_VINFO_DEF_TYPE (stmt_info)
1458 != vect_double_reduction_def)
1460 if (dump_enabled_p ())
1461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1462 "Unsupported loop-closed phi in "
1463 "outer-loop.\n");
1464 return false;
1467 /* If PHI is used in the outer loop, we check that its operand
1468 is defined in the inner loop. */
1469 if (STMT_VINFO_RELEVANT_P (stmt_info))
1471 tree phi_op;
1472 gimple op_def_stmt;
1474 if (gimple_phi_num_args (phi) != 1)
1475 return false;
1477 phi_op = PHI_ARG_DEF (phi, 0);
1478 if (TREE_CODE (phi_op) != SSA_NAME)
1479 return false;
1481 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1482 if (gimple_nop_p (op_def_stmt)
1483 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1484 || !vinfo_for_stmt (op_def_stmt))
1485 return false;
1487 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1488 != vect_used_in_outer
1489 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1490 != vect_used_in_outer_by_reduction)
1491 return false;
1494 continue;
1497 gcc_assert (stmt_info);
1499 if (STMT_VINFO_LIVE_P (stmt_info))
1501 /* FORNOW: not yet supported. */
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: value used after loop.\n");
1505 return false;
1508 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1509 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1511 /* A scalar-dependence cycle that we don't support. */
1512 if (dump_enabled_p ())
1513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1514 "not vectorized: scalar dependence cycle.\n");
1515 return false;
1518 if (STMT_VINFO_RELEVANT_P (stmt_info))
1520 need_to_vectorize = true;
1521 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1522 ok = vectorizable_induction (phi, NULL, NULL);
1525 if (!ok)
1527 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: relevant phi not "
1531 "supported: ");
1532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1535 return false;
1539 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1540 gsi_next (&si))
1542 gimple stmt = gsi_stmt (si);
1543 if (!gimple_clobber_p (stmt)
1544 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1545 return false;
1547 } /* bbs */
1549 /* All operations in the loop are either irrelevant (deal with loop
1550 control, or dead), or only used outside the loop and can be moved
1551 out of the loop (e.g. invariants, inductions). The loop can be
1552 optimized away by scalar optimizations. We're better off not
1553 touching this loop. */
1554 if (!need_to_vectorize)
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "All the computation can be taken out of the loop.\n");
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561 "not vectorized: redundant loop. no profit to "
1562 "vectorize.\n");
1563 return false;
1566 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "vectorization_factor = %d, niters = "
1569 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1570 LOOP_VINFO_INT_NITERS (loop_vinfo));
1572 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1573 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1574 || ((max_niter = max_stmt_executions_int (loop)) != -1
1575 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: iteration count too small.\n");
1580 if (dump_enabled_p ())
1581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1582 "not vectorized: iteration count smaller than "
1583 "vectorization factor.\n");
1584 return false;
1587 /* Analyze cost. Decide if worth while to vectorize. */
1589 /* Once VF is set, SLP costs should be updated since the number of created
1590 vector stmts depends on VF. */
1591 vect_update_slp_costs_according_to_vf (loop_vinfo);
1593 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1594 &min_profitable_estimate);
1595 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1597 if (min_profitable_iters < 0)
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: vectorization not profitable.\n");
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1604 "not vectorized: vector version will never be "
1605 "profitable.\n");
1606 return false;
1609 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1610 * vectorization_factor) - 1);
1613 /* Use the cost model only if it is more conservative than the user-specified
1614 threshold. */
1616 th = (unsigned) min_scalar_loop_bound;
1617 if (min_profitable_iters
1618 && (!min_scalar_loop_bound
1619 || min_profitable_iters > min_scalar_loop_bound))
1620 th = (unsigned) min_profitable_iters;
1622 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1624 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1625 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1629 "not vectorized: vectorization not profitable.\n");
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_NOTE, vect_location,
1632 "not vectorized: iteration count smaller than user "
1633 "specified loop bound parameter or minimum profitable "
1634 "iterations (whichever is more conservative).\n");
1635 return false;
1638 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1639 && ((unsigned HOST_WIDE_INT) estimated_niter
1640 <= MAX (th, (unsigned)min_profitable_estimate)))
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: estimated iteration count too "
1645 "small.\n");
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_NOTE, vect_location,
1648 "not vectorized: estimated iteration count smaller "
1649 "than specified loop bound parameter or minimum "
1650 "profitable iterations (whichever is more "
1651 "conservative).\n");
1652 return false;
1655 return true;
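/* Worked example for the threshold logic above (illustrative, with made-up
   numbers): with --param min-vect-loop-bound=2 and a vectorization factor
   of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7.  If the cost model
   computes min_profitable_iters = 10, the more conservative value wins,
   so th = 10 and a loop known to execute at most 10 iterations is
   rejected as not profitable to vectorize.  */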
1659 /* Function vect_analyze_loop_2.
1661 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1662 for it. The different analyses will record information in the
1663 loop_vec_info struct. */
1664 static bool
1665 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1667 bool ok, slp = false;
1668 int max_vf = MAX_VECTORIZATION_FACTOR;
1669 int min_vf = 2;
1670 unsigned int th;
1671 unsigned int n_stmts = 0;
1673 /* Find all data references in the loop (which correspond to vdefs/vuses)
1674 and analyze their evolution in the loop. Also adjust the minimal
1675 vectorization factor according to the loads and stores.
1677 FORNOW: Handle only simple array references whose
1678 alignment can be forced, and aligned pointer-references. */
1680 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1681 if (!ok)
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "bad data references.\n");
1686 return false;
1689 /* Classify all cross-iteration scalar data-flow cycles.
1690 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1692 vect_analyze_scalar_cycles (loop_vinfo);
1694 vect_pattern_recog (loop_vinfo, NULL);
1696 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1697 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1699 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1700 if (!ok)
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "bad data access.\n");
1705 return false;
1708 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1710 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1711 if (!ok)
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715 "unexpected pattern.\n");
1716 return false;
1719 /* Analyze data dependences between the data-refs in the loop
1720 and adjust the maximum vectorization factor according to
1721 the dependences.
1722 FORNOW: fail at the first data dependence that we encounter. */
1724 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1725 if (!ok
1726 || max_vf < min_vf)
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "bad data dependence.\n");
1731 return false;
1734 ok = vect_determine_vectorization_factor (loop_vinfo);
1735 if (!ok)
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1739 "can't determine vectorization factor.\n");
1740 return false;
1742 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746 "bad data dependence.\n");
1747 return false;
1750 /* Analyze the alignment of the data-refs in the loop.
1751 Fail if a data reference is found that cannot be vectorized. */
1753 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1754 if (!ok)
1756 if (dump_enabled_p ())
1757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758 "bad data alignment.\n");
1759 return false;
1762 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1763 It is important to call pruning after vect_analyze_data_ref_accesses,
1764 since we use grouping information gathered by interleaving analysis. */
1765 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1766 if (!ok)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770 "number of versioning for alias "
1771 "run-time tests exceeds %d "
1772 "(--param vect-max-version-for-alias-checks)\n",
1773 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1774 return false;
1777 /* This pass will decide on using loop versioning and/or loop peeling in
1778 order to enhance the alignment of data references in the loop. */
1780 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1781 if (!ok)
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "bad data alignment.\n");
1786 return false;
1789 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1790 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1791 if (ok)
1793 /* Decide which possible SLP instances to SLP. */
1794 slp = vect_make_slp_decision (loop_vinfo);
1796 /* Find stmts that need to be both vectorized and SLPed. */
1797 vect_detect_hybrid_slp (loop_vinfo);
1799 else
1800 return false;
1802 /* Scan all the operations in the loop and make sure they are
1803 vectorizable. */
1805 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1806 if (!ok)
1808 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1810 "bad operation or unsupported loop bound.\n");
1811 return false;
1814 /* Decide whether we need to create an epilogue loop to handle
1815 remaining scalar iterations. */
1816 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1817 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1818 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1820 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1821 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1823 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1824 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1825 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1826 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1828 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1829 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1830 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1831 /* In case of versioning, check if the maximum number of
1832 iterations is greater than th. If they are identical,
1833 the epilogue is unnecessary. */
1834 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1835 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1836 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1837 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1838 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1840 /* If an epilogue loop is required make sure we can create one. */
1841 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1842 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1846 if (!vect_can_advance_ivs_p (loop_vinfo)
1847 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1848 single_exit (LOOP_VINFO_LOOP
1849 (loop_vinfo))))
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: can't create required "
1854 "epilog loop\n");
1855 return false;
1859 return true;
1862 /* Function vect_analyze_loop.
1864 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865 for it. The different analyses will record information in the
1866 loop_vec_info struct. */
1867 loop_vec_info
1868 vect_analyze_loop (struct loop *loop)
1870 loop_vec_info loop_vinfo;
1871 unsigned int vector_sizes;
1873 /* Autodetect first vector size we try. */
1874 current_vector_size = 0;
1875 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "===== analyze_loop_nest =====\n");
1881 if (loop_outer (loop)
1882 && loop_vec_info_for_loop (loop_outer (loop))
1883 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "outer-loop already vectorized.\n");
1888 return NULL;
1891 while (1)
1893 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1894 loop_vinfo = vect_analyze_loop_form (loop);
1895 if (!loop_vinfo)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "bad loop form.\n");
1900 return NULL;
1903 if (vect_analyze_loop_2 (loop_vinfo))
1905 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1907 return loop_vinfo;
1910 destroy_loop_vec_info (loop_vinfo, true);
1912 vector_sizes &= ~current_vector_size;
1913 if (vector_sizes == 0
1914 || current_vector_size == 0)
1915 return NULL;
1917 /* Try the next biggest vector size. */
1918 current_vector_size = 1 << floor_log2 (vector_sizes);
1919 if (dump_enabled_p ())
1920 dump_printf_loc (MSG_NOTE, vect_location,
1921 "***** Re-trying analysis with "
1922 "vector size %d\n", current_vector_size);
1927 /* Function reduction_code_for_scalar_code
1929 Input:
1930 CODE - tree_code of a reduction operation.
1932 Output:
1933 REDUC_CODE - the corresponding tree-code to be used to reduce the
1934 vector of partial results into a single scalar result, or ERROR_MARK
1935 if the operation is a supported reduction operation, but does not have
1936 such a tree-code.
1938 Return FALSE if CODE currently cannot be vectorized as reduction. */
1940 static bool
1941 reduction_code_for_scalar_code (enum tree_code code,
1942 enum tree_code *reduc_code)
1944 switch (code)
1946 case MAX_EXPR:
1947 *reduc_code = REDUC_MAX_EXPR;
1948 return true;
1950 case MIN_EXPR:
1951 *reduc_code = REDUC_MIN_EXPR;
1952 return true;
1954 case PLUS_EXPR:
1955 *reduc_code = REDUC_PLUS_EXPR;
1956 return true;
1958 case MULT_EXPR:
1959 case MINUS_EXPR:
1960 case BIT_IOR_EXPR:
1961 case BIT_XOR_EXPR:
1962 case BIT_AND_EXPR:
1963 *reduc_code = ERROR_MARK;
1964 return true;
1966 default:
1967 return false;
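/* Example (illustrative): a sum reduction uses PLUS_EXPR in the loop body
   and maps to REDUC_PLUS_EXPR here, which the epilogue uses to fold the
   vector of partial sums into one scalar; MULT_EXPR is accepted but gets
   ERROR_MARK, so its epilogue falls back to shifts or extracts.  */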
1972 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1973 STMT is printed with a message MSG. */
1975 static void
1976 report_vect_op (int msg_type, gimple stmt, const char *msg)
1978 dump_printf_loc (msg_type, vect_location, "%s", msg);
1979 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1980 dump_printf (msg_type, "\n");
1984 /* Detect SLP reduction of the form:
1986 #a1 = phi <a5, a0>
1987 a2 = operation (a1)
1988 a3 = operation (a2)
1989 a4 = operation (a3)
1990 a5 = operation (a4)
1992 #a = phi <a5>
1994 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1995 FIRST_STMT is the first reduction stmt in the chain
1996 (a2 = operation (a1)).
1998 Return TRUE if a reduction chain was detected. */
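/* A source-level example of such a chain (illustrative only):

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into four dependent additions per iteration feeding the
   reduction PHI.  */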
2000 static bool
2001 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2003 struct loop *loop = (gimple_bb (phi))->loop_father;
2004 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2005 enum tree_code code;
2006 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2007 stmt_vec_info use_stmt_info, current_stmt_info;
2008 tree lhs;
2009 imm_use_iterator imm_iter;
2010 use_operand_p use_p;
2011 int nloop_uses, size = 0, n_out_of_loop_uses;
2012 bool found = false;
2014 if (loop != vect_loop)
2015 return false;
2017 lhs = PHI_RESULT (phi);
2018 code = gimple_assign_rhs_code (first_stmt);
2019 while (1)
2021 nloop_uses = 0;
2022 n_out_of_loop_uses = 0;
2023 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2025 gimple use_stmt = USE_STMT (use_p);
2026 if (is_gimple_debug (use_stmt))
2027 continue;
2029 /* Check if we got back to the reduction phi. */
2030 if (use_stmt == phi)
2032 loop_use_stmt = use_stmt;
2033 found = true;
2034 break;
2037 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2039 loop_use_stmt = use_stmt;
2040 nloop_uses++;
2042 else
2043 n_out_of_loop_uses++;
2045 /* There can be either a single use in the loop or two uses in
2046 phi nodes. */
2047 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2048 return false;
2051 if (found)
2052 break;
2054 /* We reached a statement with no loop uses. */
2055 if (nloop_uses == 0)
2056 return false;
2058 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2059 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2060 return false;
2062 if (!is_gimple_assign (loop_use_stmt)
2063 || code != gimple_assign_rhs_code (loop_use_stmt)
2064 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2065 return false;
2067 /* Insert USE_STMT into reduction chain. */
2068 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2069 if (current_stmt)
2071 current_stmt_info = vinfo_for_stmt (current_stmt);
2072 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2073 GROUP_FIRST_ELEMENT (use_stmt_info)
2074 = GROUP_FIRST_ELEMENT (current_stmt_info);
2076 else
2077 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2079 lhs = gimple_assign_lhs (loop_use_stmt);
2080 current_stmt = loop_use_stmt;
2081 size++;
2084 if (!found || loop_use_stmt != phi || size < 2)
2085 return false;
2087 /* Swap the operands, if needed, to make the reduction operand be the second
2088 operand. */
2089 lhs = PHI_RESULT (phi);
2090 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2091 while (next_stmt)
2093 if (gimple_assign_rhs2 (next_stmt) == lhs)
2095 tree op = gimple_assign_rhs1 (next_stmt);
2096 gimple def_stmt = NULL;
2098 if (TREE_CODE (op) == SSA_NAME)
2099 def_stmt = SSA_NAME_DEF_STMT (op);
2101 /* Check that the other def is either defined in the loop
2102 ("vect_internal_def"), or it's an induction (defined by a
2103 loop-header phi-node). */
2104 if (def_stmt
2105 && gimple_bb (def_stmt)
2106 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2107 && (is_gimple_assign (def_stmt)
2108 || is_gimple_call (def_stmt)
2109 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2110 == vect_induction_def
2111 || (gimple_code (def_stmt) == GIMPLE_PHI
2112 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2113 == vect_internal_def
2114 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2116 lhs = gimple_assign_lhs (next_stmt);
2117 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2118 continue;
2121 return false;
2123 else
2125 tree op = gimple_assign_rhs2 (next_stmt);
2126 gimple def_stmt = NULL;
2128 if (TREE_CODE (op) == SSA_NAME)
2129 def_stmt = SSA_NAME_DEF_STMT (op);
2131 /* Check that the other def is either defined in the loop
2132 ("vect_internal_def"), or it's an induction (defined by a
2133 loop-header phi-node). */
2134 if (def_stmt
2135 && gimple_bb (def_stmt)
2136 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2137 && (is_gimple_assign (def_stmt)
2138 || is_gimple_call (def_stmt)
2139 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2140 == vect_induction_def
2141 || (gimple_code (def_stmt) == GIMPLE_PHI
2142 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2143 == vect_internal_def
2144 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2149 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2150 dump_printf (MSG_NOTE, "\n");
2153 swap_ssa_operands (next_stmt,
2154 gimple_assign_rhs1_ptr (next_stmt),
2155 gimple_assign_rhs2_ptr (next_stmt));
2156 update_stmt (next_stmt);
2158 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2159 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2161 else
2162 return false;
2165 lhs = gimple_assign_lhs (next_stmt);
2166 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2169 /* Save the chain for further analysis in SLP detection. */
2170 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2171 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2172 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2174 return true;
2178 /* Function vect_is_simple_reduction_1
2180 (1) Detect a cross-iteration def-use cycle that represents a simple
2181 reduction computation. We look for the following pattern:
2183 loop_header:
2184 a1 = phi < a0, a2 >
2185 a3 = ...
2186 a2 = operation (a3, a1)
2190 a3 = ...
2191 loop_header:
2192 a1 = phi < a0, a2 >
2193 a2 = operation (a3, a1)
2195 such that:
2196 1. operation is commutative and associative and it is safe to
2197 change the order of the computation (if CHECK_REDUCTION is true)
2198 2. no uses for a2 in the loop (a2 is used out of the loop)
2199 3. no uses of a1 in the loop besides the reduction operation
2200 4. no uses of a1 outside the loop.
2202 Conditions 1,4 are tested here.
2203 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2205 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2206 nested cycles, if CHECK_REDUCTION is false.
2208 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2209 reductions:
2211 a1 = phi < a0, a2 >
2212 inner loop (def of a3)
2213 a2 = phi < a3 >
2215 If MODIFY is true it tries also to rework the code in-place to enable
2216 detection of more reduction patterns. For the time being we rewrite
2217 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2220 static gimple
2221 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2222 bool check_reduction, bool *double_reduc,
2223 bool modify)
2225 struct loop *loop = (gimple_bb (phi))->loop_father;
2226 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2227 edge latch_e = loop_latch_edge (loop);
2228 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2229 gimple def_stmt, def1 = NULL, def2 = NULL;
2230 enum tree_code orig_code, code;
2231 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2232 tree type;
2233 int nloop_uses;
2234 tree name;
2235 imm_use_iterator imm_iter;
2236 use_operand_p use_p;
2237 bool phi_def;
2239 *double_reduc = false;
2241 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2242 otherwise, we assume outer loop vectorization. */
2243 gcc_assert ((check_reduction && loop == vect_loop)
2244 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2246 name = PHI_RESULT (phi);
2247 /* ??? If there are no uses of the PHI result the inner loop reduction
2248 won't be detected as possibly double-reduction by vectorizable_reduction
2249 because that tries to walk the PHI arg from the preheader edge which
2250 can be constant. See PR60382. */
2251 if (has_zero_uses (name))
2252 return NULL;
2253 nloop_uses = 0;
2254 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2256 gimple use_stmt = USE_STMT (use_p);
2257 if (is_gimple_debug (use_stmt))
2258 continue;
2260 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "intermediate value used outside loop.\n");
2266 return NULL;
2269 nloop_uses++;
2270 if (nloop_uses > 1)
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "reduction used in loop.\n");
2275 return NULL;
2279 if (TREE_CODE (loop_arg) != SSA_NAME)
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2284 "reduction: not ssa_name: ");
2285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2286 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2288 return NULL;
2291 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2292 if (!def_stmt)
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "reduction: no def_stmt.\n");
2297 return NULL;
2300 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2302 if (dump_enabled_p ())
2304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2305 dump_printf (MSG_NOTE, "\n");
2307 return NULL;
2310 if (is_gimple_assign (def_stmt))
2312 name = gimple_assign_lhs (def_stmt);
2313 phi_def = false;
2315 else
2317 name = PHI_RESULT (def_stmt);
2318 phi_def = true;
2321 nloop_uses = 0;
2322 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2324 gimple use_stmt = USE_STMT (use_p);
2325 if (is_gimple_debug (use_stmt))
2326 continue;
2327 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2328 nloop_uses++;
2329 if (nloop_uses > 1)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "reduction used in loop.\n");
2334 return NULL;
2338 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2339 defined in the inner loop. */
2340 if (phi_def)
2342 op1 = PHI_ARG_DEF (def_stmt, 0);
2344 if (gimple_phi_num_args (def_stmt) != 1
2345 || TREE_CODE (op1) != SSA_NAME)
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "unsupported phi node definition.\n");
2351 return NULL;
2354 def1 = SSA_NAME_DEF_STMT (op1);
2355 if (gimple_bb (def1)
2356 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2357 && loop->inner
2358 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2359 && is_gimple_assign (def1))
2361 if (dump_enabled_p ())
2362 report_vect_op (MSG_NOTE, def_stmt,
2363 "detected double reduction: ");
2365 *double_reduc = true;
2366 return def_stmt;
2369 return NULL;
2372 code = orig_code = gimple_assign_rhs_code (def_stmt);
2374 /* We can handle "res -= x[i]", which is non-associative by
2375 simply rewriting this into "res += -x[i]". Avoid changing
2376 gimple instruction for the first simple tests and only do this
2377 if we're allowed to change code at all. */
2378 if (code == MINUS_EXPR
2379 && modify
2380 && (op1 = gimple_assign_rhs1 (def_stmt))
2381 && TREE_CODE (op1) == SSA_NAME
2382 && SSA_NAME_DEF_STMT (op1) == phi)
2383 code = PLUS_EXPR;
2385 if (check_reduction
2386 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2388 if (dump_enabled_p ())
2389 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2390 "reduction: not commutative/associative: ");
2391 return NULL;
2394 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2396 if (code != COND_EXPR)
2398 if (dump_enabled_p ())
2399 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2400 "reduction: not binary operation: ");
2402 return NULL;
2405 op3 = gimple_assign_rhs1 (def_stmt);
2406 if (COMPARISON_CLASS_P (op3))
2408 op4 = TREE_OPERAND (op3, 1);
2409 op3 = TREE_OPERAND (op3, 0);
2412 op1 = gimple_assign_rhs2 (def_stmt);
2413 op2 = gimple_assign_rhs3 (def_stmt);
2415 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2417 if (dump_enabled_p ())
2418 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2419 "reduction: uses not ssa_names: ");
2421 return NULL;
2424 else
2426 op1 = gimple_assign_rhs1 (def_stmt);
2427 op2 = gimple_assign_rhs2 (def_stmt);
2429 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2433 "reduction: uses not ssa_names: ");
2435 return NULL;
2439 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2440 if ((TREE_CODE (op1) == SSA_NAME
2441 && !types_compatible_p (type,TREE_TYPE (op1)))
2442 || (TREE_CODE (op2) == SSA_NAME
2443 && !types_compatible_p (type, TREE_TYPE (op2)))
2444 || (op3 && TREE_CODE (op3) == SSA_NAME
2445 && !types_compatible_p (type, TREE_TYPE (op3)))
2446 || (op4 && TREE_CODE (op4) == SSA_NAME
2447 && !types_compatible_p (type, TREE_TYPE (op4))))
2449 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE, vect_location,
2452 "reduction: multiple types: operation type: ");
2453 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2454 dump_printf (MSG_NOTE, ", operands types: ");
2455 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2456 TREE_TYPE (op1));
2457 dump_printf (MSG_NOTE, ",");
2458 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459 TREE_TYPE (op2));
2460 if (op3)
2462 dump_printf (MSG_NOTE, ",");
2463 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2464 TREE_TYPE (op3));
2467 if (op4)
2469 dump_printf (MSG_NOTE, ",");
2470 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2471 TREE_TYPE (op4));
2473 dump_printf (MSG_NOTE, "\n");
2476 return NULL;
2479 /* Check that it's ok to change the order of the computation.
2480 Generally, when vectorizing a reduction we change the order of the
2481 computation. This may change the behavior of the program in some
2482 cases, so we need to check that this is ok. One exception is when
2483 vectorizing an outer-loop: the inner-loop is executed sequentially,
2484 and therefore vectorizing reductions in the inner-loop during
2485 outer-loop vectorization is safe. */
2487 /* CHECKME: check for !flag_finite_math_only too? */
2488 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2489 && check_reduction)
2491 /* Changing the order of operations changes the semantics. */
2492 if (dump_enabled_p ())
2493 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2494 "reduction: unsafe fp math optimization: ");
2495 return NULL;
2497 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2498 && check_reduction)
2500 /* Changing the order of operations changes the semantics. */
2501 if (dump_enabled_p ())
2502 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2503 "reduction: unsafe int math optimization: ");
2504 return NULL;
2506 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2508 /* Changing the order of operations changes the semantics. */
2509 if (dump_enabled_p ())
2510 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2511 "reduction: unsafe fixed-point math optimization: ");
2512 return NULL;
2515 /* If we detected "res -= x[i]" earlier, rewrite it into
2516 "res += -x[i]" now. If this turns out to be useless reassoc
2517 will clean it up again. */
2518 if (orig_code == MINUS_EXPR)
2520 tree rhs = gimple_assign_rhs2 (def_stmt);
2521 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2522 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2523 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2524 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2525 loop_info, NULL));
2526 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2527 gimple_assign_set_rhs2 (def_stmt, negrhs);
2528 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2529 update_stmt (def_stmt);
2532 /* Reduction is safe. We're dealing with one of the following:
2533 1) integer arithmetic and no trapv
2534 2) floating point arithmetic, and special flags permit this optimization
2535 3) nested cycle (i.e., outer loop vectorization). */
2536 if (TREE_CODE (op1) == SSA_NAME)
2537 def1 = SSA_NAME_DEF_STMT (op1);
2539 if (TREE_CODE (op2) == SSA_NAME)
2540 def2 = SSA_NAME_DEF_STMT (op2);
2542 if (code != COND_EXPR
2543 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2545 if (dump_enabled_p ())
2546 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2547 return NULL;
2550 /* Check that one def is the reduction def, defined by PHI,
2551 the other def is either defined in the loop ("vect_internal_def"),
2552 or it's an induction (defined by a loop-header phi-node). */
2554 if (def2 && def2 == phi
2555 && (code == COND_EXPR
2556 || !def1 || gimple_nop_p (def1)
2557 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2558 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2559 && (is_gimple_assign (def1)
2560 || is_gimple_call (def1)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2562 == vect_induction_def
2563 || (gimple_code (def1) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2568 if (dump_enabled_p ())
2569 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2570 return def_stmt;
2573 if (def1 && def1 == phi
2574 && (code == COND_EXPR
2575 || !def2 || gimple_nop_p (def2)
2576 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2577 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2578 && (is_gimple_assign (def2)
2579 || is_gimple_call (def2)
2580 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2581 == vect_induction_def
2582 || (gimple_code (def2) == GIMPLE_PHI
2583 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584 == vect_internal_def
2585 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2587 if (check_reduction)
2589 /* Swap operands (just for simplicity - so that the rest of the code
2590 can assume that the reduction variable is always the last (second)
2591 argument). */
2592 if (dump_enabled_p ())
2593 report_vect_op (MSG_NOTE, def_stmt,
2594 "detected reduction: need to swap operands: ");
2596 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2597 gimple_assign_rhs2_ptr (def_stmt));
2599 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2600 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2602 else
2604 if (dump_enabled_p ())
2605 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2608 return def_stmt;
2611 /* Try to find SLP reduction chain. */
2612 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2614 if (dump_enabled_p ())
2615 report_vect_op (MSG_NOTE, def_stmt,
2616 "reduction: detected reduction chain: ");
2618 return def_stmt;
2621 if (dump_enabled_p ())
2622 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2623 "reduction: unknown pattern: ");
2625 return NULL;
2628 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2629 in-place. Arguments as there. */
2631 static gimple
2632 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2633 bool check_reduction, bool *double_reduc)
2635 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2636 double_reduc, false);
2639 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2640 in-place if it enables detection of more reductions. Arguments
2641 as there. */
2643 gimple
2644 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2645 bool check_reduction, bool *double_reduc)
2647 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2648 double_reduc, true);
2651 /* Calculate the cost of one scalar iteration of the loop. */
2652 static int
2653 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2654 stmt_vector_for_cost *scalar_cost_vec)
2656 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2657 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2658 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2659 int innerloop_iters, i;
2661 /* Count statements in scalar loop. Using this as scalar cost for a single
2662 iteration for now.
2664 TODO: Add outer loop support.
2666 TODO: Consider assigning different costs to different scalar
2667 statements. */
2669 /* FORNOW. */
2670 innerloop_iters = 1;
2671 if (loop->inner)
2672 innerloop_iters = 50; /* FIXME */
2674 for (i = 0; i < nbbs; i++)
2676 gimple_stmt_iterator si;
2677 basic_block bb = bbs[i];
2679 if (bb->loop_father == loop->inner)
2680 factor = innerloop_iters;
2681 else
2682 factor = 1;
2684 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2686 gimple stmt = gsi_stmt (si);
2687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2689 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2690 continue;
2692 /* Skip stmts that are not vectorized inside the loop. */
2693 if (stmt_info
2694 && !STMT_VINFO_RELEVANT_P (stmt_info)
2695 && (!STMT_VINFO_LIVE_P (stmt_info)
2696 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2697 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2698 continue;
2700 vect_cost_for_stmt kind;
2701 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2703 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2704 kind = scalar_load;
2705 else
2706 kind = scalar_store;
2708 else
2709 kind = scalar_stmt;
2711 scalar_single_iter_cost
2712 += record_stmt_cost (scalar_cost_vec, factor, kind,
2713 NULL, 0, vect_prologue);
2716 return scalar_single_iter_cost;
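/* Illustrative costing (assumed loop body): one load, one store and two
   arithmetic statements with factor 1 are recorded as one scalar_load, one
   scalar_store and two scalar_stmt entries in SCALAR_COST_VEC, and the sum
   of their costs is returned.  */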
2719 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2720 int
2721 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2722 int *peel_iters_epilogue,
2723 stmt_vector_for_cost *scalar_cost_vec,
2724 stmt_vector_for_cost *prologue_cost_vec,
2725 stmt_vector_for_cost *epilogue_cost_vec)
2727 int retval = 0;
2728 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2730 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2732 *peel_iters_epilogue = vf/2;
2733 if (dump_enabled_p ())
2734 dump_printf_loc (MSG_NOTE, vect_location,
2735 "cost model: epilogue peel iters set to vf/2 "
2736 "because loop iterations are unknown .\n");
2738 /* If peeled iterations are known but the number of scalar loop
2739 iterations is unknown, count a taken branch per peeled loop.  */
2740 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2741 NULL, 0, vect_prologue);
2742 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2743 NULL, 0, vect_epilogue);
2745 else
2747 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2748 peel_iters_prologue = niters < peel_iters_prologue ?
2749 niters : peel_iters_prologue;
2750 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2751 /* If we need to peel for gaps, but no peeling is required, we have to
2752 peel VF iterations. */
2753 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2754 *peel_iters_epilogue = vf;
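/* Illustrative arithmetic (assumed values): niters = 100,
   peel_iters_prologue = 3 and vf = 4 give *peel_iters_epilogue
   = (100 - 3) % 4 = 1; had that been 0 while peeling for gaps is
   required, it would be bumped to vf.  */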
2757 stmt_info_for_cost *si;
2758 int j;
2759 if (peel_iters_prologue)
2760 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2761 retval += record_stmt_cost (prologue_cost_vec,
2762 si->count * peel_iters_prologue,
2763 si->kind, NULL, si->misalign,
2764 vect_prologue);
2765 if (*peel_iters_epilogue)
2766 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2767 retval += record_stmt_cost (epilogue_cost_vec,
2768 si->count * *peel_iters_epilogue,
2769 si->kind, NULL, si->misalign,
2770 vect_epilogue);
2772 return retval;
2775 /* Function vect_estimate_min_profitable_iters
2777 Return the number of iterations required for the vector version of the
2778 loop to be profitable relative to the cost of the scalar version of the
2779 loop. */
2781 static void
2782 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2783 int *ret_min_profitable_niters,
2784 int *ret_min_profitable_estimate)
2786 int min_profitable_iters;
2787 int min_profitable_estimate;
2788 int peel_iters_prologue;
2789 int peel_iters_epilogue;
2790 unsigned vec_inside_cost = 0;
2791 int vec_outside_cost = 0;
2792 unsigned vec_prologue_cost = 0;
2793 unsigned vec_epilogue_cost = 0;
2794 int scalar_single_iter_cost = 0;
2795 int scalar_outside_cost = 0;
2796 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2797 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2798 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2800 /* Cost model disabled. */
2801 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2803 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2804 *ret_min_profitable_niters = 0;
2805 *ret_min_profitable_estimate = 0;
2806 return;
2809 /* Requires loop versioning tests to handle misalignment. */
2810 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2812 /* FIXME: Make cost depend on complexity of individual check. */
2813 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2814 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2815 vect_prologue);
2816 dump_printf (MSG_NOTE,
2817 "cost model: Adding cost of checks for loop "
2818 "versioning to treat misalignment.\n");
2821 /* Requires loop versioning with alias checks. */
2822 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2824 /* FIXME: Make cost depend on complexity of individual check. */
2825 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2826 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2827 vect_prologue);
2828 dump_printf (MSG_NOTE,
2829 "cost model: Adding cost of checks for loop "
2830 "versioning aliasing.\n");
2833 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2834 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2835 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2836 vect_prologue);
2838 /* Count statements in scalar loop. Using this as scalar cost for a single
2839 iteration for now.
2841 TODO: Add outer loop support.
2843 TODO: Consider assigning different costs to different scalar
2844 statements. */
2846 auto_vec<stmt_info_for_cost> scalar_cost_vec;
2847 scalar_single_iter_cost
2848 = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2850 /* Add additional cost for the peeled instructions in prologue and epilogue
2851 loop.
2853 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2854 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2856 TODO: Build an expression that represents peel_iters for prologue and
2857 epilogue to be used in a run-time test. */
2859 if (npeel < 0)
2861 peel_iters_prologue = vf/2;
2862 dump_printf (MSG_NOTE, "cost model: "
2863 "prologue peel iters set to vf/2.\n");
2865 /* If peeling for alignment is unknown, loop bound of main loop becomes
2866 unknown. */
2867 peel_iters_epilogue = vf/2;
2868 dump_printf (MSG_NOTE, "cost model: "
2869 "epilogue peel iters set to vf/2 because "
2870 "peeling for alignment is unknown.\n");
2872 /* If peeled iterations are unknown, count a taken branch and a not taken
2873 branch per peeled loop. Even if scalar loop iterations are known,
2874 vector iterations are not known since peeled prologue iterations are
2875 not known. Hence guards remain the same. */
2876 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2877 NULL, 0, vect_prologue);
2878 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2879 NULL, 0, vect_prologue);
2880 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2881 NULL, 0, vect_epilogue);
2882 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2883 NULL, 0, vect_epilogue);
2884 stmt_info_for_cost *si;
2885 int j;
2886 FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2888 struct _stmt_vec_info *stmt_info
2889 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2890 (void) add_stmt_cost (target_cost_data,
2891 si->count * peel_iters_prologue,
2892 si->kind, stmt_info, si->misalign,
2893 vect_prologue);
2894 (void) add_stmt_cost (target_cost_data,
2895 si->count * peel_iters_epilogue,
2896 si->kind, stmt_info, si->misalign,
2897 vect_epilogue);
2900 else
2902 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2903 stmt_info_for_cost *si;
2904 int j;
2905 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2907 prologue_cost_vec.create (2);
2908 epilogue_cost_vec.create (2);
2909 peel_iters_prologue = npeel;
2911 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2912 &peel_iters_epilogue,
2913 &scalar_cost_vec,
2914 &prologue_cost_vec,
2915 &epilogue_cost_vec);
2917 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2919 struct _stmt_vec_info *stmt_info
2920 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2921 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2922 si->misalign, vect_prologue);
2925 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2927 struct _stmt_vec_info *stmt_info
2928 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2929 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2930 si->misalign, vect_epilogue);
2933 prologue_cost_vec.release ();
2934 epilogue_cost_vec.release ();
2937 /* FORNOW: The scalar outside cost is incremented in one of the
2938 following ways:
2940 1. The vectorizer checks for alignment and aliasing and generates
2941 a condition that allows dynamic vectorization. A cost model
2942 check is ANDED with the versioning condition. Hence scalar code
2943 path now has the added cost of the versioning check.
2945 if (cost > th & versioning_check)
2946 jmp to vector code
2948 Hence run-time scalar is incremented by not-taken branch cost.
2950 2. The vectorizer then checks if a prologue is required. If the
2951 cost model check was not done before during versioning, it has to
2952 be done before the prologue check.
2954 if (cost <= th)
2955 prologue = scalar_iters
2956 if (prologue == 0)
2957 jmp to vector code
2958 else
2959 execute prologue
2960 if (prologue == num_iters)
2961 go to exit
2963 Hence the run-time scalar cost is incremented by a taken branch,
2964 plus a not-taken branch, plus a taken branch cost.
2966 3. The vectorizer then checks if an epilogue is required. If the
2967 cost model check was not done before during prologue check, it
2968 has to be done with the epilogue check.
2970 if (prologue == 0)
2971 jmp to vector code
2972 else
2973 execute prologue
2974 if (prologue == num_iters)
2975 go to exit
2976 vector code:
2977 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2978 jmp to epilogue
2980 Hence the run-time scalar cost should be incremented by 2 taken
2981 branches.
2983 TODO: The back end may reorder the BBs differently and reverse
2984 conditions/branch directions. Change the estimates below to
2985 something more reasonable. */
2987 /* If the number of iterations is known and we do not do versioning, we can
2988 decide whether to vectorize at compile time. Hence the scalar version
2989 does not carry cost model guard costs.  */
2990 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2991 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2992 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2994 /* Cost model check occurs at versioning. */
2995 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2996 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2997 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2998 else
3000 /* Cost model check occurs at prologue generation. */
3001 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3002 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3003 + vect_get_stmt_cost (cond_branch_not_taken);
3004 /* Cost model check occurs at epilogue generation. */
3005 else
3006 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3010 /* Complete the target-specific cost calculations. */
3011 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3012 &vec_inside_cost, &vec_epilogue_cost);
3014 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3016 if (dump_enabled_p ())
3018 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3019 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3020 vec_inside_cost);
3021 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3022 vec_prologue_cost);
3023 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3024 vec_epilogue_cost);
3025 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3026 scalar_single_iter_cost);
3027 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3028 scalar_outside_cost);
3029 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3030 vec_outside_cost);
3031 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3032 peel_iters_prologue);
3033 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3034 peel_iters_epilogue);
3037 /* Calculate number of iterations required to make the vector version
3038 profitable, relative to the loop bodies only. The following condition
3039 must hold true:
3040 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3041 where
3042 SIC = scalar iteration cost, VIC = vector iteration cost,
3043 VOC = vector outside cost, VF = vectorization factor,
3044 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3045 SOC = scalar outside cost for run time cost model check. */
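/* Worked example (assumed costs, not from the source): SIC = 1, VIC = 2,
   VOC = 20, SOC = 0, VF = 4 and no peeling give
   (20 * 4 - 0) / (1 * 4 - 2) = 40, and the <= re-check below bumps the
   result to 41 before it is clamped to at least VF and decremented.  */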
3047 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3049 if (vec_outside_cost <= 0)
3050 min_profitable_iters = 1;
3051 else
3053 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3054 - vec_inside_cost * peel_iters_prologue
3055 - vec_inside_cost * peel_iters_epilogue)
3056 / ((scalar_single_iter_cost * vf)
3057 - vec_inside_cost);
3059 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3060 <= (((int) vec_inside_cost * min_profitable_iters)
3061 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3062 min_profitable_iters++;
3065 /* vector version will never be profitable. */
3066 else
3068 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3069 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3070 "did not happen for a simd loop");
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "cost model: the vector iteration cost = %d "
3075 "divided by the scalar iteration cost = %d "
3076 "is greater or equal to the vectorization factor = %d"
3077 ".\n",
3078 vec_inside_cost, scalar_single_iter_cost, vf);
3079 *ret_min_profitable_niters = -1;
3080 *ret_min_profitable_estimate = -1;
3081 return;
3084 dump_printf (MSG_NOTE,
3085 " Calculated minimum iters for profitability: %d\n",
3086 min_profitable_iters);
3088 min_profitable_iters =
3089 min_profitable_iters < vf ? vf : min_profitable_iters;
3091 /* Because the condition we create is:
3092 if (niters <= min_profitable_iters)
3093 then skip the vectorized loop. */
3094 min_profitable_iters--;
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE, vect_location,
3098 " Runtime profitability threshold = %d\n",
3099 min_profitable_iters);
3101 *ret_min_profitable_niters = min_profitable_iters;
3103 /* Calculate number of iterations required to make the vector version
3104 profitable, relative to the loop bodies only.
3106 Non-vectorized variant is SIC * niters and it must win over vector
3107 variant on the expected loop trip count. The following condition must hold true:
3108 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3110 if (vec_outside_cost <= 0)
3111 min_profitable_estimate = 1;
3112 else
3114 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3115 - vec_inside_cost * peel_iters_prologue
3116 - vec_inside_cost * peel_iters_epilogue)
3117 / ((scalar_single_iter_cost * vf)
3118 - vec_inside_cost);
3120 min_profitable_estimate --;
3121 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE, vect_location,
3124 " Static estimate profitability threshold = %d\n",
3125 min_profitable_estimate);
3127 *ret_min_profitable_estimate = min_profitable_estimate;
3130 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3131 vector elements (not bits) for a vector of mode MODE. */
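/* For example (illustrative): an 8-element vector mode with OFFSET 2
   yields SEL = {2, 3, 4, 5, 6, 7, 8, 9}, i.e. every lane moves down by two
   and the top two lanes come from the second permute input.  */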
3132 static void
3133 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3134 unsigned char *sel)
3136 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3138 for (i = 0; i < nelt; i++)
3139 sel[i] = (i + offset) & (2*nelt - 1);
3142 /* Checks whether the target supports whole-vector shifts for vectors of mode
3143 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3144 it supports vec_perm_const with masks for all necessary shift amounts. */
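/* Illustratively, for an 8-element mode the loop below probes shifts by 4,
   2 and 1 elements, exactly the halving steps a log2-style reduction
   epilogue uses.  */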
3145 static bool
3146 have_whole_vector_shift (enum machine_mode mode)
3148 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3149 return true;
3151 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3152 return false;
3154 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3155 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3157 for (i = nelt/2; i >= 1; i/=2)
3159 calc_vec_perm_mask_for_shift (mode, i, sel);
3160 if (!can_vec_perm_p (mode, false, sel))
3161 return false;
3163 return true;
3166 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3168 static tree
3169 get_reduction_op (gimple stmt, int reduc_index)
3171 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3173 case GIMPLE_SINGLE_RHS:
3174 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3175 == ternary_op);
3176 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3177 case GIMPLE_UNARY_RHS:
3178 return gimple_assign_rhs1 (stmt);
3179 case GIMPLE_BINARY_RHS:
3180 return (reduc_index
3181 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3182 case GIMPLE_TERNARY_RHS:
3183 return gimple_op (stmt, reduc_index + 1);
3184 default:
3185 gcc_unreachable ();
3189 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3190 functions. Design better to avoid maintenance issues. */
3192 /* Function vect_model_reduction_cost.
3194 Models cost for a reduction operation, including the vector ops
3195 generated within the strip-mine loop, the initial definition before
3196 the loop, and the epilogue code that must be generated. */
3198 static bool
3199 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3200 int ncopies, int reduc_index)
3202 int prologue_cost = 0, epilogue_cost = 0;
3203 enum tree_code code;
3204 optab optab;
3205 tree vectype;
3206 gimple stmt, orig_stmt;
3207 tree reduction_op;
3208 machine_mode mode;
3209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3210 struct loop *loop = NULL;
3211 void *target_cost_data;
3213 if (loop_vinfo)
3215 loop = LOOP_VINFO_LOOP (loop_vinfo);
3216 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3218 else
3219 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3221 /* Cost of reduction op inside loop. */
3222 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3223 stmt_info, 0, vect_body);
3224 stmt = STMT_VINFO_STMT (stmt_info);
3226 reduction_op = get_reduction_op (stmt, reduc_index);
3228 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3229 if (!vectype)
3231 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3234 "unsupported data-type ");
3235 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3236 TREE_TYPE (reduction_op));
3237 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3239 return false;
3242 mode = TYPE_MODE (vectype);
3243 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3245 if (!orig_stmt)
3246 orig_stmt = STMT_VINFO_STMT (stmt_info);
3248 code = gimple_assign_rhs_code (orig_stmt);
3250 /* Add in cost for initial definition. */
3251 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3252 stmt_info, 0, vect_prologue);
3254 /* Determine cost of epilogue code.
3256 We have a reduction operator that will reduce the vector in one statement.
3257 Also requires scalar extract. */
3259 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3261 if (reduc_code != ERROR_MARK)
3263 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3264 stmt_info, 0, vect_epilogue);
3265 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3266 stmt_info, 0, vect_epilogue);
3268 else
3270 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3271 tree bitsize =
3272 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3273 int element_bitsize = tree_to_uhwi (bitsize);
3274 int nelements = vec_size_in_bits / element_bitsize;
3276 optab = optab_for_tree_code (code, vectype, optab_default);
3278 /* We have a whole vector shift available. */
3279 if (VECTOR_MODE_P (mode)
3280 && optab_handler (optab, mode) != CODE_FOR_nothing
3281 && have_whole_vector_shift (mode))
3283 /* Final reduction via vector shifts and the reduction operator.
3284 Also requires scalar extract. */
3285 epilogue_cost += add_stmt_cost (target_cost_data,
3286 exact_log2 (nelements) * 2,
3287 vector_stmt, stmt_info, 0,
3288 vect_epilogue);
3289 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3290 vec_to_scalar, stmt_info, 0,
3291 vect_epilogue);
3293 else
3294 /* Use extracts and reduction op for final reduction. For N
3295 elements, we have N extracts and N-1 reduction ops. */
3296 epilogue_cost += add_stmt_cost (target_cost_data,
3297 nelements + nelements - 1,
3298 vector_stmt, stmt_info, 0,
3299 vect_epilogue);
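/* Illustrative comparison (assumed 4-element vector): the shift-based path
   above is charged exact_log2 (4) * 2 = 4 vector_stmt's plus one
   vec_to_scalar, whereas this fallback is charged 4 + 3 = 7
   vector_stmt's.  */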
3303 if (dump_enabled_p ())
3304 dump_printf (MSG_NOTE,
3305 "vect_model_reduction_cost: inside_cost = %d, "
3306 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3307 prologue_cost, epilogue_cost);
3309 return true;
3313 /* Function vect_model_induction_cost.
3315 Models cost for induction operations. */
3317 static void
3318 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3321 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3322 unsigned inside_cost, prologue_cost;
3324 /* loop cost for vec_loop. */
3325 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3326 stmt_info, 0, vect_body);
3328 /* prologue cost for vec_init and vec_step. */
3329 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3330 stmt_info, 0, vect_prologue);
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_NOTE, vect_location,
3334 "vect_model_induction_cost: inside_cost = %d, "
3335 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3339 /* Function get_initial_def_for_induction
3341 Input:
3342 STMT - a stmt that performs an induction operation in the loop.
3343 IV_PHI - the initial value of the induction variable
3345 Output:
3346 Return a vector variable, initialized with the first VF values of
3347 the induction variable. E.g., for an iv with IV_PHI='X' and
3348 evolution S, for a vector of 4 units, we want to return:
3349 [X, X + S, X + 2*S, X + 3*S]. */
3351 static tree
3352 get_initial_def_for_induction (gimple iv_phi)
3354 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3355 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3356 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3357 tree vectype;
3358 int nunits;
3359 edge pe = loop_preheader_edge (loop);
3360 struct loop *iv_loop;
3361 basic_block new_bb;
3362 tree new_vec, vec_init, vec_step, t;
3363 tree new_var;
3364 tree new_name;
3365 gimple init_stmt, new_stmt;
3366 gphi *induction_phi;
3367 tree induc_def, vec_def, vec_dest;
3368 tree init_expr, step_expr;
3369 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3370 int i;
3371 int ncopies;
3372 tree expr;
3373 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3374 bool nested_in_vect_loop = false;
3375 gimple_seq stmts = NULL;
3376 imm_use_iterator imm_iter;
3377 use_operand_p use_p;
3378 gimple exit_phi;
3379 edge latch_e;
3380 tree loop_arg;
3381 gimple_stmt_iterator si;
3382 basic_block bb = gimple_bb (iv_phi);
3383 tree stepvectype;
3384 tree resvectype;
3386 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3387 if (nested_in_vect_loop_p (loop, iv_phi))
3389 nested_in_vect_loop = true;
3390 iv_loop = loop->inner;
3392 else
3393 iv_loop = loop;
3394 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3396 latch_e = loop_latch_edge (iv_loop);
3397 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3399 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3400 gcc_assert (step_expr != NULL_TREE);
3402 pe = loop_preheader_edge (iv_loop);
3403 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3404 loop_preheader_edge (iv_loop));
3406 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3407 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3408 gcc_assert (vectype);
3409 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3410 ncopies = vf / nunits;
3412 gcc_assert (phi_info);
3413 gcc_assert (ncopies >= 1);
3415 /* Convert the step to the desired type. */
3416 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3417 step_expr),
3418 &stmts, true, NULL_TREE);
3419 if (stmts)
3421 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3422 gcc_assert (!new_bb);
3425 /* Find the first insertion point in the BB. */
3426 si = gsi_after_labels (bb);
3428 /* Create the vector that holds the initial_value of the induction. */
3429 if (nested_in_vect_loop)
3431 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3432 been created during vectorization of previous stmts. We obtain it
3433 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3434 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3435 /* If the initial value is not of proper type, convert it. */
3436 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3438 new_stmt
3439 = gimple_build_assign (vect_get_new_vect_var (vectype,
3440 vect_simple_var,
3441 "vec_iv_"),
3442 VIEW_CONVERT_EXPR,
3443 build1 (VIEW_CONVERT_EXPR, vectype,
3444 vec_init));
3445 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3446 gimple_assign_set_lhs (new_stmt, vec_init);
3447 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3448 new_stmt);
3449 gcc_assert (!new_bb);
3450 set_vinfo_for_stmt (new_stmt,
3451 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3454 else
3456 vec<constructor_elt, va_gc> *v;
3458 /* iv_loop is the loop to be vectorized. Create:
3459 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3460 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3461 vect_scalar_var, "var_");
3462 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3463 init_expr),
3464 &stmts, false, new_var);
3465 if (stmts)
3467 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3468 gcc_assert (!new_bb);
3471 vec_alloc (v, nunits);
3472 bool constant_p = is_gimple_min_invariant (new_name);
3473 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3474 for (i = 1; i < nunits; i++)
3476 /* Create: new_name_i = new_name + step_expr */
3477 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3478 new_name, step_expr);
3479 if (!is_gimple_min_invariant (new_name))
3481 init_stmt = gimple_build_assign (new_var, new_name);
3482 new_name = make_ssa_name (new_var, init_stmt);
3483 gimple_assign_set_lhs (init_stmt, new_name);
3484 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3485 gcc_assert (!new_bb);
3486 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "created new init_stmt: ");
3490 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3491 dump_printf (MSG_NOTE, "\n");
3493 constant_p = false;
3495 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3497 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3498 if (constant_p)
3499 new_vec = build_vector_from_ctor (vectype, v);
3500 else
3501 new_vec = build_constructor (vectype, v);
3502 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3506 /* Create the vector that holds the step of the induction. */
3507 if (nested_in_vect_loop)
3508 /* iv_loop is nested in the loop to be vectorized. Generate:
3509 vec_step = [S, S, S, S] */
3510 new_name = step_expr;
3511 else
3513 /* iv_loop is the loop to be vectorized. Generate:
3514 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3515 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3517 expr = build_int_cst (integer_type_node, vf);
3518 expr = fold_convert (TREE_TYPE (step_expr), expr);
3520 else
3521 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3522 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3523 expr, step_expr);
3524 if (TREE_CODE (step_expr) == SSA_NAME)
3525 new_name = vect_init_vector (iv_phi, new_name,
3526 TREE_TYPE (step_expr), NULL);
3529 t = unshare_expr (new_name);
3530 gcc_assert (CONSTANT_CLASS_P (new_name)
3531 || TREE_CODE (new_name) == SSA_NAME);
3532 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3533 gcc_assert (stepvectype);
3534 new_vec = build_vector_from_val (stepvectype, t);
3535 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3538 /* Create the following def-use cycle:
3539 loop prolog:
3540 vec_init = ...
3541 vec_step = ...
3542 loop:
3543 vec_iv = PHI <vec_init, vec_loop>
3545 STMT
3547 vec_loop = vec_iv + vec_step; */
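/* Concretely (illustrative): for an integer IV with initial value 0, step
   S = 1 and VF = 4, vec_init is {0, 1, 2, 3} and vec_step is {4, 4, 4, 4}
   (VF * S), so every lane advances by four per vector iteration.  */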
3549 /* Create the induction-phi that defines the induction-operand. */
3550 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3551 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3552 set_vinfo_for_stmt (induction_phi,
3553 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3554 induc_def = PHI_RESULT (induction_phi);
3556 /* Create the iv update inside the loop */
3557 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3558 vec_def = make_ssa_name (vec_dest, new_stmt);
3559 gimple_assign_set_lhs (new_stmt, vec_def);
3560 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3561 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3562 NULL));
3564 /* Set the arguments of the phi node: */
3565 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3566 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3567 UNKNOWN_LOCATION);
3570 /* In case that vectorization factor (VF) is bigger than the number
3571 of elements that we can fit in a vectype (nunits), we have to generate
3572 more than one vector stmt - i.e., we need to "unroll" the
3573 vector stmt by a factor VF/nunits. For more details see documentation
3574 in vectorizable_operation. */
3576 if (ncopies > 1)
3578 stmt_vec_info prev_stmt_vinfo;
3579 /* FORNOW. This restriction should be relaxed. */
3580 gcc_assert (!nested_in_vect_loop);
3582 /* Create the vector that holds the step of the induction. */
3583 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3585 expr = build_int_cst (integer_type_node, nunits);
3586 expr = fold_convert (TREE_TYPE (step_expr), expr);
3588 else
3589 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3590 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3591 expr, step_expr);
3592 if (TREE_CODE (step_expr) == SSA_NAME)
3593 new_name = vect_init_vector (iv_phi, new_name,
3594 TREE_TYPE (step_expr), NULL);
3595 t = unshare_expr (new_name);
3596 gcc_assert (CONSTANT_CLASS_P (new_name)
3597 || TREE_CODE (new_name) == SSA_NAME);
3598 new_vec = build_vector_from_val (stepvectype, t);
3599 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3601 vec_def = induc_def;
3602 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3603 for (i = 1; i < ncopies; i++)
3605 /* vec_i = vec_prev + vec_step */
3606 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3607 vec_def, vec_step);
3608 vec_def = make_ssa_name (vec_dest, new_stmt);
3609 gimple_assign_set_lhs (new_stmt, vec_def);
3611 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3612 if (!useless_type_conversion_p (resvectype, vectype))
3614 new_stmt
3615 = gimple_build_assign
3616 (vect_get_new_vect_var (resvectype, vect_simple_var,
3617 "vec_iv_"),
3618 VIEW_CONVERT_EXPR,
3619 build1 (VIEW_CONVERT_EXPR, resvectype,
3620 gimple_assign_lhs (new_stmt)));
3621 gimple_assign_set_lhs (new_stmt,
3622 make_ssa_name
3623 (gimple_assign_lhs (new_stmt), new_stmt));
3624 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3626 set_vinfo_for_stmt (new_stmt,
3627 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3628 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3629 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3633 if (nested_in_vect_loop)
3635 /* Find the loop-closed exit-phi of the induction, and record
3636 the final vector of induction results: */
3637 exit_phi = NULL;
3638 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3640 gimple use_stmt = USE_STMT (use_p);
3641 if (is_gimple_debug (use_stmt))
3642 continue;
3644 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3646 exit_phi = use_stmt;
3647 break;
3650 if (exit_phi)
3652 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3653 /* FORNOW. Currently not supporting the case that an inner-loop induction
3654 is not used in the outer-loop (i.e. only outside the outer-loop). */
3655 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3656 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3658 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3659 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "vector of inductions after inner-loop:");
3663 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3664 dump_printf (MSG_NOTE, "\n");
3670 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_NOTE, vect_location,
3673 "transform induction: created def-use cycle: ");
3674 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3675 dump_printf (MSG_NOTE, "\n");
3676 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3677 SSA_NAME_DEF_STMT (vec_def), 0);
3678 dump_printf (MSG_NOTE, "\n");
3681 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3682 if (!useless_type_conversion_p (resvectype, vectype))
3684 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3685 vect_simple_var,
3686 "vec_iv_"),
3687 VIEW_CONVERT_EXPR,
3688 build1 (VIEW_CONVERT_EXPR, resvectype,
3689 induc_def));
3690 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3691 gimple_assign_set_lhs (new_stmt, induc_def);
3692 si = gsi_after_labels (bb);
3693 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3694 set_vinfo_for_stmt (new_stmt,
3695 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3696 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3697 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3700 return induc_def;
3704 /* Function get_initial_def_for_reduction
3706 Input:
3707 STMT - a stmt that performs a reduction operation in the loop.
3708 INIT_VAL - the initial value of the reduction variable
3710 Output:
3711 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3712 of the reduction (used for adjusting the epilog - see below).
3713 Return a vector variable, initialized according to the operation that STMT
3714 performs. This vector will be used as the initial value of the
3715 vector of partial results.
3717 Option1 (adjust in epilog): Initialize the vector as follows:
3718 add/bit or/xor: [0,0,...,0,0]
3719 mult/bit and: [1,1,...,1,1]
3720 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3721 and when necessary (e.g. add/mult case) let the caller know
3722 that it needs to adjust the result by init_val.
3724 Option2: Initialize the vector as follows:
3725 add/bit or/xor: [init_val,0,0,...,0]
3726 mult/bit and: [init_val,1,1,...,1]
3727 min/max/cond_expr: [init_val,init_val,...,init_val]
3728 and no adjustments are needed.
3730 For example, for the following code:
3732 s = init_val;
3733 for (i=0;i<n;i++)
3734 s = s + a[i];
3736 STMT is 's = s + a[i]', and the reduction variable is 's'.
3737 For a vector of 4 units, we want to return either [0,0,0,init_val],
3738 or [0,0,0,0] and let the caller know that it needs to adjust
3739 the result at the end by 'init_val'.
3741 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3742 is not NULL, because this way the initialization vector is simpler (same
3743 element in all entries), and Option2 otherwise.
3745 A cost model should help decide between these two schemes. */
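/* An illustrative example (values are hypothetical): for

     int s = 5;
     for (i = 0; i < n; i++)
       s += a[i];

   and a 4-element vector, Option1 returns [0,0,0,0] and sets
   *ADJUSTMENT_DEF to 5, so the epilog adds 5 to the reduced result,
   whereas Option2 would return [5,0,0,0] with no adjustment needed.  */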
3747 tree
3748 get_initial_def_for_reduction (gimple stmt, tree init_val,
3749 tree *adjustment_def)
3751 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3753 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3754 tree scalar_type = TREE_TYPE (init_val);
3755 tree vectype = get_vectype_for_scalar_type (scalar_type);
3756 int nunits;
3757 enum tree_code code = gimple_assign_rhs_code (stmt);
3758 tree def_for_init;
3759 tree init_def;
3760 tree *elts;
3761 int i;
3762 bool nested_in_vect_loop = false;
3763 tree init_value;
3764 REAL_VALUE_TYPE real_init_val = dconst0;
3765 int int_init_val = 0;
3766 gimple def_stmt = NULL;
3768 gcc_assert (vectype);
3769 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3771 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3772 || SCALAR_FLOAT_TYPE_P (scalar_type));
3774 if (nested_in_vect_loop_p (loop, stmt))
3775 nested_in_vect_loop = true;
3776 else
3777 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3779 /* In case of double reduction we only create a vector variable to be put
3780 in the reduction phi node. The actual statement creation is done in
3781 vect_create_epilog_for_reduction. */
3782 if (adjustment_def && nested_in_vect_loop
3783 && TREE_CODE (init_val) == SSA_NAME
3784 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3785 && gimple_code (def_stmt) == GIMPLE_PHI
3786 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3787 && vinfo_for_stmt (def_stmt)
3788 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3789 == vect_double_reduction_def)
3791 *adjustment_def = NULL;
3792 return vect_create_destination_var (init_val, vectype);
3795 if (TREE_CONSTANT (init_val))
3797 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3798 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3799 else
3800 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3802 else
3803 init_value = init_val;
3805 switch (code)
3807 case WIDEN_SUM_EXPR:
3808 case DOT_PROD_EXPR:
3809 case SAD_EXPR:
3810 case PLUS_EXPR:
3811 case MINUS_EXPR:
3812 case BIT_IOR_EXPR:
3813 case BIT_XOR_EXPR:
3814 case MULT_EXPR:
3815 case BIT_AND_EXPR:
3816 /* ADJUSTMENT_DEF is NULL when called from
3817 vect_create_epilog_for_reduction to vectorize double reduction. */
3818 if (adjustment_def)
3820 if (nested_in_vect_loop)
3821 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3822 NULL);
3823 else
3824 *adjustment_def = init_val;
3827 if (code == MULT_EXPR)
3829 real_init_val = dconst1;
3830 int_init_val = 1;
3833 if (code == BIT_AND_EXPR)
3834 int_init_val = -1;
3836 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3837 def_for_init = build_real (scalar_type, real_init_val);
3838 else
3839 def_for_init = build_int_cst (scalar_type, int_init_val);
3841 /* Create a vector of '0's or '1's in all elements except the first. */
3842 elts = XALLOCAVEC (tree, nunits);
3843 for (i = nunits - 2; i >= 0; --i)
3844 elts[i + 1] = def_for_init;
3846 /* Option1: the first element is '0' or '1' as well. */
3847 if (adjustment_def)
3849 elts[0] = def_for_init;
3850 init_def = build_vector (vectype, elts);
3851 break;
3854 /* Option2: the first element is INIT_VAL. */
3855 elts[0] = init_val;
3856 if (TREE_CONSTANT (init_val))
3857 init_def = build_vector (vectype, elts);
3858 else
3860 vec<constructor_elt, va_gc> *v;
3861 vec_alloc (v, nunits);
3862 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3863 for (i = 1; i < nunits; ++i)
3864 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3865 init_def = build_constructor (vectype, v);
3868 break;
3870 case MIN_EXPR:
3871 case MAX_EXPR:
3872 case COND_EXPR:
3873 if (adjustment_def)
3875 *adjustment_def = NULL_TREE;
3876 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3877 break;
3880 init_def = build_vector_from_val (vectype, init_value);
3881 break;
3883 default:
3884 gcc_unreachable ();
3887 return init_def;
3890 /* Function vect_create_epilog_for_reduction
3892 Create code at the loop-epilog to finalize the result of a reduction
3893 computation.
3895 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
3896 reduction statements.
3897 STMT is the scalar reduction stmt that is being vectorized.
3898 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3899 number of elements that we can fit in a vectype (nunits). In this case
3900 we have to generate more than one vector stmt - i.e - we need to "unroll"
3901 the vector stmt by a factor VF/nunits. For more details see documentation
3902 in vectorizable_operation.
3903 REDUC_CODE is the tree-code for the epilog reduction.
3904 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3905 computation.
3906 REDUC_INDEX is the index of the operand in the right hand side of the
3907 statement that is defined by REDUCTION_PHI.
3908 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3909 SLP_NODE is an SLP node containing a group of reduction statements. The
3910 first one in this group is STMT.
3912 This function:
3913 1. Creates the reduction def-use cycles: sets the arguments for
3914 REDUCTION_PHIS:
3915 The loop-entry argument is the vectorized initial-value of the reduction.
3916 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3917 sums.
3918 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3919 by applying the operation specified by REDUC_CODE if available, or by
3920 other means (whole-vector shifts or a scalar loop).
3921 The function also creates a new phi node at the loop exit to preserve
3922 loop-closed form, as illustrated below.
3924 The flow at the entry to this function:
3926 loop:
3927 vec_def = phi <null, null> # REDUCTION_PHI
3928 VECT_DEF = vector_stmt # vectorized form of STMT
3929 s_loop = scalar_stmt # (scalar) STMT
3930 loop_exit:
3931 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3932 use <s_out0>
3933 use <s_out0>
3935 The above is transformed by this function into:
3937 loop:
3938 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3939 VECT_DEF = vector_stmt # vectorized form of STMT
3940 s_loop = scalar_stmt # (scalar) STMT
3941 loop_exit:
3942 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3943 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3944 v_out2 = reduce <v_out1>
3945 s_out3 = extract_field <v_out2, 0>
3946 s_out4 = adjust_result <s_out3>
3947 use <s_out4>
3948 use <s_out4>
3951 static void
3952 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3953 int ncopies, enum tree_code reduc_code,
3954 vec<gimple> reduction_phis,
3955 int reduc_index, bool double_reduc,
3956 slp_tree slp_node)
3958 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3959 stmt_vec_info prev_phi_info;
3960 tree vectype;
3961 machine_mode mode;
3962 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3963 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3964 basic_block exit_bb;
3965 tree scalar_dest;
3966 tree scalar_type;
3967 gimple new_phi = NULL, phi;
3968 gimple_stmt_iterator exit_gsi;
3969 tree vec_dest;
3970 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3971 gimple epilog_stmt = NULL;
3972 enum tree_code code = gimple_assign_rhs_code (stmt);
3973 gimple exit_phi;
3974 tree bitsize;
3975 tree adjustment_def = NULL;
3976 tree vec_initial_def = NULL;
3977 tree reduction_op, expr, def;
3978 tree orig_name, scalar_result;
3979 imm_use_iterator imm_iter, phi_imm_iter;
3980 use_operand_p use_p, phi_use_p;
3981 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3982 bool nested_in_vect_loop = false;
3983 auto_vec<gimple> new_phis;
3984 auto_vec<gimple> inner_phis;
3985 enum vect_def_type dt = vect_unknown_def_type;
3986 int j, i;
3987 auto_vec<tree> scalar_results;
3988 unsigned int group_size = 1, k, ratio;
3989 auto_vec<tree> vec_initial_defs;
3990 auto_vec<gimple> phis;
3991 bool slp_reduc = false;
3992 tree new_phi_result;
3993 gimple inner_phi = NULL;
3995 if (slp_node)
3996 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3998 if (nested_in_vect_loop_p (loop, stmt))
4000 outer_loop = loop;
4001 loop = loop->inner;
4002 nested_in_vect_loop = true;
4003 gcc_assert (!slp_node);
4006 reduction_op = get_reduction_op (stmt, reduc_index);
4008 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4009 gcc_assert (vectype);
4010 mode = TYPE_MODE (vectype);
4012 /* 1. Create the reduction def-use cycle:
4013 Set the arguments of REDUCTION_PHIS, i.e., transform
4015 loop:
4016 vec_def = phi <null, null> # REDUCTION_PHI
4017 VECT_DEF = vector_stmt # vectorized form of STMT
4020 into:
4022 loop:
4023 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4024 VECT_DEF = vector_stmt # vectorized form of STMT
4027 (in case of SLP, do it for all the phis). */
4029 /* Get the loop-entry arguments. */
4030 if (slp_node)
4031 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4032 NULL, slp_node, reduc_index);
4033 else
4035 vec_initial_defs.create (1);
4036 /* For the case of reduction, vect_get_vec_def_for_operand returns
4037 the scalar def before the loop, that defines the initial value
4038 of the reduction variable. */
4039 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4040 &adjustment_def);
4041 vec_initial_defs.quick_push (vec_initial_def);
4044 /* Set phi nodes arguments. */
4045 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4047 tree vec_init_def, def;
4048 gimple_seq stmts;
4049 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4050 true, NULL_TREE);
4051 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4052 def = vect_defs[i];
4053 for (j = 0; j < ncopies; j++)
4055 /* Set the loop-entry arg of the reduction-phi. */
4056 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4057 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4059 /* Set the loop-latch arg for the reduction-phi. */
4060 if (j > 0)
4061 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4063 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4064 UNKNOWN_LOCATION);
4066 if (dump_enabled_p ())
4068 dump_printf_loc (MSG_NOTE, vect_location,
4069 "transform reduction: created def-use cycle: ");
4070 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4071 dump_printf (MSG_NOTE, "\n");
4072 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4073 dump_printf (MSG_NOTE, "\n");
4076 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4080 /* 2. Create epilog code.
4081 The reduction epilog code operates across the elements of the vector
4082 of partial results computed by the vectorized loop.
4083 The reduction epilog code consists of:
4085 step 1: compute the scalar result in a vector (v_out2)
4086 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4087 step 3: adjust the scalar result (s_out3) if needed.
4089 Step 1 can be accomplished using one of the following three schemes:
4090 (scheme 1) using reduc_code, if available.
4091 (scheme 2) using whole-vector shifts, if available.
4092 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4093 combined.
4095 The overall epilog code looks like this:
4097 s_out0 = phi <s_loop> # original EXIT_PHI
4098 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4099 v_out2 = reduce <v_out1> # step 1
4100 s_out3 = extract_field <v_out2, 0> # step 2
4101 s_out4 = adjust_result <s_out3> # step 3
4103 (step 3 is optional, and steps 1 and 2 may be combined).
4104 Lastly, the uses of s_out0 are replaced by s_out4. */
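/* A concrete sketch of the three steps (hypothetical values), for a PLUS
   reduction over a 4-element vector with initial value 5:

     v_out1 = {1,2,3,4}
     v_out2 = reduce <v_out1>              # step 1
     s_out3 = extract_field <v_out2, 0>    # step 2, yields 1+2+3+4 = 10
     s_out4 = s_out3 + 5                   # step 3, final result 15  */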
4107 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4108 v_out1 = phi <VECT_DEF>
4109 Store them in NEW_PHIS. */
4111 exit_bb = single_exit (loop)->dest;
4112 prev_phi_info = NULL;
4113 new_phis.create (vect_defs.length ());
4114 FOR_EACH_VEC_ELT (vect_defs, i, def)
4116 for (j = 0; j < ncopies; j++)
4118 tree new_def = copy_ssa_name (def);
4119 phi = create_phi_node (new_def, exit_bb);
4120 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4121 if (j == 0)
4122 new_phis.quick_push (phi);
4123 else
4125 def = vect_get_vec_def_for_stmt_copy (dt, def);
4126 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4129 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4130 prev_phi_info = vinfo_for_stmt (phi);
4134 /* The epilogue is created for the outer-loop, i.e., for the loop being
4135 vectorized. Create exit phis for the outer loop. */
4136 if (double_reduc)
4138 loop = outer_loop;
4139 exit_bb = single_exit (loop)->dest;
4140 inner_phis.create (vect_defs.length ());
4141 FOR_EACH_VEC_ELT (new_phis, i, phi)
4143 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4144 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4145 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4146 PHI_RESULT (phi));
4147 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4148 loop_vinfo, NULL));
4149 inner_phis.quick_push (phi);
4150 new_phis[i] = outer_phi;
4151 prev_phi_info = vinfo_for_stmt (outer_phi);
4152 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4154 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4155 new_result = copy_ssa_name (PHI_RESULT (phi));
4156 outer_phi = create_phi_node (new_result, exit_bb);
4157 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4158 PHI_RESULT (phi));
4159 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4160 loop_vinfo, NULL));
4161 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4162 prev_phi_info = vinfo_for_stmt (outer_phi);
4167 exit_gsi = gsi_after_labels (exit_bb);
4169 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4170 (i.e. when reduc_code is not available) and in the final adjustment
4171 code (if needed). Also get the original scalar reduction variable as
4172 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4173 represents a reduction pattern), the tree-code and scalar-def are
4174 taken from the original stmt that the pattern-stmt (STMT) replaces.
4175 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4176 are taken from STMT. */
4178 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4179 if (!orig_stmt)
4181 /* Regular reduction */
4182 orig_stmt = stmt;
4184 else
4186 /* Reduction pattern */
4187 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4188 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4189 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4192 code = gimple_assign_rhs_code (orig_stmt);
4193 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4194 partial results are added and not subtracted. */
4195 if (code == MINUS_EXPR)
4196 code = PLUS_EXPR;
4198 scalar_dest = gimple_assign_lhs (orig_stmt);
4199 scalar_type = TREE_TYPE (scalar_dest);
4200 scalar_results.create (group_size);
4201 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4202 bitsize = TYPE_SIZE (scalar_type);
4204 /* In case this is a reduction in an inner-loop while vectorizing an outer
4205 loop - we don't need to extract a single scalar result at the end of the
4206 inner-loop (unless it is double reduction, i.e., the use of reduction is
4207 outside the outer-loop). The final vector of partial results will be used
4208 in the vectorized outer-loop, or reduced to a scalar result at the end of
4209 the outer-loop. */
4210 if (nested_in_vect_loop && !double_reduc)
4211 goto vect_finalize_reduction;
4213 /* SLP reduction without reduction chain, e.g.,
4214 # a1 = phi <a2, a0>
4215 # b1 = phi <b2, b0>
4216 a2 = operation (a1)
4217 b2 = operation (b1) */
4218 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4220 /* In case of reduction chain, e.g.,
4221 # a1 = phi <a3, a0>
4222 a2 = operation (a1)
4223 a3 = operation (a2),
4225 we may end up with more than one vector result. Here we reduce them to
4226 one vector. */
4227 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4229 tree first_vect = PHI_RESULT (new_phis[0]);
4230 tree tmp;
4231 gassign *new_vec_stmt = NULL;
4233 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4234 for (k = 1; k < new_phis.length (); k++)
4236 gimple next_phi = new_phis[k];
4237 tree second_vect = PHI_RESULT (next_phi);
4239 tmp = build2 (code, vectype, first_vect, second_vect);
4240 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4241 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4242 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4243 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4246 new_phi_result = first_vect;
4247 if (new_vec_stmt)
4249 new_phis.truncate (0);
4250 new_phis.safe_push (new_vec_stmt);
4253 else
4254 new_phi_result = PHI_RESULT (new_phis[0]);
4256 /* 2.3 Create the reduction code, using one of the three schemes described
4257 above. In SLP we simply need to extract all the elements from the
4258 vector (without reducing them), so we use scalar shifts. */
4259 if (reduc_code != ERROR_MARK && !slp_reduc)
4261 tree tmp;
4262 tree vec_elem_type;
4264 /*** Case 1: Create:
4265 v_out2 = reduc_expr <v_out1> */
4267 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "Reduce using direct vector reduction.\n");
4271 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4272 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4274 tree tmp_dest =
4275 vect_create_destination_var (scalar_dest, vec_elem_type);
4276 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4277 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4278 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4279 gimple_assign_set_lhs (epilog_stmt, new_temp);
4280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4282 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4284 else
4285 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4286 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4287 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4288 gimple_assign_set_lhs (epilog_stmt, new_temp);
4289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4290 scalar_results.safe_push (new_temp);
4292 else
4294 bool reduce_with_shift = have_whole_vector_shift (mode);
4295 int element_bitsize = tree_to_uhwi (bitsize);
4296 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4297 tree vec_temp;
4299 /* Regardless of whether we have a whole vector shift, if we're
4300 emulating the operation via tree-vect-generic, we don't want
4301 to use it. Only the first round of the reduction is likely
4302 to still be profitable via emulation. */
4303 /* ??? It might be better to emit a reduction tree code here, so that
4304 tree-vect-generic can expand the first round via bit tricks. */
4305 if (!VECTOR_MODE_P (mode))
4306 reduce_with_shift = false;
4307 else
4309 optab optab = optab_for_tree_code (code, vectype, optab_default);
4310 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4311 reduce_with_shift = false;
4314 if (reduce_with_shift && !slp_reduc)
4316 int nelements = vec_size_in_bits / element_bitsize;
4317 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4319 int elt_offset;
4321 tree zero_vec = build_zero_cst (vectype);
4322 /*** Case 2: Create:
4323 for (offset = nelements/2; offset >= 1; offset/=2)
4325 Create: va' = vec_shift <va, offset>
4326 Create: va = vop <va, va'>
4327 } */
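/* Conceptually (illustration only, the exact element order of the shift is
   target dependent), reducing va = {1,2,3,4} with PLUS_EXPR:

     offset == 2:  va' = {3,4,0,0}   va = {4,6,3,4}
     offset == 1:  va' = {6,3,4,0}   va = {10,9,7,4}

   after which element 0 holds the full sum and is extracted below
   with a BIT_FIELD_REF.  */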
4329 tree rhs;
4331 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_NOTE, vect_location,
4333 "Reduce using vector shifts\n");
4335 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4336 new_temp = new_phi_result;
4337 for (elt_offset = nelements / 2;
4338 elt_offset >= 1;
4339 elt_offset /= 2)
4341 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4342 tree mask = vect_gen_perm_mask_any (vectype, sel);
4343 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4344 new_temp, zero_vec, mask);
4345 new_name = make_ssa_name (vec_dest, epilog_stmt);
4346 gimple_assign_set_lhs (epilog_stmt, new_name);
4347 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4349 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4350 new_temp);
4351 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4352 gimple_assign_set_lhs (epilog_stmt, new_temp);
4353 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4356 /* 2.4 Extract the final scalar result. Create:
4357 s_out3 = extract_field <v_out2, bitpos> */
4359 if (dump_enabled_p ())
4360 dump_printf_loc (MSG_NOTE, vect_location,
4361 "extract scalar result\n");
4363 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4364 bitsize, bitsize_zero_node);
4365 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4366 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4367 gimple_assign_set_lhs (epilog_stmt, new_temp);
4368 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4369 scalar_results.safe_push (new_temp);
4371 else
4373 /*** Case 3: Create:
4374 s = extract_field <v_out2, 0>
4375 for (offset = element_size;
4376 offset < vector_size;
4377 offset += element_size)
4379 Create: s' = extract_field <v_out2, offset>
4380 Create: s = op <s, s'> // For non SLP cases
4381 } */
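/* For illustration (hypothetical values): with v_out2 = {1,2,3,4},
   a 32-bit element type and PLUS_EXPR, the loop below generates

     s  = extract_field <v_out2, 0>               # s = 1
     s' = extract_field <v_out2, 32>; s = s + s'  # s = 3
     s' = extract_field <v_out2, 64>; s = s + s'  # s = 6
     s' = extract_field <v_out2, 96>; s = s + s'  # s = 10

   In the SLP case the extracted values are instead collected
   individually in SCALAR_RESULTS.  */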
4383 if (dump_enabled_p ())
4384 dump_printf_loc (MSG_NOTE, vect_location,
4385 "Reduce using scalar code.\n");
4387 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4388 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4390 int bit_offset;
4391 if (gimple_code (new_phi) == GIMPLE_PHI)
4392 vec_temp = PHI_RESULT (new_phi);
4393 else
4394 vec_temp = gimple_assign_lhs (new_phi);
4395 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4396 bitsize_zero_node);
4397 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4398 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4399 gimple_assign_set_lhs (epilog_stmt, new_temp);
4400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4402 /* In SLP we don't need to apply the reduction operation, so we just
4403 collect s' values in SCALAR_RESULTS. */
4404 if (slp_reduc)
4405 scalar_results.safe_push (new_temp);
4407 for (bit_offset = element_bitsize;
4408 bit_offset < vec_size_in_bits;
4409 bit_offset += element_bitsize)
4411 tree bitpos = bitsize_int (bit_offset);
4412 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4413 bitsize, bitpos);
4415 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4416 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4417 gimple_assign_set_lhs (epilog_stmt, new_name);
4418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4420 if (slp_reduc)
4422 /* In SLP we don't need to apply the reduction operation, so
4423 we just collect s' values in SCALAR_RESULTS. */
4424 new_temp = new_name;
4425 scalar_results.safe_push (new_name);
4427 else
4429 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4430 new_name, new_temp);
4431 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4432 gimple_assign_set_lhs (epilog_stmt, new_temp);
4433 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4438 /* The only case where we need to reduce scalar results in SLP is
4439 unrolling. If the size of SCALAR_RESULTS is greater than
4440 GROUP_SIZE, we reduce them by combining elements modulo
4441 GROUP_SIZE. */
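/* E.g. (illustration): with GROUP_SIZE == 2 and SCALAR_RESULTS ==
   {s0, s1, s2, s3} from an unrolled SLP reduction, the loop below
   computes s0 = s0 op s2 and s1 = s1 op s3, leaving the two final
   results in SCALAR_RESULTS[0] and SCALAR_RESULTS[1].  */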
4442 if (slp_reduc)
4444 tree res, first_res, new_res;
4445 gimple new_stmt;
4447 /* Reduce multiple scalar results in case of SLP unrolling. */
4448 for (j = group_size; scalar_results.iterate (j, &res);
4449 j++)
4451 first_res = scalar_results[j % group_size];
4452 new_stmt = gimple_build_assign (new_scalar_dest, code,
4453 first_res, res);
4454 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4455 gimple_assign_set_lhs (new_stmt, new_res);
4456 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4457 scalar_results[j % group_size] = new_res;
4460 else
4461 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4462 scalar_results.safe_push (new_temp);
4466 vect_finalize_reduction:
4468 if (double_reduc)
4469 loop = loop->inner;
4471 /* 2.5 Adjust the final result by the initial value of the reduction
4472 variable. (When such adjustment is not needed, then
4473 'adjustment_def' is zero). For example, if code is PLUS we create:
4474 new_temp = loop_exit_def + adjustment_def */
4476 if (adjustment_def)
4478 gcc_assert (!slp_reduc);
4479 if (nested_in_vect_loop)
4481 new_phi = new_phis[0];
4482 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4483 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4484 new_dest = vect_create_destination_var (scalar_dest, vectype);
4486 else
4488 new_temp = scalar_results[0];
4489 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4490 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4491 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4494 epilog_stmt = gimple_build_assign (new_dest, expr);
4495 new_temp = make_ssa_name (new_dest, epilog_stmt);
4496 gimple_assign_set_lhs (epilog_stmt, new_temp);
4497 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4498 if (nested_in_vect_loop)
4500 set_vinfo_for_stmt (epilog_stmt,
4501 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4502 NULL));
4503 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4504 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4506 if (!double_reduc)
4507 scalar_results.quick_push (new_temp);
4508 else
4509 scalar_results[0] = new_temp;
4511 else
4512 scalar_results[0] = new_temp;
4514 new_phis[0] = epilog_stmt;
4517 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4518 phis with new adjusted scalar results, i.e., replace use <s_out0>
4519 with use <s_out4>.
4521 Transform:
4522 loop_exit:
4523 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4524 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4525 v_out2 = reduce <v_out1>
4526 s_out3 = extract_field <v_out2, 0>
4527 s_out4 = adjust_result <s_out3>
4528 use <s_out0>
4529 use <s_out0>
4531 into:
4533 loop_exit:
4534 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4535 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4536 v_out2 = reduce <v_out1>
4537 s_out3 = extract_field <v_out2, 0>
4538 s_out4 = adjust_result <s_out3>
4539 use <s_out4>
4540 use <s_out4> */
4543 /* In an SLP reduction chain we reduce the vector results into one vector
4544 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4545 the last stmt in the reduction chain, since we are looking for the loop
4546 exit phi node. */
4547 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4549 scalar_dest = gimple_assign_lhs (
4550 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4551 group_size = 1;
4554 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4555 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4556 need to match SCALAR_RESULTS with corresponding statements. The first
4557 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4558 the first vector stmt, etc.
4559 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
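/* For example (hypothetical sizes): with GROUP_SIZE == 4 and two new
   vector stmts, RATIO == 2, so scalar results 0 and 1 are matched with
   the first vector stmt and scalar results 2 and 3 with the second.  */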
4560 if (group_size > new_phis.length ())
4562 ratio = group_size / new_phis.length ();
4563 gcc_assert (!(group_size % new_phis.length ()));
4565 else
4566 ratio = 1;
4568 for (k = 0; k < group_size; k++)
4570 if (k % ratio == 0)
4572 epilog_stmt = new_phis[k / ratio];
4573 reduction_phi = reduction_phis[k / ratio];
4574 if (double_reduc)
4575 inner_phi = inner_phis[k / ratio];
4578 if (slp_reduc)
4580 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4582 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4583 /* SLP statements can't participate in patterns. */
4584 gcc_assert (!orig_stmt);
4585 scalar_dest = gimple_assign_lhs (current_stmt);
4588 phis.create (3);
4589 /* Find the loop-closed-use at the loop exit of the original scalar
4590 result. (The reduction result is expected to have two immediate uses -
4591 one at the latch block, and one at the loop exit). */
4592 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4593 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4594 && !is_gimple_debug (USE_STMT (use_p)))
4595 phis.safe_push (USE_STMT (use_p));
4597 /* While we expect to have found an exit_phi because of loop-closed-ssa
4598 form, we can end up without one if the scalar cycle is dead.
4600 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4602 if (outer_loop)
4604 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4605 gphi *vect_phi;
4607 /* FORNOW. Currently not supporting the case that an inner-loop
4608 reduction is not used in the outer-loop (but only outside the
4609 outer-loop), unless it is double reduction. */
4610 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4612 || double_reduc);
4614 if (double_reduc)
4615 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4616 else
4617 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4618 if (!double_reduc
4619 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4620 != vect_double_reduction_def)
4621 continue;
4623 /* Handle double reduction:
4625 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4626 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4627 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4628 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4630 At that point the regular reduction (stmt2 and stmt3) is
4631 already vectorized, as well as the exit phi node, stmt4.
4632 Here we vectorize the phi node of double reduction, stmt1, and
4633 update all relevant statements. */
4635 /* Go through all the uses of s2 to find double reduction phi
4636 node, i.e., stmt1 above. */
4637 orig_name = PHI_RESULT (exit_phi);
4638 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4640 stmt_vec_info use_stmt_vinfo;
4641 stmt_vec_info new_phi_vinfo;
4642 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4643 basic_block bb = gimple_bb (use_stmt);
4644 gimple use;
4646 /* Check that USE_STMT is really double reduction phi
4647 node. */
4648 if (gimple_code (use_stmt) != GIMPLE_PHI
4649 || gimple_phi_num_args (use_stmt) != 2
4650 || bb->loop_father != outer_loop)
4651 continue;
4652 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4653 if (!use_stmt_vinfo
4654 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4655 != vect_double_reduction_def)
4656 continue;
4658 /* Create vector phi node for double reduction:
4659 vs1 = phi <vs0, vs2>
4660 vs1 was created previously in this function by a call to
4661 vect_get_vec_def_for_operand and is stored in
4662 vec_initial_def;
4663 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4664 vs0 is created here. */
4666 /* Create vector phi node. */
4667 vect_phi = create_phi_node (vec_initial_def, bb);
4668 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4669 loop_vec_info_for_loop (outer_loop), NULL);
4670 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4672 /* Create vs0 - initial def of the double reduction phi. */
4673 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4674 loop_preheader_edge (outer_loop));
4675 init_def = get_initial_def_for_reduction (stmt,
4676 preheader_arg, NULL);
4677 vect_phi_init = vect_init_vector (use_stmt, init_def,
4678 vectype, NULL);
4680 /* Update phi node arguments with vs0 and vs2. */
4681 add_phi_arg (vect_phi, vect_phi_init,
4682 loop_preheader_edge (outer_loop),
4683 UNKNOWN_LOCATION);
4684 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4685 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4686 if (dump_enabled_p ())
4688 dump_printf_loc (MSG_NOTE, vect_location,
4689 "created double reduction phi node: ");
4690 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4691 dump_printf (MSG_NOTE, "\n");
4694 vect_phi_res = PHI_RESULT (vect_phi);
4696 /* Replace the use, i.e., set the correct vs1 in the regular
4697 reduction phi node. FORNOW, NCOPIES is always 1, so the
4698 loop is redundant. */
4699 use = reduction_phi;
4700 for (j = 0; j < ncopies; j++)
4702 edge pr_edge = loop_preheader_edge (loop);
4703 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4704 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4710 phis.release ();
4711 if (nested_in_vect_loop)
4713 if (double_reduc)
4714 loop = outer_loop;
4715 else
4716 continue;
4719 phis.create (3);
4720 /* Find the loop-closed-use at the loop exit of the original scalar
4721 result. (The reduction result is expected to have two immediate uses,
4722 one at the latch block, and one at the loop exit). For double
4723 reductions we are looking for exit phis of the outer loop. */
4724 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4726 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4728 if (!is_gimple_debug (USE_STMT (use_p)))
4729 phis.safe_push (USE_STMT (use_p));
4731 else
4733 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4735 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4737 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4739 if (!flow_bb_inside_loop_p (loop,
4740 gimple_bb (USE_STMT (phi_use_p)))
4741 && !is_gimple_debug (USE_STMT (phi_use_p)))
4742 phis.safe_push (USE_STMT (phi_use_p));
4748 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4750 /* Replace the uses: */
4751 orig_name = PHI_RESULT (exit_phi);
4752 scalar_result = scalar_results[k];
4753 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4754 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4755 SET_USE (use_p, scalar_result);
4758 phis.release ();
4763 /* Function vectorizable_reduction.
4765 Check if STMT performs a reduction operation that can be vectorized.
4766 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4767 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4768 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4770 This function also handles reduction idioms (patterns) that have been
4771 recognized in advance during vect_pattern_recog. In this case, STMT may be
4772 of this form:
4773 X = pattern_expr (arg0, arg1, ..., X)
4774 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4775 sequence that had been detected and replaced by the pattern-stmt (STMT).
4777 In some cases of reduction patterns, the type of the reduction variable X is
4778 different than the type of the other arguments of STMT.
4779 In such cases, the vectype that is used when transforming STMT into a vector
4780 stmt is different than the vectype that is used to determine the
4781 vectorization factor, because it consists of a different number of elements
4782 than the actual number of elements that are being operated upon in parallel.
4784 For example, consider an accumulation of shorts into an int accumulator.
4785 On some targets it's possible to vectorize this pattern operating on 8
4786 shorts at a time (hence, the vectype for purposes of determining the
4787 vectorization factor should be V8HI); on the other hand, the vectype that
4788 is used to create the vector form is actually V4SI (the type of the result).
4790 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4791 indicates the actual level of parallelism (V8HI in the example), so
4792 that the right vectorization factor would be derived. This vectype
4793 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4794 be used to create the vectorized stmt. The right vectype for the vectorized
4795 stmt is obtained from the type of the result X:
4796 get_vectype_for_scalar_type (TREE_TYPE (X))
4798 This means that, contrary to "regular" reductions (or "regular" stmts in
4799 general), the following equation:
4800 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4801 does *NOT* necessarily hold for reduction patterns. */
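/* For illustration, the scalar source of the accumulation example above
   might look like

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   Here X is 'sum', STMT could be 'sum_1 = WIDEN_SUM_EXPR <a_i, sum_0>',
   STMT_VINFO_VECTYPE is V8HI (it drives the vectorization factor), while
   the vectorized stmt itself is created with V4SI, the vectype of
   TREE_TYPE (sum).  */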
4803 bool
4804 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4805 gimple *vec_stmt, slp_tree slp_node)
4807 tree vec_dest;
4808 tree scalar_dest;
4809 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4810 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4811 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4812 tree vectype_in = NULL_TREE;
4813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4815 enum tree_code code, orig_code, epilog_reduc_code;
4816 machine_mode vec_mode;
4817 int op_type;
4818 optab optab, reduc_optab;
4819 tree new_temp = NULL_TREE;
4820 tree def;
4821 gimple def_stmt;
4822 enum vect_def_type dt;
4823 gphi *new_phi = NULL;
4824 tree scalar_type;
4825 bool is_simple_use;
4826 gimple orig_stmt;
4827 stmt_vec_info orig_stmt_info;
4828 tree expr = NULL_TREE;
4829 int i;
4830 int ncopies;
4831 int epilog_copies;
4832 stmt_vec_info prev_stmt_info, prev_phi_info;
4833 bool single_defuse_cycle = false;
4834 tree reduc_def = NULL_TREE;
4835 gimple new_stmt = NULL;
4836 int j;
4837 tree ops[3];
4838 bool nested_cycle = false, found_nested_cycle_def = false;
4839 gimple reduc_def_stmt = NULL;
4840 bool double_reduc = false, dummy;
4841 basic_block def_bb;
4842 struct loop * def_stmt_loop, *outer_loop = NULL;
4843 tree def_arg;
4844 gimple def_arg_stmt;
4845 auto_vec<tree> vec_oprnds0;
4846 auto_vec<tree> vec_oprnds1;
4847 auto_vec<tree> vect_defs;
4848 auto_vec<gimple> phis;
4849 int vec_num;
4850 tree def0, def1, tem, op0, op1 = NULL_TREE;
4852 /* In case of reduction chain we switch to the first stmt in the chain, but
4853 we don't update STMT_INFO, since only the last stmt is marked as reduction
4854 and has reduction properties. */
4855 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4856 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4858 if (nested_in_vect_loop_p (loop, stmt))
4860 outer_loop = loop;
4861 loop = loop->inner;
4862 nested_cycle = true;
4865 /* 1. Is vectorizable reduction? */
4866 /* Not supportable if the reduction variable is used in the loop, unless
4867 it's a reduction chain. */
4868 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4869 && !GROUP_FIRST_ELEMENT (stmt_info))
4870 return false;
4872 /* Reductions that are not used even in an enclosing outer-loop,
4873 are expected to be "live" (used out of the loop). */
4874 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4875 && !STMT_VINFO_LIVE_P (stmt_info))
4876 return false;
4878 /* Make sure it was already recognized as a reduction computation. */
4879 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4880 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4881 return false;
4883 /* 2. Has this been recognized as a reduction pattern?
4885 Check if STMT represents a pattern that has been recognized
4886 in earlier analysis stages. For stmts that represent a pattern,
4887 the STMT_VINFO_RELATED_STMT field records the last stmt in
4888 the original sequence that constitutes the pattern. */
4890 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4891 if (orig_stmt)
4893 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4894 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4895 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4898 /* 3. Check the operands of the operation. The first operands are defined
4899 inside the loop body. The last operand is the reduction variable,
4900 which is defined by the loop-header-phi. */
4902 gcc_assert (is_gimple_assign (stmt));
4904 /* Flatten RHS. */
4905 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4907 case GIMPLE_SINGLE_RHS:
4908 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4909 if (op_type == ternary_op)
4911 tree rhs = gimple_assign_rhs1 (stmt);
4912 ops[0] = TREE_OPERAND (rhs, 0);
4913 ops[1] = TREE_OPERAND (rhs, 1);
4914 ops[2] = TREE_OPERAND (rhs, 2);
4915 code = TREE_CODE (rhs);
4917 else
4918 return false;
4919 break;
4921 case GIMPLE_BINARY_RHS:
4922 code = gimple_assign_rhs_code (stmt);
4923 op_type = TREE_CODE_LENGTH (code);
4924 gcc_assert (op_type == binary_op);
4925 ops[0] = gimple_assign_rhs1 (stmt);
4926 ops[1] = gimple_assign_rhs2 (stmt);
4927 break;
4929 case GIMPLE_TERNARY_RHS:
4930 code = gimple_assign_rhs_code (stmt);
4931 op_type = TREE_CODE_LENGTH (code);
4932 gcc_assert (op_type == ternary_op);
4933 ops[0] = gimple_assign_rhs1 (stmt);
4934 ops[1] = gimple_assign_rhs2 (stmt);
4935 ops[2] = gimple_assign_rhs3 (stmt);
4936 break;
4938 case GIMPLE_UNARY_RHS:
4939 return false;
4941 default:
4942 gcc_unreachable ();
4944 /* The default is that the reduction variable is the last operand in the statement. */
4945 int reduc_index = op_type - 1;
4947 if (code == COND_EXPR && slp_node)
4948 return false;
4950 scalar_dest = gimple_assign_lhs (stmt);
4951 scalar_type = TREE_TYPE (scalar_dest);
4952 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4953 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4954 return false;
4956 /* Do not try to vectorize bit-precision reductions. */
4957 if ((TYPE_PRECISION (scalar_type)
4958 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4959 return false;
4961 /* All uses but the last are expected to be defined in the loop.
4962 The last use is the reduction variable. In case of nested cycle this
4963 assumption is not true: we use reduc_index to record the index of the
4964 reduction variable. */
4965 for (i = 0; i < op_type - 1; i++)
4967 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4968 if (i == 0 && code == COND_EXPR)
4969 continue;
4971 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4972 &def_stmt, &def, &dt, &tem);
4973 if (!vectype_in)
4974 vectype_in = tem;
4975 gcc_assert (is_simple_use);
4977 if (dt != vect_internal_def
4978 && dt != vect_external_def
4979 && dt != vect_constant_def
4980 && dt != vect_induction_def
4981 && !(dt == vect_nested_cycle && nested_cycle))
4982 return false;
4984 if (dt == vect_nested_cycle)
4986 found_nested_cycle_def = true;
4987 reduc_def_stmt = def_stmt;
4988 reduc_index = i;
4992 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4993 &def_stmt, &def, &dt, &tem);
4994 if (!vectype_in)
4995 vectype_in = tem;
4996 gcc_assert (is_simple_use);
4997 if (!found_nested_cycle_def)
4998 reduc_def_stmt = def_stmt;
5000 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5001 return false;
5003 if (!(dt == vect_reduction_def
5004 || dt == vect_nested_cycle
5005 || ((dt == vect_internal_def || dt == vect_external_def
5006 || dt == vect_constant_def || dt == vect_induction_def)
5007 && nested_cycle && found_nested_cycle_def)))
5009 /* For pattern recognized stmts, orig_stmt might be a reduction,
5010 but some helper statements for the pattern might not, or
5011 might be COND_EXPRs with reduction uses in the condition. */
5012 gcc_assert (orig_stmt);
5013 return false;
5016 if (orig_stmt)
5017 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5018 reduc_def_stmt,
5019 !nested_cycle,
5020 &dummy));
5021 else
5023 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5024 !nested_cycle, &dummy);
5025 /* We changed STMT to be the first stmt in the reduction chain, hence we
5026 check that in this case the first element in the chain is STMT. */
5027 gcc_assert (stmt == tmp
5028 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5031 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5032 return false;
5034 if (slp_node || PURE_SLP_STMT (stmt_info))
5035 ncopies = 1;
5036 else
5037 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5038 / TYPE_VECTOR_SUBPARTS (vectype_in));
5040 gcc_assert (ncopies >= 1);
5042 vec_mode = TYPE_MODE (vectype_in);
5044 if (code == COND_EXPR)
5046 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5048 if (dump_enabled_p ())
5049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5050 "unsupported condition in reduction\n");
5052 return false;
5055 else
5057 /* 4. Supportable by target? */
5059 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5060 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5062 /* Shifts and rotates are only supported by vectorizable_shifts,
5063 not vectorizable_reduction. */
5064 if (dump_enabled_p ())
5065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5066 "unsupported shift or rotation.\n");
5067 return false;
5070 /* 4.1. check support for the operation in the loop */
5071 optab = optab_for_tree_code (code, vectype_in, optab_default);
5072 if (!optab)
5074 if (dump_enabled_p ())
5075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5076 "no optab.\n");
5078 return false;
5081 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5083 if (dump_enabled_p ())
5084 dump_printf (MSG_NOTE, "op not supported by target.\n");
5086 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5087 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5088 < vect_min_worthwhile_factor (code))
5089 return false;
5091 if (dump_enabled_p ())
5092 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5095 /* Worthwhile without SIMD support? */
5096 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5097 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5098 < vect_min_worthwhile_factor (code))
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 "not worthwhile without SIMD support.\n");
5104 return false;
5108 /* 4.2. Check support for the epilog operation.
5110 If STMT represents a reduction pattern, then the type of the
5111 reduction variable may be different than the type of the rest
5112 of the arguments. For example, consider the case of accumulation
5113 of shorts into an int accumulator; the original code:
5114 S1: int_a = (int) short_a;
5115 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5117 was replaced with:
5118 STMT: int_acc = widen_sum <short_a, int_acc>
5120 This means that:
5121 1. The tree-code that is used to create the vector operation in the
5122 epilog code (that reduces the partial results) is not the
5123 tree-code of STMT, but is rather the tree-code of the original
5124 stmt from the pattern that STMT is replacing. I.e, in the example
5125 above we want to use 'widen_sum' in the loop, but 'plus' in the
5126 epilog.
5127 2. The type (mode) we use to check available target support
5128 for the vector operation to be created in the *epilog*, is
5129 determined by the type of the reduction variable (in the example
5130 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5131 However the type (mode) we use to check available target support
5132 for the vector operation to be created *inside the loop*, is
5133 determined by the type of the other arguments to STMT (in the
5134 example we'd check this: optab_handler (widen_sum_optab,
5135 vect_short_mode)).
5137 This is contrary to "regular" reductions, in which the types of all
5138 the arguments are the same as the type of the reduction variable.
5139 For "regular" reductions we can therefore use the same vector type
5140 (and also the same tree-code) when generating the epilog code and
5141 when generating the code inside the loop. */
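/* Continuing the illustration above (the optab names follow the comment
   and are only illustrative): with V8HI arguments and a V4SI accumulator,
   the in-loop operation is checked roughly as
     optab_handler (widen_sum_optab, V8HImode)
   whereas the epilog reduction is checked against the accumulator type,
     optab_handler (plus_optab, V4SImode).  */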
5143 if (orig_stmt)
5145 /* This is a reduction pattern: get the vectype from the type of the
5146 reduction variable, and get the tree-code from orig_stmt. */
5147 orig_code = gimple_assign_rhs_code (orig_stmt);
5148 gcc_assert (vectype_out);
5149 vec_mode = TYPE_MODE (vectype_out);
5151 else
5153 /* Regular reduction: the same vectype and tree-code that are used for
5154 the vector code inside the loop can be used for the epilog code. */
5155 orig_code = code;
5158 if (nested_cycle)
5160 def_bb = gimple_bb (reduc_def_stmt);
5161 def_stmt_loop = def_bb->loop_father;
5162 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5163 loop_preheader_edge (def_stmt_loop));
5164 if (TREE_CODE (def_arg) == SSA_NAME
5165 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5166 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5167 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5168 && vinfo_for_stmt (def_arg_stmt)
5169 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5170 == vect_double_reduction_def)
5171 double_reduc = true;
5174 epilog_reduc_code = ERROR_MARK;
5175 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5177 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5178 optab_default);
5179 if (!reduc_optab)
5181 if (dump_enabled_p ())
5182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5183 "no optab for reduction.\n");
5185 epilog_reduc_code = ERROR_MARK;
5187 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5189 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5190 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5194 "reduc op not supported by target.\n");
5196 epilog_reduc_code = ERROR_MARK;
5200 else
5202 if (!nested_cycle || double_reduc)
5204 if (dump_enabled_p ())
5205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5206 "no reduc code for scalar code.\n");
5208 return false;
5212 if (double_reduc && ncopies > 1)
5214 if (dump_enabled_p ())
5215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5216 "multiple types in double reduction\n");
5218 return false;
5221 /* In case of widening multiplication by a constant, we update the type
5222 of the constant to be the type of the other operand. We check that the
5223 constant fits the type in the pattern recognition pass. */
5224 if (code == DOT_PROD_EXPR
5225 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5227 if (TREE_CODE (ops[0]) == INTEGER_CST)
5228 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5229 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5230 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5231 else
5233 if (dump_enabled_p ())
5234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5235 "invalid types in dot-prod\n");
5237 return false;
5241 if (!vec_stmt) /* transformation not required. */
5243 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5244 reduc_index))
5245 return false;
5246 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5247 return true;
5250 /** Transform. **/
5252 if (dump_enabled_p ())
5253 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5255 /* FORNOW: Multiple types are not supported for condition. */
5256 if (code == COND_EXPR)
5257 gcc_assert (ncopies == 1);
5259 /* Create the destination vector */
5260 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5262 /* In case the vectorization factor (VF) is bigger than the number
5263 of elements that we can fit in a vectype (nunits), we have to generate
5264 more than one vector stmt - i.e. - we need to "unroll" the
5265 vector stmt by a factor VF/nunits. For more details see documentation
5266 in vectorizable_operation. */
5268 /* If the reduction is used in an outer loop we need to generate
5269 VF intermediate results, like so (e.g. for ncopies=2):
5270 r0 = phi (init, r0)
5271 r1 = phi (init, r1)
5272 r0 = x0 + r0;
5273 r1 = x1 + r1;
5274 (i.e. we generate VF results in 2 registers).
5275 In this case we have a separate def-use cycle for each copy, and therefore
5276 for each copy we get the vector def for the reduction variable from the
5277 respective phi node created for this copy.
5279 Otherwise (the reduction is unused in the loop nest), we can combine
5280 together intermediate results, like so (e.g. for ncopies=2):
5281 r = phi (init, r)
5282 r = x0 + r;
5283 r = x1 + r;
5284 (i.e. we generate VF/2 results in a single register).
5285 In this case for each copy we get the vector def for the reduction variable
5286 from the vectorized reduction operation generated in the previous iteration.
5289 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5291 single_defuse_cycle = true;
5292 epilog_copies = 1;
5294 else
5295 epilog_copies = ncopies;
5297 prev_stmt_info = NULL;
5298 prev_phi_info = NULL;
5299 if (slp_node)
5301 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5302 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5303 == TYPE_VECTOR_SUBPARTS (vectype_in));
5305 else
5307 vec_num = 1;
5308 vec_oprnds0.create (1);
5309 if (op_type == ternary_op)
5310 vec_oprnds1.create (1);
5313 phis.create (vec_num);
5314 vect_defs.create (vec_num);
5315 if (!slp_node)
5316 vect_defs.quick_push (NULL_TREE);
5318 for (j = 0; j < ncopies; j++)
5320 if (j == 0 || !single_defuse_cycle)
5322 for (i = 0; i < vec_num; i++)
5324 /* Create the reduction-phi that defines the reduction
5325 operand. */
5326 new_phi = create_phi_node (vec_dest, loop->header);
5327 set_vinfo_for_stmt (new_phi,
5328 new_stmt_vec_info (new_phi, loop_vinfo,
5329 NULL));
5330 if (j == 0 || slp_node)
5331 phis.quick_push (new_phi);
5335 if (code == COND_EXPR)
5337 gcc_assert (!slp_node);
5338 vectorizable_condition (stmt, gsi, vec_stmt,
5339 PHI_RESULT (phis[0]),
5340 reduc_index, NULL);
5341 /* Multiple types are not supported for condition. */
5342 break;
5345 /* Handle uses. */
5346 if (j == 0)
5348 op0 = ops[!reduc_index];
5349 if (op_type == ternary_op)
5351 if (reduc_index == 0)
5352 op1 = ops[2];
5353 else
5354 op1 = ops[1];
5357 if (slp_node)
5358 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5359 slp_node, -1);
5360 else
5362 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5363 stmt, NULL);
5364 vec_oprnds0.quick_push (loop_vec_def0);
5365 if (op_type == ternary_op)
5367 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5368 NULL);
5369 vec_oprnds1.quick_push (loop_vec_def1);
5373 else
5375 if (!slp_node)
5377 enum vect_def_type dt;
5378 gimple dummy_stmt;
5379 tree dummy;
5381 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5382 &dummy_stmt, &dummy, &dt);
5383 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5384 loop_vec_def0);
5385 vec_oprnds0[0] = loop_vec_def0;
5386 if (op_type == ternary_op)
5388 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5389 &dummy, &dt);
5390 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5391 loop_vec_def1);
5392 vec_oprnds1[0] = loop_vec_def1;
5396 if (single_defuse_cycle)
5397 reduc_def = gimple_assign_lhs (new_stmt);
5399 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5402 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5404 if (slp_node)
5405 reduc_def = PHI_RESULT (phis[i]);
5406 else
5408 if (!single_defuse_cycle || j == 0)
5409 reduc_def = PHI_RESULT (new_phi);
5412 def1 = ((op_type == ternary_op)
5413 ? vec_oprnds1[i] : NULL);
5414 if (op_type == binary_op)
5416 if (reduc_index == 0)
5417 expr = build2 (code, vectype_out, reduc_def, def0);
5418 else
5419 expr = build2 (code, vectype_out, def0, reduc_def);
5421 else
5423 if (reduc_index == 0)
5424 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5425 else
5427 if (reduc_index == 1)
5428 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5429 else
5430 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5434 new_stmt = gimple_build_assign (vec_dest, expr);
5435 new_temp = make_ssa_name (vec_dest, new_stmt);
5436 gimple_assign_set_lhs (new_stmt, new_temp);
5437 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5439 if (slp_node)
5441 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5442 vect_defs.quick_push (new_temp);
5444 else
5445 vect_defs[0] = new_temp;
5448 if (slp_node)
5449 continue;
5451 if (j == 0)
5452 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5453 else
5454 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5456 prev_stmt_info = vinfo_for_stmt (new_stmt);
5457 prev_phi_info = vinfo_for_stmt (new_phi);
5460 /* Finalize the reduction-phi (set its arguments) and create the
5461 epilog reduction code. */
5462 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5464 new_temp = gimple_assign_lhs (*vec_stmt);
5465 vect_defs[0] = new_temp;
5468 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5469 epilog_reduc_code, phis, reduc_index,
5470 double_reduc, slp_node);
5472 return true;
5475 /* Function vect_min_worthwhile_factor.
5477 For a loop where we could vectorize the operation indicated by CODE,
5478 return the minimum vectorization factor that makes it worthwhile
5479 to use generic vectors. */
5480 int
5481 vect_min_worthwhile_factor (enum tree_code code)
5483 switch (code)
5485 case PLUS_EXPR:
5486 case MINUS_EXPR:
5487 case NEGATE_EXPR:
5488 return 4;
5490 case BIT_AND_EXPR:
5491 case BIT_IOR_EXPR:
5492 case BIT_XOR_EXPR:
5493 case BIT_NOT_EXPR:
5494 return 2;
5496 default:
5497 return INT_MAX;
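/* A minimal usage sketch (hypothetical caller, for illustration only):
   analysis code would compare the loop's vectorization factor against
   this minimum before resorting to generic (emulated) vector
   operations, along the lines of

       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
              < vect_min_worthwhile_factor (code))
         return false;

   so e.g. an emulated PLUS_EXPR is only considered worthwhile for
   VF >= 4, the bitwise operations for VF >= 2, and everything else
   never (INT_MAX).  */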
5502 /* Function vectorizable_induction
5504 Check if PHI performs an induction computation that can be vectorized.
5505 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5506 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5507 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
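/* Illustrative sketch (editor's example, VF = 4): a scalar induction
   i = phi (0, i + 1) is vectorized by giving the new phi the initial
   vector def {0, 1, 2, 3} and updating it on the latch edge with the
   scalar step multiplied by VF, i.e. vi = vi + {4, 4, 4, 4}, so that
   each vector lane tracks one scalar iteration.  */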
5509 bool
5510 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5511 gimple *vec_stmt)
5513 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5514 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5516 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5517 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5518 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5519 tree vec_def;
5521 gcc_assert (ncopies >= 1);
5522 /* FORNOW. These restrictions should be relaxed. */
5523 if (nested_in_vect_loop_p (loop, phi))
5525 imm_use_iterator imm_iter;
5526 use_operand_p use_p;
5527 gimple exit_phi;
5528 edge latch_e;
5529 tree loop_arg;
5531 if (ncopies > 1)
5533 if (dump_enabled_p ())
5534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5535 "multiple types in nested loop.\n");
5536 return false;
5539 exit_phi = NULL;
5540 latch_e = loop_latch_edge (loop->inner);
5541 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5542 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5544 gimple use_stmt = USE_STMT (use_p);
5545 if (is_gimple_debug (use_stmt))
5546 continue;
5548 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5550 exit_phi = use_stmt;
5551 break;
5554 if (exit_phi)
5556 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5557 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5558 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5560 if (dump_enabled_p ())
5561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5562 "inner-loop induction only used outside "
5563 "of the outer vectorized loop.\n");
5564 return false;
5569 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5570 return false;
5572 /* FORNOW: SLP not supported. */
5573 if (STMT_SLP_TYPE (stmt_info))
5574 return false;
5576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5578 if (gimple_code (phi) != GIMPLE_PHI)
5579 return false;
5581 if (!vec_stmt) /* transformation not required. */
5583 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE, vect_location,
5586 "=== vectorizable_induction ===\n");
5587 vect_model_induction_cost (stmt_info, ncopies);
5588 return true;
5591 /** Transform. **/
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5596 vec_def = get_initial_def_for_induction (phi);
5597 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5598 return true;
5601 /* Function vectorizable_live_operation.
5603 STMT computes a value that is used outside the loop. Check if
5604 it can be supported. */
5606 bool
5607 vectorizable_live_operation (gimple stmt,
5608 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5609 gimple *vec_stmt)
5611 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5613 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5614 int i;
5615 int op_type;
5616 tree op;
5617 tree def;
5618 gimple def_stmt;
5619 enum vect_def_type dt;
5620 enum tree_code code;
5621 enum gimple_rhs_class rhs_class;
5623 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5625 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5626 return false;
5628 if (!is_gimple_assign (stmt))
5630 if (gimple_call_internal_p (stmt)
5631 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5632 && gimple_call_lhs (stmt)
5633 && loop->simduid
5634 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5635 && loop->simduid
5636 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5638 edge e = single_exit (loop);
5639 basic_block merge_bb = e->dest;
5640 imm_use_iterator imm_iter;
5641 use_operand_p use_p;
5642 tree lhs = gimple_call_lhs (stmt);
5644 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5646 gimple use_stmt = USE_STMT (use_p);
5647 if (gimple_code (use_stmt) == GIMPLE_PHI
5648 && gimple_bb (use_stmt) == merge_bb)
5650 if (vec_stmt)
5652 tree vfm1
5653 = build_int_cst (unsigned_type_node,
5654 loop_vinfo->vectorization_factor - 1);
5655 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5657 return true;
5662 return false;
5665 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5666 return false;
5668 /* FORNOW. CHECKME. */
5669 if (nested_in_vect_loop_p (loop, stmt))
5670 return false;
5672 code = gimple_assign_rhs_code (stmt);
5673 op_type = TREE_CODE_LENGTH (code);
5674 rhs_class = get_gimple_rhs_class (code);
5675 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5676 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5678 /* FORNOW: support only if all uses are invariant. This means
5679 that the scalar operations can remain in place, unvectorized.
5680 The original last scalar value that they compute will be used. */
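  /* For instance (illustrative): a live statement t = x * 3, where x
     is defined before the loop and t is also used after the loop, has
     only invariant operands; the scalar statement can stay in place
     and the value it computes in the last iteration (the same in every
     iteration) is what the out-of-loop uses see.  */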
5682 for (i = 0; i < op_type; i++)
5684 if (rhs_class == GIMPLE_SINGLE_RHS)
5685 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5686 else
5687 op = gimple_op (stmt, i + 1);
5688 if (op
5689 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5690 &dt))
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5694 "use not simple.\n");
5695 return false;
5698 if (dt != vect_external_def && dt != vect_constant_def)
5699 return false;
5702 /* No transformation is required for the cases we currently support. */
5703 return true;
5706 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5708 static void
5709 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5711 ssa_op_iter op_iter;
5712 imm_use_iterator imm_iter;
5713 def_operand_p def_p;
5714 gimple ustmt;
5716 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5718 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5720 basic_block bb;
5722 if (!is_gimple_debug (ustmt))
5723 continue;
5725 bb = gimple_bb (ustmt);
5727 if (!flow_bb_inside_loop_p (loop, bb))
5729 if (gimple_debug_bind_p (ustmt))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "killing debug use\n");
5735 gimple_debug_bind_reset_value (ustmt);
5736 update_stmt (ustmt);
5738 else
5739 gcc_unreachable ();
5746 /* This function builds ni_name = number of iterations. Statements
5747 are emitted on the loop preheader edge. */
5749 static tree
5750 vect_build_loop_niters (loop_vec_info loop_vinfo)
5752 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5753 if (TREE_CODE (ni) == INTEGER_CST)
5754 return ni;
5755 else
5757 tree ni_name, var;
5758 gimple_seq stmts = NULL;
5759 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5761 var = create_tmp_var (TREE_TYPE (ni), "niters");
5762 ni_name = force_gimple_operand (ni, &stmts, false, var);
5763 if (stmts)
5764 gsi_insert_seq_on_edge_immediate (pe, stmts);
5766 return ni_name;
5771 /* This function generates the following statements:
5773 ni_name = number of iterations the loop executes
5774 ratio = ni_name / vf
5775 ratio_mult_vf_name = ratio * vf
5777 and places them on the loop preheader edge. */
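/* Worked example (illustrative, assuming vf = 4): for ni_name = 10 the
   generated statements compute ratio = 10 / 4 = 2 vector iterations and
   ratio_mult_vf_name = 2 * 4 = 8, leaving 10 - 8 = 2 iterations for the
   scalar epilog loop.  */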
5779 static void
5780 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5781 tree ni_name,
5782 tree *ratio_mult_vf_name_ptr,
5783 tree *ratio_name_ptr)
5785 tree ni_minus_gap_name;
5786 tree var;
5787 tree ratio_name;
5788 tree ratio_mult_vf_name;
5789 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5790 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5791 tree log_vf;
5793 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5795 /* If an epilogue loop is required because of data accesses with gaps, we
5796 subtract one iteration from the total number of iterations here for
5797 correct calculation of RATIO. */
5798 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5800 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5801 ni_name,
5802 build_one_cst (TREE_TYPE (ni_name)));
5803 if (!is_gimple_val (ni_minus_gap_name))
5805 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5806 gimple stmts = NULL;
5807 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5808 true, var);
5809 gsi_insert_seq_on_edge_immediate (pe, stmts);
5812 else
5813 ni_minus_gap_name = ni_name;
5815 /* Create: ratio = ni >> log2(vf) */
5816 /* ??? As we have ni == number of latch executions + 1, ni could
5817 have overflowed to zero.  So avoid computing the ratio directly
5818 from ni; instead use the fact that the ratio will be at least one
5819 and compute it as ((ni - vf) >> log2 (vf)) + 1. */
5820 ratio_name
5821 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5822 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5823 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5824 ni_minus_gap_name,
5825 build_int_cst
5826 (TREE_TYPE (ni_name), vf)),
5827 log_vf),
5828 build_int_cst (TREE_TYPE (ni_name), 1));
5829 if (!is_gimple_val (ratio_name))
5831 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5832 gimple stmts = NULL;
5833 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5834 gsi_insert_seq_on_edge_immediate (pe, stmts);
5836 *ratio_name_ptr = ratio_name;
5838 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5840 if (ratio_mult_vf_name_ptr)
5842 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5843 ratio_name, log_vf);
5844 if (!is_gimple_val (ratio_mult_vf_name))
5846 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5847 gimple stmts = NULL;
5848 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5849 true, var);
5850 gsi_insert_seq_on_edge_immediate (pe, stmts);
5852 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5855 return;
5859 /* Function vect_transform_loop.
5861 The analysis phase has determined that the loop is vectorizable.
5862 Vectorize the loop - create vectorized stmts to replace the scalar
5863 stmts in the loop, and update the loop exit condition. */
5865 void
5866 vect_transform_loop (loop_vec_info loop_vinfo)
5868 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5869 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5870 int nbbs = loop->num_nodes;
5871 int i;
5872 tree ratio = NULL;
5873 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5874 bool grouped_store;
5875 bool slp_scheduled = false;
5876 gimple stmt, pattern_stmt;
5877 gimple_seq pattern_def_seq = NULL;
5878 gimple_stmt_iterator pattern_def_si = gsi_none ();
5879 bool transform_pattern_stmt = false;
5880 bool check_profitability = false;
5881 int th;
5882 /* Record number of iterations before we started tampering with the profile. */
5883 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5885 if (dump_enabled_p ())
5886 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5888 /* If the profile is imprecise, we have a chance to fix it up. */
5889 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5890 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5892 /* Use the more conservative vectorization threshold. If the number
5893 of iterations is constant, assume the cost check has been performed
5894 by our caller.  If the threshold makes all loops profitable that
5895 run at least the vectorization factor number of times, checking
5896 is pointless, too. */
5897 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5898 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5899 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Profitability threshold is %d loop iterations.\n",
5904 th);
5905 check_profitability = true;
5908 /* Version the loop first, if required, so the profitability check
5909 comes first. */
5911 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5912 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5914 vect_loop_versioning (loop_vinfo, th, check_profitability);
5915 check_profitability = false;
5918 tree ni_name = vect_build_loop_niters (loop_vinfo);
5919 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5921 /* Peel the loop if there are data refs with unknown alignment.
5922 Only one data ref with unknown alignment is allowed. */
5924 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5926 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5927 th, check_profitability);
5928 check_profitability = false;
5929 /* The above adjusts LOOP_VINFO_NITERS, so ni_name must be
5930 re-computed. */
5931 ni_name = NULL_TREE;
5934 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5935 compile-time constant), or it is a constant that is not a multiple of the
5936 vectorization factor, then an epilog loop needs to be created.
5937 We therefore duplicate the loop: the original loop will be vectorized,
5938 and will compute the first (n/VF) iterations. The second copy of the loop
5939 will remain scalar and will compute the remaining (n%VF) iterations.
5940 (VF is the vectorization factor). */
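   /* For example (illustrative): with n = 103 and VF = 8 the
      vectorized loop executes 103 / 8 = 12 vector iterations, covering
      the first 96 scalar iterations, and the scalar epilog loop
      executes the remaining 103 % 8 = 7 iterations.  */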
5942 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5943 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5945 tree ratio_mult_vf;
5946 if (!ni_name)
5947 ni_name = vect_build_loop_niters (loop_vinfo);
5948 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5949 &ratio);
5950 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5951 th, check_profitability);
5953 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5954 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5955 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5956 else
5958 if (!ni_name)
5959 ni_name = vect_build_loop_niters (loop_vinfo);
5960 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5963 /* 1) Make sure the loop header has exactly two entries
5964 2) Make sure we have a preheader basic block. */
5966 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5968 split_edge (loop_preheader_edge (loop));
5970 /* FORNOW: the vectorizer supports only loops whose body consists
5971 of one basic block (header + empty latch).  When the vectorizer
5972 supports more involved loop forms, the order in which the BBs are
5973 traversed will need to be reconsidered. */
5975 for (i = 0; i < nbbs; i++)
5977 basic_block bb = bbs[i];
5978 stmt_vec_info stmt_info;
5980 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5981 gsi_next (&si))
5983 gphi *phi = si.phi ();
5984 if (dump_enabled_p ())
5986 dump_printf_loc (MSG_NOTE, vect_location,
5987 "------>vectorizing phi: ");
5988 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5989 dump_printf (MSG_NOTE, "\n");
5991 stmt_info = vinfo_for_stmt (phi);
5992 if (!stmt_info)
5993 continue;
5995 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5996 vect_loop_kill_debug_uses (loop, phi);
5998 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5999 && !STMT_VINFO_LIVE_P (stmt_info))
6000 continue;
6002 if (STMT_VINFO_VECTYPE (stmt_info)
6003 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6004 != (unsigned HOST_WIDE_INT) vectorization_factor)
6005 && dump_enabled_p ())
6006 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6008 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6010 if (dump_enabled_p ())
6011 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6012 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6016 pattern_stmt = NULL;
6017 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6018 !gsi_end_p (si) || transform_pattern_stmt;)
6020 bool is_store;
6022 if (transform_pattern_stmt)
6023 stmt = pattern_stmt;
6024 else
6026 stmt = gsi_stmt (si);
6027 /* During vectorization remove existing clobber stmts. */
6028 if (gimple_clobber_p (stmt))
6030 unlink_stmt_vdef (stmt);
6031 gsi_remove (&si, true);
6032 release_defs (stmt);
6033 continue;
6037 if (dump_enabled_p ())
6039 dump_printf_loc (MSG_NOTE, vect_location,
6040 "------>vectorizing statement: ");
6041 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6042 dump_printf (MSG_NOTE, "\n");
6045 stmt_info = vinfo_for_stmt (stmt);
6047 /* vector stmts created in the outer-loop during vectorization of
6048 stmts in an inner-loop may not have a stmt_info, and do not
6049 need to be vectorized. */
6050 if (!stmt_info)
6052 gsi_next (&si);
6053 continue;
6056 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6057 vect_loop_kill_debug_uses (loop, stmt);
6059 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6060 && !STMT_VINFO_LIVE_P (stmt_info))
6062 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6063 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6064 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6065 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6067 stmt = pattern_stmt;
6068 stmt_info = vinfo_for_stmt (stmt);
6070 else
6072 gsi_next (&si);
6073 continue;
6076 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6077 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6078 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6079 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6080 transform_pattern_stmt = true;
6082 /* If pattern statement has def stmts, vectorize them too. */
6083 if (is_pattern_stmt_p (stmt_info))
6085 if (pattern_def_seq == NULL)
6087 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6088 pattern_def_si = gsi_start (pattern_def_seq);
6090 else if (!gsi_end_p (pattern_def_si))
6091 gsi_next (&pattern_def_si);
6092 if (pattern_def_seq != NULL)
6094 gimple pattern_def_stmt = NULL;
6095 stmt_vec_info pattern_def_stmt_info = NULL;
6097 while (!gsi_end_p (pattern_def_si))
6099 pattern_def_stmt = gsi_stmt (pattern_def_si);
6100 pattern_def_stmt_info
6101 = vinfo_for_stmt (pattern_def_stmt);
6102 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6103 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6104 break;
6105 gsi_next (&pattern_def_si);
6108 if (!gsi_end_p (pattern_def_si))
6110 if (dump_enabled_p ())
6112 dump_printf_loc (MSG_NOTE, vect_location,
6113 "==> vectorizing pattern def "
6114 "stmt: ");
6115 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6116 pattern_def_stmt, 0);
6117 dump_printf (MSG_NOTE, "\n");
6120 stmt = pattern_def_stmt;
6121 stmt_info = pattern_def_stmt_info;
6123 else
6125 pattern_def_si = gsi_none ();
6126 transform_pattern_stmt = false;
6129 else
6130 transform_pattern_stmt = false;
6133 if (STMT_VINFO_VECTYPE (stmt_info))
6135 unsigned int nunits
6136 = (unsigned int)
6137 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6138 if (!STMT_SLP_TYPE (stmt_info)
6139 && nunits != (unsigned int) vectorization_factor
6140 && dump_enabled_p ())
6141 /* For SLP, VF is set according to the unrolling factor, and not
6142 to the vector size, hence this message is not valid for SLP. */
6143 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6146 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6147 reached. */
6148 if (STMT_SLP_TYPE (stmt_info))
6150 if (!slp_scheduled)
6152 slp_scheduled = true;
6154 if (dump_enabled_p ())
6155 dump_printf_loc (MSG_NOTE, vect_location,
6156 "=== scheduling SLP instances ===\n");
6158 vect_schedule_slp (loop_vinfo, NULL);
6161 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6162 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6164 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6166 pattern_def_seq = NULL;
6167 gsi_next (&si);
6169 continue;
6173 /* -------- vectorize statement ------------ */
6174 if (dump_enabled_p ())
6175 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6177 grouped_store = false;
6178 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6179 if (is_store)
6181 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6183 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6184 interleaving chain was completed - free all the stores in
6185 the chain. */
6186 gsi_next (&si);
6187 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6189 else
6191 /* Free the attached stmt_vec_info and remove the stmt. */
6192 gimple store = gsi_stmt (si);
6193 free_stmt_vec_info (store);
6194 unlink_stmt_vdef (store);
6195 gsi_remove (&si, true);
6196 release_defs (store);
6199 /* Stores can only appear at the end of pattern statements. */
6200 gcc_assert (!transform_pattern_stmt);
6201 pattern_def_seq = NULL;
6203 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6205 pattern_def_seq = NULL;
6206 gsi_next (&si);
6208 } /* stmts in BB */
6209 } /* BBs in loop */
6211 slpeel_make_loop_iterate_ntimes (loop, ratio);
6213 /* Reduce loop iterations by the vectorization factor. */
6214 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6215 expected_iterations / vectorization_factor);
6216 loop->nb_iterations_upper_bound
6217 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6218 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6219 && loop->nb_iterations_upper_bound != 0)
6220 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6221 if (loop->any_estimate)
6223 loop->nb_iterations_estimate
6224 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6225 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6226 && loop->nb_iterations_estimate != 0)
6227 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
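  /* Illustrative numbers (editor's example): if the scalar loop was
     estimated to iterate 100 times and VF = 4, the estimate for the
     vectorized loop becomes 100 / 4 = 25; the upper bound is divided
     the same way, and when peeling for gaps one further iteration is
     subtracted because the vectorized loop then executes one iteration
     fewer, leaving it to the scalar epilog.  */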
6230 if (dump_enabled_p ())
6232 dump_printf_loc (MSG_NOTE, vect_location,
6233 "LOOP VECTORIZED\n");
6234 if (loop->inner)
6235 dump_printf_loc (MSG_NOTE, vect_location,
6236 "OUTER LOOP VECTORIZED\n");
6237 dump_printf (MSG_NOTE, "\n");