gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "hash-set.h"
  28 #include "machmode.h"
  29 #include "vec.h"
  30 #include "double-int.h"
  31 #include "input.h"
  32 #include "alias.h"
  33 #include "symtab.h"
  34 #include "wide-int.h"
  35 #include "inchash.h"
  36 #include "tree.h"
  37 #include "fold-const.h"
  38 #include "stor-layout.h"
  39 #include "predict.h"
  40 #include "hard-reg-set.h"
  41 #include "function.h"
  42 #include "dominance.h"
  43 #include "cfg.h"
  44 #include "cfganal.h"
  45 #include "basic-block.h"
  46 #include "gimple-pretty-print.h"
  47 #include "tree-ssa-alias.h"
  48 #include "internal-fn.h"
  49 #include "gimple-expr.h"
  50 #include "is-a.h"
  51 #include "gimple.h"
  52 #include "gimplify.h"
  53 #include "gimple-iterator.h"
  54 #include "gimplify-me.h"
  55 #include "gimple-ssa.h"
  56 #include "tree-phinodes.h"
  57 #include "ssa-iterators.h"
  58 #include "stringpool.h"
  59 #include "tree-ssanames.h"
  60 #include "tree-ssa-loop-ivopts.h"
  61 #include "tree-ssa-loop-manip.h"
  62 #include "tree-ssa-loop-niter.h"
  63 #include "tree-pass.h"
  64 #include "cfgloop.h"
  65 #include "hashtab.h"
  66 #include "rtl.h"
  67 #include "flags.h"
  68 #include "statistics.h"
  69 #include "real.h"
  70 #include "fixed-value.h"
  71 #include "insn-config.h"
  72 #include "expmed.h"
  73 #include "dojump.h"
  74 #include "explow.h"
  75 #include "calls.h"
  76 #include "emit-rtl.h"
  77 #include "varasm.h"
  78 #include "stmt.h"
  79 #include "expr.h"
  80 #include "recog.h"
  81 #include "insn-codes.h"
  82 #include "optabs.h"
  83 #include "params.h"
  84 #include "diagnostic-core.h"
  85 #include "tree-chrec.h"
  86 #include "tree-scalar-evolution.h"
  87 #include "tree-vectorizer.h"
  88 #include "target.h"
  89
  90 /* Loop Vectorization Pass.
  91
  92    This pass tries to vectorize loops.
  93
  94    For example, the vectorizer transforms the following simple loop:
  95
  96         short a[N]; short b[N]; short c[N]; int i;
  97
  98         for (i=0; i<N; i++){
  99           a[i] = b[i] + c[i];
 100         }
 101
 102    as if it was manually vectorized by rewriting the source code into:
 103
 104         typedef int __attribute__((mode(V8HI))) v8hi;
 105         short a[N];  short b[N]; short c[N];   int i;
 106         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
 107         v8hi va, vb, vc;
 108
 109         for (i=0; i<N/8; i++){
 110           vb = pb[i];
 111           vc = pc[i];
 112           va = vb + vc;
 113           pa[i] = va;
 114         }
 115
 116         The main entry to this pass is vectorize_loops(), in which
 117    the vectorizer applies a set of analyses on a given set of loops,
 118    followed by the actual vectorization transformation for the loops that
 119    had successfully passed the analysis phase.
 120         Throughout this pass we make a distinction between two types of
 121    data: scalars (which are represented by SSA_NAMES), and memory references
 122    ("data-refs").  These two types of data require different handling both
 123    during analysis and transformation. The types of data-refs that the
 124    vectorizer currently supports are ARRAY_REFS which base is an array DECL
 125    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 126    accesses are required to have a simple (consecutive) access pattern.
 127
 128    Analysis phase:
 129    ===============
 130         The driver for the analysis phase is vect_analyze_loop().
 131    It applies a set of analyses, some of which rely on the scalar evolution
 132    analyzer (scev) developed by Sebastian Pop.
 133
 134         During the analysis phase the vectorizer records some information
 135    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 136    loop, as well as general information about the loop as a whole, which is
 137    recorded in a "loop_vec_info" struct attached to each loop.
 138
 139    Transformation phase:
 140    =====================
 141         The loop transformation phase scans all the stmts in the loop, and
 142    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 143    the loop that needs to be vectorized.  It inserts the vector code sequence
 144    just before the scalar stmt S, and records a pointer to the vector code
 145    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 146    attached to S).  This pointer will be used for the vectorization of following
 147    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 148    otherwise, we rely on dead code elimination for removing it.
 149
 150         For example, say stmt S1 was vectorized into stmt VS1:
 151
 152    VS1: vb = px[i];
 153    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 154    S2:  a = b;
 155
 156    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 157    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 158    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 159    resulting sequence would be:
 160
 161    VS1: vb = px[i];
 162    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 163    VS2: va = vb;
 164    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 165
 166         Operands that are not SSA_NAMEs, are data-refs that appear in
 167    load/store operations (like 'x[i]' in S1), and are handled differently.
 168
 169    Target modeling:
 170    =================
 171         Currently the only target specific information that is used is the
 172    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 173    Targets that can support different sizes of vectors, for now will need
 174    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 175    flexibility will be added in the future.
 176
 177         Since we only vectorize operations which vector form can be
 178    expressed using existing tree codes, to verify that an operation is
 179    supported, the vectorizer checks the relevant optab at the relevant
 180    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 181    the value found is CODE_FOR_nothing, then there's no target support, and
 182    we can't vectorize the stmt.
 183
 184    For additional information on this project see:
 185    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 186 */
 187
 188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 189
 190 /* Function vect_determine_vectorization_factor
 191
 192    Determine the vectorization factor (VF).  VF is the number of data elements
 193    that are operated upon in parallel in a single iteration of the vectorized
 194    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 195    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 196    elements can fit in a single vector register.
 197
 198    We currently support vectorization of loops in which all types operated upon
 199    are of the same size.  Therefore this function currently sets VF according to
 200    the size of the types operated upon, and fails if there are multiple sizes
 201    in the loop.
 202
 203    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 204    original loop:
 205         for (i=0; i<N; i++){
 206           a[i] = b[i] + c[i];
 207         }
 208
 209    vectorized loop:
 210         for (i=0; i<N; i+=VF){
 211           a[i:VF] = b[i:VF] + c[i:VF];
 212         }
 213 */
 214
 215 static bool
 216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 217 {
 218   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 219   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 220   int nbbs = loop->num_nodes;
 221   unsigned int vectorization_factor = 0;
 222   tree scalar_type;
 223   gphi *phi;
 224   tree vectype;
 225   unsigned int nunits;
 226   stmt_vec_info stmt_info;
 227   int i;
 228   HOST_WIDE_INT dummy;
 229   gimple stmt, pattern_stmt = NULL;
 230   gimple_seq pattern_def_seq = NULL;
 231   gimple_stmt_iterator pattern_def_si = gsi_none ();
 232   bool analyze_pattern_stmt = false;
 233
 234   if (dump_enabled_p ())
 235     dump_printf_loc (MSG_NOTE, vect_location,
 236                      "=== vect_determine_vectorization_factor ===\n");
 237
 238   for (i = 0; i < nbbs; i++)
 239     {
 240       basic_block bb = bbs[i];
 241
 242       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 243            gsi_next (&si))
 244         {
 245           phi = si.phi ();
 246           stmt_info = vinfo_for_stmt (phi);
 247           if (dump_enabled_p ())
 248             {
 249               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 250               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 251               dump_printf (MSG_NOTE, "\n");
 252             }
 253
 254           gcc_assert (stmt_info);
 255
 256           if (STMT_VINFO_RELEVANT_P (stmt_info))
 257             {
 258               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 259               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 260
 261               if (dump_enabled_p ())
 262                 {
 263                   dump_printf_loc (MSG_NOTE, vect_location,
 264                                    "get vectype for scalar type:  ");
 265                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 266                   dump_printf (MSG_NOTE, "\n");
 267                 }
 268
 269               vectype = get_vectype_for_scalar_type (scalar_type);
 270               if (!vectype)
 271                 {
 272                   if (dump_enabled_p ())
 273                     {
 274                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 275                                        "not vectorized: unsupported "
 276                                        "data-type ");
 277                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 278                                          scalar_type);
 279                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 280                     }
 281                   return false;
 282                 }
 283               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 284
 285               if (dump_enabled_p ())
 286                 {
 287                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 288                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 289                   dump_printf (MSG_NOTE, "\n");
 290                 }
 291
 292               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 293               if (dump_enabled_p ())
 294                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 295                                  nunits);
 296
 297               if (!vectorization_factor
 298                   || (nunits > vectorization_factor))
 299                 vectorization_factor = nunits;
 300             }
 301         }
 302
 303       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 304            !gsi_end_p (si) || analyze_pattern_stmt;)
 305         {
 306           tree vf_vectype;
 307
 308           if (analyze_pattern_stmt)
 309             stmt = pattern_stmt;
 310           else
 311             stmt = gsi_stmt (si);
 312
 313           stmt_info = vinfo_for_stmt (stmt);
 314
 315           if (dump_enabled_p ())
 316             {
 317               dump_printf_loc (MSG_NOTE, vect_location,
 318                                "==> examining statement: ");
 319               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 320               dump_printf (MSG_NOTE, "\n");
 321             }
 322
 323           gcc_assert (stmt_info);
 324
 325           /* Skip stmts which do not need to be vectorized.  */
 326           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 327                && !STMT_VINFO_LIVE_P (stmt_info))
 328               || gimple_clobber_p (stmt))
 329             {
 330               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 331                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 332                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 333                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 334                 {
 335                   stmt = pattern_stmt;
 336                   stmt_info = vinfo_for_stmt (pattern_stmt);
 337                   if (dump_enabled_p ())
 338                     {
 339                       dump_printf_loc (MSG_NOTE, vect_location,
 340                                        "==> examining pattern statement: ");
 341                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 342                       dump_printf (MSG_NOTE, "\n");
 343                     }
 344                 }
 345               else
 346                 {
 347                   if (dump_enabled_p ())
 348                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 349                   gsi_next (&si);
 350                   continue;
 351                 }
 352             }
 353           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 354                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 355                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 356                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 357             analyze_pattern_stmt = true;
 358
 359           /* If a pattern statement has def stmts, analyze them too.  */
 360           if (is_pattern_stmt_p (stmt_info))
 361             {
 362               if (pattern_def_seq == NULL)
 363                 {
 364                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 365                   pattern_def_si = gsi_start (pattern_def_seq);
 366                 }
 367               else if (!gsi_end_p (pattern_def_si))
 368                 gsi_next (&pattern_def_si);
 369               if (pattern_def_seq != NULL)
 370                 {
 371                   gimple pattern_def_stmt = NULL;
 372                   stmt_vec_info pattern_def_stmt_info = NULL;
 373
 374                   while (!gsi_end_p (pattern_def_si))
 375                     {
 376                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 377                       pattern_def_stmt_info
 378                         = vinfo_for_stmt (pattern_def_stmt);
 379                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 380                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 381                         break;
 382                       gsi_next (&pattern_def_si);
 383                     }
 384
 385                   if (!gsi_end_p (pattern_def_si))
 386                     {
 387                       if (dump_enabled_p ())
 388                         {
 389                           dump_printf_loc (MSG_NOTE, vect_location,
 390                                            "==> examining pattern def stmt: ");
 391                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 392                                             pattern_def_stmt, 0);
 393                           dump_printf (MSG_NOTE, "\n");
 394                         }
 395
 396                       stmt = pattern_def_stmt;
 397                       stmt_info = pattern_def_stmt_info;
 398                     }
 399                   else
 400                     {
 401                       pattern_def_si = gsi_none ();
 402                       analyze_pattern_stmt = false;
 403                     }
 404                 }
 405               else
 406                 analyze_pattern_stmt = false;
 407             }
 408
 409           if (gimple_get_lhs (stmt) == NULL_TREE
 410               /* MASK_STORE has no lhs, but is ok.  */
 411               && (!is_gimple_call (stmt)
 412                   || !gimple_call_internal_p (stmt)
 413                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 414             {
 415               if (is_gimple_call (stmt))
 416                 {
 417                   /* Ignore calls with no lhs.  These must be calls to
 418                      #pragma omp simd functions, and what vectorization factor
 419                      it really needs can't be determined until
 420                      vectorizable_simd_clone_call.  */
 421                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 422                     {
 423                       pattern_def_seq = NULL;
 424                       gsi_next (&si);
 425                     }
 426                   continue;
 427                 }
 428               if (dump_enabled_p ())
 429                 {
 430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 431                                    "not vectorized: irregular stmt.");
 432                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 433                                     0);
 434                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 435                 }
 436               return false;
 437             }
 438
 439           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 440             {
 441               if (dump_enabled_p ())
 442                 {
 443                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 444                                    "not vectorized: vector stmt in loop:");
 445                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 446                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 447                 }
 448               return false;
 449             }
 450
 451           if (STMT_VINFO_VECTYPE (stmt_info))
 452             {
 453               /* The only case when a vectype had been already set is for stmts
 454                  that contain a dataref, or for "pattern-stmts" (stmts
 455                  generated by the vectorizer to represent/replace a certain
 456                  idiom).  */
 457               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 458                           || is_pattern_stmt_p (stmt_info)
 459                           || !gsi_end_p (pattern_def_si));
 460               vectype = STMT_VINFO_VECTYPE (stmt_info);
 461             }
 462           else
 463             {
 464               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 465               if (is_gimple_call (stmt)
 466                   && gimple_call_internal_p (stmt)
 467                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 468                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 469               else
 470                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 471               if (dump_enabled_p ())
 472                 {
 473                   dump_printf_loc (MSG_NOTE, vect_location,
 474                                    "get vectype for scalar type:  ");
 475                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 476                   dump_printf (MSG_NOTE, "\n");
 477                 }
 478               vectype = get_vectype_for_scalar_type (scalar_type);
 479               if (!vectype)
 480                 {
 481                   if (dump_enabled_p ())
 482                     {
 483                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 484                                        "not vectorized: unsupported "
 485                                        "data-type ");
 486                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 487                                          scalar_type);
 488                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 489                     }
 490                   return false;
 491                 }
 492
 493               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 494
 495               if (dump_enabled_p ())
 496                 {
 497                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 498                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 499                   dump_printf (MSG_NOTE, "\n");
 500                 }
 501             }
 502
 503           /* The vectorization factor is according to the smallest
 504              scalar type (or the largest vector size, but we only
 505              support one vector size per loop).  */
 506           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                        &dummy);
 508           if (dump_enabled_p ())
 509             {
 510               dump_printf_loc (MSG_NOTE, vect_location,
 511                                "get vectype for scalar type:  ");
 512               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513               dump_printf (MSG_NOTE, "\n");
 514             }
 515           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516           if (!vf_vectype)
 517             {
 518               if (dump_enabled_p ())
 519                 {
 520                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 521                                    "not vectorized: unsupported data-type ");
 522                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 523                                      scalar_type);
 524                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 525                 }
 526               return false;
 527             }
 528
 529           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 530                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 531             {
 532               if (dump_enabled_p ())
 533                 {
 534                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 535                                    "not vectorized: different sized vector "
 536                                    "types in statement, ");
 537                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 538                                      vectype);
 539                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 540                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 541                                      vf_vectype);
 542                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 543                 }
 544               return false;
 545             }
 546
 547           if (dump_enabled_p ())
 548             {
 549               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 550               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 551               dump_printf (MSG_NOTE, "\n");
 552             }
 553
 554           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 555           if (dump_enabled_p ())
 556             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 557           if (!vectorization_factor
 558               || (nunits > vectorization_factor))
 559             vectorization_factor = nunits;
 560
 561           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 562             {
 563               pattern_def_seq = NULL;
 564               gsi_next (&si);
 565             }
 566         }
 567     }
 568
 569   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 570   if (dump_enabled_p ())
 571     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 572                      vectorization_factor);
 573   if (vectorization_factor <= 1)
 574     {
 575       if (dump_enabled_p ())
 576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 577                          "not vectorized: unsupported data-type\n");
 578       return false;
 579     }
 580   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 581
 582   return true;
 583 }
 584
 585
 586 /* Function vect_is_simple_iv_evolution.
 587
 588    FORNOW: A simple evolution of an induction variables in the loop is
 589    considered a polynomial evolution.  */
 590
 591 static bool
 592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 593                              tree * step)
 594 {
 595   tree init_expr;
 596   tree step_expr;
 597   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 598   basic_block bb;
 599
 600   /* When there is no evolution in this loop, the evolution function
 601      is not "simple".  */
 602   if (evolution_part == NULL_TREE)
 603     return false;
 604
 605   /* When the evolution is a polynomial of degree >= 2
 606      the evolution function is not "simple".  */
 607   if (tree_is_chrec (evolution_part))
 608     return false;
 609
 610   step_expr = evolution_part;
 611   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 612
 613   if (dump_enabled_p ())
 614     {
 615       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 616       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 617       dump_printf (MSG_NOTE, ",  init: ");
 618       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 619       dump_printf (MSG_NOTE, "\n");
 620     }
 621
 622   *init = init_expr;
 623   *step = step_expr;
 624
 625   if (TREE_CODE (step_expr) != INTEGER_CST
 626       && (TREE_CODE (step_expr) != SSA_NAME
 627           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 628               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 629           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 630               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 631                   || !flag_associative_math)))
 632       && (TREE_CODE (step_expr) != REAL_CST
 633           || !flag_associative_math))
 634     {
 635       if (dump_enabled_p ())
 636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 637                          "step unknown.\n");
 638       return false;
 639     }
 640
 641   return true;
 642 }
 643
 644 /* Function vect_analyze_scalar_cycles_1.
 645
 646    Examine the cross iteration def-use cycles of scalar variables
 647    in LOOP.  LOOP_VINFO represents the loop that is now being
 648    considered for vectorization (can be LOOP, or an outer-loop
 649    enclosing LOOP).  */
 650
 651 static void
 652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 653 {
 654   basic_block bb = loop->header;
 655   tree init, step;
 656   auto_vec<gimple, 64> worklist;
 657   gphi_iterator gsi;
 658   bool double_reduc;
 659
 660   if (dump_enabled_p ())
 661     dump_printf_loc (MSG_NOTE, vect_location,
 662                      "=== vect_analyze_scalar_cycles ===\n");
 663
 664   /* First - identify all inductions.  Reduction detection assumes that all the
 665      inductions have been identified, therefore, this order must not be
 666      changed.  */
 667   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 668     {
 669       gphi *phi = gsi.phi ();
 670       tree access_fn = NULL;
 671       tree def = PHI_RESULT (phi);
 672       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 673
 674       if (dump_enabled_p ())
 675         {
 676           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 677           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 678           dump_printf (MSG_NOTE, "\n");
 679         }
 680
 681       /* Skip virtual phi's.  The data dependences that are associated with
 682          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 683       if (virtual_operand_p (def))
 684         continue;
 685
 686       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 687
 688       /* Analyze the evolution function.  */
 689       access_fn = analyze_scalar_evolution (loop, def);
 690       if (access_fn)
 691         {
 692           STRIP_NOPS (access_fn);
 693           if (dump_enabled_p ())
 694             {
 695               dump_printf_loc (MSG_NOTE, vect_location,
 696                                "Access function of PHI: ");
 697               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 698               dump_printf (MSG_NOTE, "\n");
 699             }
 700           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 701             = evolution_part_in_loop_num (access_fn, loop->num);
 702         }
 703
 704       if (!access_fn
 705           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 706           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 707               && TREE_CODE (step) != INTEGER_CST))
 708         {
 709           worklist.safe_push (phi);
 710           continue;
 711         }
 712
 713       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 714
 715       if (dump_enabled_p ())
 716         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 717       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 718     }
 719
 720
 721   /* Second - identify all reductions and nested cycles.  */
 722   while (worklist.length () > 0)
 723     {
 724       gimple phi = worklist.pop ();
 725       tree def = PHI_RESULT (phi);
 726       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 727       gimple reduc_stmt;
 728       bool nested_cycle;
 729
 730       if (dump_enabled_p ())
 731         {
 732           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 733           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 734           dump_printf (MSG_NOTE, "\n");
 735         }
 736
 737       gcc_assert (!virtual_operand_p (def)
 738                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 739
 740       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 741       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 742                                                 &double_reduc);
 743       if (reduc_stmt)
 744         {
 745           if (double_reduc)
 746             {
 747               if (dump_enabled_p ())
 748                 dump_printf_loc (MSG_NOTE, vect_location,
 749                                  "Detected double reduction.\n");
 750
 751               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 752               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 753                                                     vect_double_reduction_def;
 754             }
 755           else
 756             {
 757               if (nested_cycle)
 758                 {
 759                   if (dump_enabled_p ())
 760                     dump_printf_loc (MSG_NOTE, vect_location,
 761                                      "Detected vectorizable nested cycle.\n");
 762
 763                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 764                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 765                                                              vect_nested_cycle;
 766                 }
 767               else
 768                 {
 769                   if (dump_enabled_p ())
 770                     dump_printf_loc (MSG_NOTE, vect_location,
 771                                      "Detected reduction.\n");
 772
 773                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 774                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 775                                                            vect_reduction_def;
 776                   /* Store the reduction cycles for possible vectorization in
 777                      loop-aware SLP.  */
 778                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 779                 }
 780             }
 781         }
 782       else
 783         if (dump_enabled_p ())
 784           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 785                            "Unknown def-use cycle pattern.\n");
 786     }
 787 }
 788
 789
 790 /* Function vect_analyze_scalar_cycles.
 791
 792    Examine the cross iteration def-use cycles of scalar variables, by
 793    analyzing the loop-header PHIs of scalar variables.  Classify each
 794    cycle as one of the following: invariant, induction, reduction, unknown.
 795    We do that for the loop represented by LOOP_VINFO, and also to its
 796    inner-loop, if exists.
 797    Examples for scalar cycles:
 798
 799    Example1: reduction:
 800
 801               loop1:
 802               for (i=0; i<N; i++)
 803                  sum += a[i];
 804
 805    Example2: induction:
 806
 807               loop2:
 808               for (i=0; i<N; i++)
 809                  a[i] = i;  */
 810
 811 static void
 812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 813 {
 814   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 815
 816   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 817
 818   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 819      Reductions in such inner-loop therefore have different properties than
 820      the reductions in the nest that gets vectorized:
 821      1. When vectorized, they are executed in the same order as in the original
 822         scalar loop, so we can't change the order of computation when
 823         vectorizing them.
 824      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 825         current checks are too strict.  */
 826
 827   if (loop->inner)
 828     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 829 }
 830
 831
 832 /* Function vect_get_loop_niters.
 833
 834    Determine how many iterations the loop is executed and place it
 835    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 836    in NUMBER_OF_ITERATIONSM1.
 837
 838    Return the loop exit condition.  */
 839
 840
 841 static gcond *
 842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 843                       tree *number_of_iterationsm1)
 844 {
 845   tree niters;
 846
 847   if (dump_enabled_p ())
 848     dump_printf_loc (MSG_NOTE, vect_location,
 849                      "=== get_loop_niters ===\n");
 850
 851   niters = number_of_latch_executions (loop);
 852   *number_of_iterationsm1 = niters;
 853
 854   /* We want the number of loop header executions which is the number
 855      of latch executions plus one.
 856      ???  For UINT_MAX latch executions this number overflows to zero
 857      for loops like do { n++; } while (n != 0);  */
 858   if (niters && !chrec_contains_undetermined (niters))
 859     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 860                           build_int_cst (TREE_TYPE (niters), 1));
 861   *number_of_iterations = niters;
 862
 863   return get_loop_exit_condition (loop);
 864 }
 865
 866
 867 /* Function bb_in_loop_p
 868
 869    Used as predicate for dfs order traversal of the loop bbs.  */
 870
 871 static bool
 872 bb_in_loop_p (const_basic_block bb, const void *data)
 873 {
 874   const struct loop *const loop = (const struct loop *)data;
 875   if (flow_bb_inside_loop_p (loop, bb))
 876     return true;
 877   return false;
 878 }
 879
 880
 881 /* Function new_loop_vec_info.
 882
 883    Create and initialize a new loop_vec_info struct for LOOP, as well as
 884    stmt_vec_info structs for all the stmts in LOOP.  */
 885
 886 static loop_vec_info
 887 new_loop_vec_info (struct loop *loop)
 888 {
 889   loop_vec_info res;
 890   basic_block *bbs;
 891   gimple_stmt_iterator si;
 892   unsigned int i, nbbs;
 893
 894   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 895   LOOP_VINFO_LOOP (res) = loop;
 896
 897   bbs = get_loop_body (loop);
 898
 899   /* Create/Update stmt_info for all stmts in the loop.  */
 900   for (i = 0; i < loop->num_nodes; i++)
 901     {
 902       basic_block bb = bbs[i];
 903
 904       /* BBs in a nested inner-loop will have been already processed (because
 905          we will have called vect_analyze_loop_form for any nested inner-loop).
 906          Therefore, for stmts in an inner-loop we just want to update the
 907          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 908          loop_info of the outer-loop we are currently considering to vectorize
 909          (instead of the loop_info of the inner-loop).
 910          For stmts in other BBs we need to create a stmt_info from scratch.  */
 911       if (bb->loop_father != loop)
 912         {
 913           /* Inner-loop bb.  */
 914           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 915           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 916             {
 917               gimple phi = gsi_stmt (si);
 918               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 919               loop_vec_info inner_loop_vinfo =
 920                 STMT_VINFO_LOOP_VINFO (stmt_info);
 921               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 922               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 923             }
 924           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 925            {
 926               gimple stmt = gsi_stmt (si);
 927               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 928               loop_vec_info inner_loop_vinfo =
 929                  STMT_VINFO_LOOP_VINFO (stmt_info);
 930               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 931               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 932            }
 933         }
 934       else
 935         {
 936           /* bb in current nest.  */
 937           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 938             {
 939               gimple phi = gsi_stmt (si);
 940               gimple_set_uid (phi, 0);
 941               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 942             }
 943
 944           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 945             {
 946               gimple stmt = gsi_stmt (si);
 947               gimple_set_uid (stmt, 0);
 948               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 949             }
 950         }
 951     }
 952
 953   /* CHECKME: We want to visit all BBs before their successors (except for
 954      latch blocks, for which this assertion wouldn't hold).  In the simple
 955      case of the loop forms we allow, a dfs order of the BBs would the same
 956      as reversed postorder traversal, so we are safe.  */
 957
 958    free (bbs);
 959    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 960    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 961                               bbs, loop->num_nodes, loop);
 962    gcc_assert (nbbs == loop->num_nodes);
 963
 964   LOOP_VINFO_BBS (res) = bbs;
 965   LOOP_VINFO_NITERSM1 (res) = NULL;
 966   LOOP_VINFO_NITERS (res) = NULL;
 967   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 968   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 969   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 970   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 971   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 972   LOOP_VINFO_VECT_FACTOR (res) = 0;
 973   LOOP_VINFO_LOOP_NEST (res).create (3);
 974   LOOP_VINFO_DATAREFS (res).create (10);
 975   LOOP_VINFO_DDRS (res).create (10 * 10);
 976   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 977   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 978              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 979   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 980              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 981   LOOP_VINFO_GROUPED_STORES (res).create (10);
 982   LOOP_VINFO_REDUCTIONS (res).create (10);
 983   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 984   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 985   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 986   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 987   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 988   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 989   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 990
 991   return res;
 992 }
 993
 994
 995 /* Function destroy_loop_vec_info.
 996
 997    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 998    stmts in the loop.  */
 999
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1002 {
1003   struct loop *loop;
1004   basic_block *bbs;
1005   int nbbs;
1006   gimple_stmt_iterator si;
1007   int j;
1008   vec<slp_instance> slp_instances;
1009   slp_instance instance;
1010   bool swapped;
1011
1012   if (!loop_vinfo)
1013     return;
1014
1015   loop = LOOP_VINFO_LOOP (loop_vinfo);
1016
1017   bbs = LOOP_VINFO_BBS (loop_vinfo);
1018   nbbs = clean_stmts ? loop->num_nodes : 0;
1019   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1020
1021   for (j = 0; j < nbbs; j++)
1022     {
1023       basic_block bb = bbs[j];
1024       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025         free_stmt_vec_info (gsi_stmt (si));
1026
1027       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1028         {
1029           gimple stmt = gsi_stmt (si);
1030
1031           /* We may have broken canonical form by moving a constant
1032              into RHS1 of a commutative op.  Fix such occurrences.  */
1033           if (swapped && is_gimple_assign (stmt))
1034             {
1035               enum tree_code code = gimple_assign_rhs_code (stmt);
1036
1037               if ((code == PLUS_EXPR
1038                    || code == POINTER_PLUS_EXPR
1039                    || code == MULT_EXPR)
1040                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041                 swap_ssa_operands (stmt,
1042                                    gimple_assign_rhs1_ptr (stmt),
1043                                    gimple_assign_rhs2_ptr (stmt));
1044             }
1045
1046           /* Free stmt_vec_info.  */
1047           free_stmt_vec_info (stmt);
1048           gsi_next (&si);
1049         }
1050     }
1051
1052   free (LOOP_VINFO_BBS (loop_vinfo));
1053   vect_destroy_datarefs (loop_vinfo, NULL);
1054   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060     vect_free_slp_instance (instance);
1061
1062   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1066
1067   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1069
1070   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1071
1072   free (loop_vinfo);
1073   loop->aux = NULL;
1074 }
1075
1076
1077 /* Function vect_analyze_loop_1.
1078
1079    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080    for it. The different analyses will record information in the
1081    loop_vec_info struct.  This is a subset of the analyses applied in
1082    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083    that is now considered for (outer-loop) vectorization.  */
1084
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1087 {
1088   loop_vec_info loop_vinfo;
1089
1090   if (dump_enabled_p ())
1091     dump_printf_loc (MSG_NOTE, vect_location,
1092                      "===== analyze_loop_nest_1 =====\n");
1093
1094   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1095
1096   loop_vinfo = vect_analyze_loop_form (loop);
1097   if (!loop_vinfo)
1098     {
1099       if (dump_enabled_p ())
1100         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101                          "bad inner-loop form.\n");
1102       return NULL;
1103     }
1104
1105   return loop_vinfo;
1106 }
1107
1108
1109 /* Function vect_analyze_loop_form.
1110
1111    Verify that certain CFG restrictions hold, including:
1112    - the loop has a pre-header
1113    - the loop has a single entry and exit
1114    - the loop exit condition is simple enough, and the number of iterations
1115      can be analyzed (a countable loop).  */
1116
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1119 {
1120   loop_vec_info loop_vinfo;
1121   gcond *loop_cond;
1122   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123   loop_vec_info inner_loop_vinfo = NULL;
1124
1125   if (dump_enabled_p ())
1126     dump_printf_loc (MSG_NOTE, vect_location,
1127                      "=== vect_analyze_loop_form ===\n");
1128
1129   /* Different restrictions apply when we are considering an inner-most loop,
1130      vs. an outer (nested) loop.
1131      (FORNOW. May want to relax some of these restrictions in the future).  */
1132
1133   if (!loop->inner)
1134     {
1135       /* Inner-most loop.  We currently require that the number of BBs is
1136          exactly 2 (the header and latch).  Vectorizable inner-most loops
1137          look like this:
1138
1139                         (pre-header)
1140                            |
1141                           header <--------+
1142                            | |            |
1143                            | +--> latch --+
1144                            |
1145                         (exit-bb)  */
1146
1147       if (loop->num_nodes != 2)
1148         {
1149           if (dump_enabled_p ())
1150             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151                              "not vectorized: control flow in loop.\n");
1152           return NULL;
1153         }
1154
1155       if (empty_block_p (loop->header))
1156         {
1157           if (dump_enabled_p ())
1158             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159                              "not vectorized: empty loop.\n");
1160           return NULL;
1161         }
1162     }
1163   else
1164     {
1165       struct loop *innerloop = loop->inner;
1166       edge entryedge;
1167
1168       /* Nested loop. We currently require that the loop is doubly-nested,
1169          contains a single inner loop, and the number of BBs is exactly 5.
1170          Vectorizable outer-loops look like this:
1171
1172                         (pre-header)
1173                            |
1174                           header <---+
1175                            |         |
1176                           inner-loop |
1177                            |         |
1178                           tail ------+
1179                            |
1180                         (exit-bb)
1181
1182          The inner-loop has the properties expected of inner-most loops
1183          as described above.  */
1184
1185       if ((loop->inner)->inner || (loop->inner)->next)
1186         {
1187           if (dump_enabled_p ())
1188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189                              "not vectorized: multiple nested loops.\n");
1190           return NULL;
1191         }
1192
1193       /* Analyze the inner-loop.  */
1194       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195       if (!inner_loop_vinfo)
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: Bad inner loop.\n");
1200           return NULL;
1201         }
1202
1203       if (!expr_invariant_in_loop_p (loop,
1204                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1205         {
1206           if (dump_enabled_p ())
1207             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208                              "not vectorized: inner-loop count not"
1209                              " invariant.\n");
1210           destroy_loop_vec_info (inner_loop_vinfo, true);
1211           return NULL;
1212         }
1213
1214       if (loop->num_nodes != 5)
1215         {
1216           if (dump_enabled_p ())
1217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218                              "not vectorized: control flow in loop.\n");
1219           destroy_loop_vec_info (inner_loop_vinfo, true);
1220           return NULL;
1221         }
1222
1223       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224       entryedge = EDGE_PRED (innerloop->header, 0);
1225       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226         entryedge = EDGE_PRED (innerloop->header, 1);
1227
1228       if (entryedge->src != loop->header
1229           || !single_exit (innerloop)
1230           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1231         {
1232           if (dump_enabled_p ())
1233             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234                              "not vectorized: unsupported outerloop form.\n");
1235           destroy_loop_vec_info (inner_loop_vinfo, true);
1236           return NULL;
1237         }
1238
1239       if (dump_enabled_p ())
1240         dump_printf_loc (MSG_NOTE, vect_location,
1241                          "Considering outer-loop vectorization.\n");
1242     }
1243
1244   if (!single_exit (loop)
1245       || EDGE_COUNT (loop->header->preds) != 2)
1246     {
1247       if (dump_enabled_p ())
1248         {
1249           if (!single_exit (loop))
1250             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251                              "not vectorized: multiple exits.\n");
1252           else if (EDGE_COUNT (loop->header->preds) != 2)
1253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254                              "not vectorized: too many incoming edges.\n");
1255         }
1256       if (inner_loop_vinfo)
1257         destroy_loop_vec_info (inner_loop_vinfo, true);
1258       return NULL;
1259     }
1260
1261   /* We assume that the loop exit condition is at the end of the loop. i.e,
1262      that the loop is represented as a do-while (with a proper if-guard
1263      before the loop if needed), where the loop header contains all the
1264      executable statements, and the latch is empty.  */
1265   if (!empty_block_p (loop->latch)
1266       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267     {
1268       if (dump_enabled_p ())
1269         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270                          "not vectorized: latch block not empty.\n");
1271       if (inner_loop_vinfo)
1272         destroy_loop_vec_info (inner_loop_vinfo, true);
1273       return NULL;
1274     }
1275
1276   /* Make sure there exists a single-predecessor exit bb:  */
1277   if (!single_pred_p (single_exit (loop)->dest))
1278     {
1279       edge e = single_exit (loop);
1280       if (!(e->flags & EDGE_ABNORMAL))
1281         {
1282           split_loop_exit_edge (e);
1283           if (dump_enabled_p ())
1284             dump_printf (MSG_NOTE, "split exit edge.\n");
1285         }
1286       else
1287         {
1288           if (dump_enabled_p ())
1289             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290                              "not vectorized: abnormal loop exit edge.\n");
1291           if (inner_loop_vinfo)
1292             destroy_loop_vec_info (inner_loop_vinfo, true);
1293           return NULL;
1294         }
1295     }
1296
1297   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298                                     &number_of_iterationsm1);
1299   if (!loop_cond)
1300     {
1301       if (dump_enabled_p ())
1302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303                          "not vectorized: complicated exit condition.\n");
1304       if (inner_loop_vinfo)
1305         destroy_loop_vec_info (inner_loop_vinfo, true);
1306       return NULL;
1307     }
1308
1309   if (!number_of_iterations
1310       || chrec_contains_undetermined (number_of_iterations))
1311     {
1312       if (dump_enabled_p ())
1313         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314                          "not vectorized: number of iterations cannot be "
1315                          "computed.\n");
1316       if (inner_loop_vinfo)
1317         destroy_loop_vec_info (inner_loop_vinfo, true);
1318       return NULL;
1319     }
1320
1321   if (integer_zerop (number_of_iterations))
1322     {
1323       if (dump_enabled_p ())
1324         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325                          "not vectorized: number of iterations = 0.\n");
1326       if (inner_loop_vinfo)
1327         destroy_loop_vec_info (inner_loop_vinfo, true);
1328       return NULL;
1329     }
1330
1331   loop_vinfo = new_loop_vec_info (loop);
1332   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1335
1336   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1337     {
1338       if (dump_enabled_p ())
1339         {
1340           dump_printf_loc (MSG_NOTE, vect_location,
1341                            "Symbolic number of iterations is ");
1342           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343           dump_printf (MSG_NOTE, "\n");
1344         }
1345     }
1346
1347   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1348
1349   /* CHECKME: May want to keep it around it in the future.  */
1350   if (inner_loop_vinfo)
1351     destroy_loop_vec_info (inner_loop_vinfo, false);
1352
1353   gcc_assert (!loop->aux);
1354   loop->aux = loop_vinfo;
1355   return loop_vinfo;
1356 }
1357
1358
1359 /* Function vect_analyze_loop_operations.
1360
1361    Scan the loop stmts and make sure they are all vectorizable.  */
1362
1363 static bool
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1365 {
1366   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368   int nbbs = loop->num_nodes;
1369   unsigned int vectorization_factor = 0;
1370   int i;
1371   stmt_vec_info stmt_info;
1372   bool need_to_vectorize = false;
1373   int min_profitable_iters;
1374   int min_scalar_loop_bound;
1375   unsigned int th;
1376   bool only_slp_in_loop = true, ok;
1377   HOST_WIDE_INT max_niter;
1378   HOST_WIDE_INT estimated_niter;
1379   int min_profitable_estimate;
1380
1381   if (dump_enabled_p ())
1382     dump_printf_loc (MSG_NOTE, vect_location,
1383                      "=== vect_analyze_loop_operations ===\n");
1384
1385   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387   if (slp)
1388     {
1389       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390          vectorization factor of the loop is the unrolling factor required by
1391          the SLP instances.  If that unrolling factor is 1, we say, that we
1392          perform pure SLP on loop - cross iteration parallelism is not
1393          exploited.  */
1394       for (i = 0; i < nbbs; i++)
1395         {
1396           basic_block bb = bbs[i];
1397           for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398                gsi_next (&si))
1399             {
1400               gimple stmt = gsi_stmt (si);
1401               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402               gcc_assert (stmt_info);
1403               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1404                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1405                   && !PURE_SLP_STMT (stmt_info))
1406                 /* STMT needs both SLP and loop-based vectorization.  */
1407                 only_slp_in_loop = false;
1408             }
1409         }
1410
1411       if (only_slp_in_loop)
1412         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413       else
1414         vectorization_factor = least_common_multiple (vectorization_factor,
1415                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1416
1417       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1418       if (dump_enabled_p ())
1419         dump_printf_loc (MSG_NOTE, vect_location,
1420                          "Updating vectorization factor to %d\n",
1421                          vectorization_factor);
1422     }
1423
1424   for (i = 0; i < nbbs; i++)
1425     {
1426       basic_block bb = bbs[i];
1427
1428       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1429            gsi_next (&si))
1430         {
1431           gphi *phi = si.phi ();
1432           ok = true;
1433
1434           stmt_info = vinfo_for_stmt (phi);
1435           if (dump_enabled_p ())
1436             {
1437               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1438               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1439               dump_printf (MSG_NOTE, "\n");
1440             }
1441
1442           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1443              (i.e., a phi in the tail of the outer-loop).  */
1444           if (! is_loop_header_bb_p (bb))
1445             {
1446               /* FORNOW: we currently don't support the case that these phis
1447                  are not used in the outerloop (unless it is double reduction,
1448                  i.e., this phi is vect_reduction_def), cause this case
1449                  requires to actually do something here.  */
1450               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1451                    || STMT_VINFO_LIVE_P (stmt_info))
1452                   && STMT_VINFO_DEF_TYPE (stmt_info)
1453                      != vect_double_reduction_def)
1454                 {
1455                   if (dump_enabled_p ())
1456                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457                                      "Unsupported loop-closed phi in "
1458                                      "outer-loop.\n");
1459                   return false;
1460                 }
1461
1462               /* If PHI is used in the outer loop, we check that its operand
1463                  is defined in the inner loop.  */
1464               if (STMT_VINFO_RELEVANT_P (stmt_info))
1465                 {
1466                   tree phi_op;
1467                   gimple op_def_stmt;
1468
1469                   if (gimple_phi_num_args (phi) != 1)
1470                     return false;
1471
1472                   phi_op = PHI_ARG_DEF (phi, 0);
1473                   if (TREE_CODE (phi_op) != SSA_NAME)
1474                     return false;
1475
1476                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1477                   if (gimple_nop_p (op_def_stmt)
1478                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1479                       || !vinfo_for_stmt (op_def_stmt))
1480                     return false;
1481
1482                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1483                         != vect_used_in_outer
1484                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1485                            != vect_used_in_outer_by_reduction)
1486                     return false;
1487                 }
1488
1489               continue;
1490             }
1491
1492           gcc_assert (stmt_info);
1493
1494           if (STMT_VINFO_LIVE_P (stmt_info))
1495             {
1496               /* FORNOW: not yet supported.  */
1497               if (dump_enabled_p ())
1498                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499                                  "not vectorized: value used after loop.\n");
1500               return false;
1501             }
1502
1503           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1504               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1505             {
1506               /* A scalar-dependence cycle that we don't support.  */
1507               if (dump_enabled_p ())
1508                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509                                  "not vectorized: scalar dependence cycle.\n");
1510               return false;
1511             }
1512
1513           if (STMT_VINFO_RELEVANT_P (stmt_info))
1514             {
1515               need_to_vectorize = true;
1516               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1517                 ok = vectorizable_induction (phi, NULL, NULL);
1518             }
1519
1520           if (!ok)
1521             {
1522               if (dump_enabled_p ())
1523                 {
1524                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525                                    "not vectorized: relevant phi not "
1526                                    "supported: ");
1527                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1528                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1529                 }
1530               return false;
1531             }
1532         }
1533
1534       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1535            gsi_next (&si))
1536         {
1537           gimple stmt = gsi_stmt (si);
1538           if (!gimple_clobber_p (stmt)
1539               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1540             return false;
1541         }
1542     } /* bbs */
1543
1544   /* All operations in the loop are either irrelevant (deal with loop
1545      control, or dead), or only used outside the loop and can be moved
1546      out of the loop (e.g. invariants, inductions).  The loop can be
1547      optimized away by scalar optimizations.  We're better off not
1548      touching this loop.  */
1549   if (!need_to_vectorize)
1550     {
1551       if (dump_enabled_p ())
1552         dump_printf_loc (MSG_NOTE, vect_location,
1553                          "All the computation can be taken out of the loop.\n");
1554       if (dump_enabled_p ())
1555         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556                          "not vectorized: redundant loop. no profit to "
1557                          "vectorize.\n");
1558       return false;
1559     }
1560
1561   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1562     dump_printf_loc (MSG_NOTE, vect_location,
1563                      "vectorization_factor = %d, niters = "
1564                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1565                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1566
1567   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1568        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1569       || ((max_niter = max_stmt_executions_int (loop)) != -1
1570           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1571     {
1572       if (dump_enabled_p ())
1573         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574                          "not vectorized: iteration count too small.\n");
1575       if (dump_enabled_p ())
1576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577                          "not vectorized: iteration count smaller than "
1578                          "vectorization factor.\n");
1579       return false;
1580     }
1581
1582   /* Analyze cost.  Decide if worth while to vectorize.  */
1583
1584   /* Once VF is set, SLP costs should be updated since the number of created
1585      vector stmts depends on VF.  */
1586   vect_update_slp_costs_according_to_vf (loop_vinfo);
1587
1588   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1589                                       &min_profitable_estimate);
1590   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1591
1592   if (min_profitable_iters < 0)
1593     {
1594       if (dump_enabled_p ())
1595         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596                          "not vectorized: vectorization not profitable.\n");
1597       if (dump_enabled_p ())
1598         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599                          "not vectorized: vector version will never be "
1600                          "profitable.\n");
1601       return false;
1602     }
1603
1604   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1605                             * vectorization_factor) - 1);
1606
1607
1608   /* Use the cost model only if it is more conservative than user specified
1609      threshold.  */
1610
1611   th = (unsigned) min_scalar_loop_bound;
1612   if (min_profitable_iters
1613       && (!min_scalar_loop_bound
1614           || min_profitable_iters > min_scalar_loop_bound))
1615     th = (unsigned) min_profitable_iters;
1616
1617   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1618
1619   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1620       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1621     {
1622       if (dump_enabled_p ())
1623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624                          "not vectorized: vectorization not profitable.\n");
1625       if (dump_enabled_p ())
1626         dump_printf_loc (MSG_NOTE, vect_location,
1627                          "not vectorized: iteration count smaller than user "
1628                          "specified loop bound parameter or minimum profitable "
1629                          "iterations (whichever is more conservative).\n");
1630       return false;
1631     }
1632
1633   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1634       && ((unsigned HOST_WIDE_INT) estimated_niter
1635           <= MAX (th, (unsigned)min_profitable_estimate)))
1636     {
1637       if (dump_enabled_p ())
1638         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639                          "not vectorized: estimated iteration count too "
1640                          "small.\n");
1641       if (dump_enabled_p ())
1642         dump_printf_loc (MSG_NOTE, vect_location,
1643                          "not vectorized: estimated iteration count smaller "
1644                          "than specified loop bound parameter or minimum "
1645                          "profitable iterations (whichever is more "
1646                          "conservative).\n");
1647       return false;
1648     }
1649
1650   return true;
1651 }
1652
1653
1654 /* Function vect_analyze_loop_2.
1655
1656    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1657    for it.  The different analyses will record information in the
1658    loop_vec_info struct.  */
1659 static bool
1660 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1661 {
1662   bool ok, slp = false;
1663   int max_vf = MAX_VECTORIZATION_FACTOR;
1664   int min_vf = 2;
1665   unsigned int th;
1666   unsigned int n_stmts = 0;
1667
1668   /* Find all data references in the loop (which correspond to vdefs/vuses)
1669      and analyze their evolution in the loop.  Also adjust the minimal
1670      vectorization factor according to the loads and stores.
1671
1672      FORNOW: Handle only simple, array references, which
1673      alignment can be forced, and aligned pointer-references.  */
1674
1675   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1676   if (!ok)
1677     {
1678       if (dump_enabled_p ())
1679         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680                          "bad data references.\n");
1681       return false;
1682     }
1683
1684   /* Classify all cross-iteration scalar data-flow cycles.
1685      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1686
1687   vect_analyze_scalar_cycles (loop_vinfo);
1688
1689   vect_pattern_recog (loop_vinfo, NULL);
1690
1691   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1692      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1693
1694   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1695   if (!ok)
1696     {
1697       if (dump_enabled_p ())
1698         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699                          "bad data access.\n");
1700       return false;
1701     }
1702
1703   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1704
1705   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1706   if (!ok)
1707     {
1708       if (dump_enabled_p ())
1709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                          "unexpected pattern.\n");
1711       return false;
1712     }
1713
1714   /* Analyze data dependences between the data-refs in the loop
1715      and adjust the maximum vectorization factor according to
1716      the dependences.
1717      FORNOW: fail at the first data dependence that we encounter.  */
1718
1719   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1720   if (!ok
1721       || max_vf < min_vf)
1722     {
1723       if (dump_enabled_p ())
1724             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725                              "bad data dependence.\n");
1726       return false;
1727     }
1728
1729   ok = vect_determine_vectorization_factor (loop_vinfo);
1730   if (!ok)
1731     {
1732       if (dump_enabled_p ())
1733         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734                          "can't determine vectorization factor.\n");
1735       return false;
1736     }
1737   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1738     {
1739       if (dump_enabled_p ())
1740         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741                          "bad data dependence.\n");
1742       return false;
1743     }
1744
1745   /* Analyze the alignment of the data-refs in the loop.
1746      Fail if a data reference is found that cannot be vectorized.  */
1747
1748   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1749   if (!ok)
1750     {
1751       if (dump_enabled_p ())
1752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753                          "bad data alignment.\n");
1754       return false;
1755     }
1756
1757   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1758      It is important to call pruning after vect_analyze_data_ref_accesses,
1759      since we use grouping information gathered by interleaving analysis.  */
1760   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1761   if (!ok)
1762     {
1763       if (dump_enabled_p ())
1764         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765                          "number of versioning for alias "
1766                          "run-time tests exceeds %d "
1767                          "(--param vect-max-version-for-alias-checks)\n",
1768                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1769       return false;
1770     }
1771
1772   /* This pass will decide on using loop versioning and/or loop peeling in
1773      order to enhance the alignment of data references in the loop.  */
1774
1775   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1776   if (!ok)
1777     {
1778       if (dump_enabled_p ())
1779         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780                          "bad data alignment.\n");
1781       return false;
1782     }
1783
1784   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1785   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1786   if (ok)
1787     {
1788       /* Decide which possible SLP instances to SLP.  */
1789       slp = vect_make_slp_decision (loop_vinfo);
1790
1791       /* Find stmts that need to be both vectorized and SLPed.  */
1792       vect_detect_hybrid_slp (loop_vinfo);
1793     }
1794   else
1795     return false;
1796
1797   /* Scan all the operations in the loop and make sure they are
1798      vectorizable.  */
1799
1800   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1801   if (!ok)
1802     {
1803       if (dump_enabled_p ())
1804         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805                          "bad operation or unsupported loop bound.\n");
1806       return false;
1807     }
1808
1809   /* Decide whether we need to create an epilogue loop to handle
1810      remaining scalar iterations.  */
1811   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1812         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1813        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1814
1815   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1816       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1817     {
1818       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1819                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1820           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1821         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1822     }
1823   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1824            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1825                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1826                /* In case of versioning, check if the maximum number of
1827                   iterations is greater than th.  If they are identical,
1828                   the epilogue is unnecessary.  */
1829                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1830                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1831                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1832                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1833     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1834
1835   /* If an epilogue loop is required make sure we can create one.  */
1836   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1837       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1838     {
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1841       if (!vect_can_advance_ivs_p (loop_vinfo)
1842           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1843                                            single_exit (LOOP_VINFO_LOOP
1844                                                          (loop_vinfo))))
1845         {
1846           if (dump_enabled_p ())
1847             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848                              "not vectorized: can't create required "
1849                              "epilog loop\n");
1850           return false;
1851         }
1852     }
1853
1854   return true;
1855 }
1856
1857 /* Function vect_analyze_loop.
1858
1859    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1860    for it.  The different analyses will record information in the
1861    loop_vec_info struct.  */
1862 loop_vec_info
1863 vect_analyze_loop (struct loop *loop)
1864 {
1865   loop_vec_info loop_vinfo;
1866   unsigned int vector_sizes;
1867
1868   /* Autodetect first vector size we try.  */
1869   current_vector_size = 0;
1870   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1871
1872   if (dump_enabled_p ())
1873     dump_printf_loc (MSG_NOTE, vect_location,
1874                      "===== analyze_loop_nest =====\n");
1875
1876   if (loop_outer (loop)
1877       && loop_vec_info_for_loop (loop_outer (loop))
1878       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1879     {
1880       if (dump_enabled_p ())
1881         dump_printf_loc (MSG_NOTE, vect_location,
1882                          "outer-loop already vectorized.\n");
1883       return NULL;
1884     }
1885
1886   while (1)
1887     {
1888       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1889       loop_vinfo = vect_analyze_loop_form (loop);
1890       if (!loop_vinfo)
1891         {
1892           if (dump_enabled_p ())
1893             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894                              "bad loop form.\n");
1895           return NULL;
1896         }
1897
1898       if (vect_analyze_loop_2 (loop_vinfo))
1899         {
1900           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1901
1902           return loop_vinfo;
1903         }
1904
1905       destroy_loop_vec_info (loop_vinfo, true);
1906
1907       vector_sizes &= ~current_vector_size;
1908       if (vector_sizes == 0
1909           || current_vector_size == 0)
1910         return NULL;
1911
1912       /* Try the next biggest vector size.  */
1913       current_vector_size = 1 << floor_log2 (vector_sizes);
1914       if (dump_enabled_p ())
1915         dump_printf_loc (MSG_NOTE, vect_location,
1916                          "***** Re-trying analysis with "
1917                          "vector size %d\n", current_vector_size);
1918     }
1919 }
1920
1921
1922 /* Function reduction_code_for_scalar_code
1923
1924    Input:
1925    CODE - tree_code of a reduction operations.
1926
1927    Output:
1928    REDUC_CODE - the corresponding tree-code to be used to reduce the
1929       vector of partial results into a single scalar result, or ERROR_MARK
1930       if the operation is a supported reduction operation, but does not have
1931       such a tree-code.
1932
1933    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1934
1935 static bool
1936 reduction_code_for_scalar_code (enum tree_code code,
1937                                 enum tree_code *reduc_code)
1938 {
1939   switch (code)
1940     {
1941       case MAX_EXPR:
1942         *reduc_code = REDUC_MAX_EXPR;
1943         return true;
1944
1945       case MIN_EXPR:
1946         *reduc_code = REDUC_MIN_EXPR;
1947         return true;
1948
1949       case PLUS_EXPR:
1950         *reduc_code = REDUC_PLUS_EXPR;
1951         return true;
1952
1953       case MULT_EXPR:
1954       case MINUS_EXPR:
1955       case BIT_IOR_EXPR:
1956       case BIT_XOR_EXPR:
1957       case BIT_AND_EXPR:
1958         *reduc_code = ERROR_MARK;
1959         return true;
1960
1961       default:
1962        return false;
1963     }
1964 }
1965
1966
1967 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1968    STMT is printed with a message MSG. */
1969
1970 static void
1971 report_vect_op (int msg_type, gimple stmt, const char *msg)
1972 {
1973   dump_printf_loc (msg_type, vect_location, "%s", msg);
1974   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1975   dump_printf (msg_type, "\n");
1976 }
1977
1978
1979 /* Detect SLP reduction of the form:
1980
1981    #a1 = phi <a5, a0>
1982    a2 = operation (a1)
1983    a3 = operation (a2)
1984    a4 = operation (a3)
1985    a5 = operation (a4)
1986
1987    #a = phi <a5>
1988
1989    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1990    FIRST_STMT is the first reduction stmt in the chain
1991    (a2 = operation (a1)).
1992
1993    Return TRUE if a reduction chain was detected.  */
1994
1995 static bool
1996 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1997 {
1998   struct loop *loop = (gimple_bb (phi))->loop_father;
1999   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2000   enum tree_code code;
2001   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2002   stmt_vec_info use_stmt_info, current_stmt_info;
2003   tree lhs;
2004   imm_use_iterator imm_iter;
2005   use_operand_p use_p;
2006   int nloop_uses, size = 0, n_out_of_loop_uses;
2007   bool found = false;
2008
2009   if (loop != vect_loop)
2010     return false;
2011
2012   lhs = PHI_RESULT (phi);
2013   code = gimple_assign_rhs_code (first_stmt);
2014   while (1)
2015     {
2016       nloop_uses = 0;
2017       n_out_of_loop_uses = 0;
2018       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2019         {
2020           gimple use_stmt = USE_STMT (use_p);
2021           if (is_gimple_debug (use_stmt))
2022             continue;
2023
2024           /* Check if we got back to the reduction phi.  */
2025           if (use_stmt == phi)
2026             {
2027               loop_use_stmt = use_stmt;
2028               found = true;
2029               break;
2030             }
2031
2032           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2033             {
2034               if (vinfo_for_stmt (use_stmt)
2035                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2036                 {
2037                   loop_use_stmt = use_stmt;
2038                   nloop_uses++;
2039                 }
2040             }
2041            else
2042              n_out_of_loop_uses++;
2043
2044            /* There are can be either a single use in the loop or two uses in
2045               phi nodes.  */
2046            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2047              return false;
2048         }
2049
2050       if (found)
2051         break;
2052
2053       /* We reached a statement with no loop uses.  */
2054       if (nloop_uses == 0)
2055         return false;
2056
2057       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2058       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2059         return false;
2060
2061       if (!is_gimple_assign (loop_use_stmt)
2062           || code != gimple_assign_rhs_code (loop_use_stmt)
2063           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2064         return false;
2065
2066       /* Insert USE_STMT into reduction chain.  */
2067       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2068       if (current_stmt)
2069         {
2070           current_stmt_info = vinfo_for_stmt (current_stmt);
2071           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2072           GROUP_FIRST_ELEMENT (use_stmt_info)
2073             = GROUP_FIRST_ELEMENT (current_stmt_info);
2074         }
2075       else
2076         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2077
2078       lhs = gimple_assign_lhs (loop_use_stmt);
2079       current_stmt = loop_use_stmt;
2080       size++;
2081    }
2082
2083   if (!found || loop_use_stmt != phi || size < 2)
2084     return false;
2085
2086   /* Swap the operands, if needed, to make the reduction operand be the second
2087      operand.  */
2088   lhs = PHI_RESULT (phi);
2089   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2090   while (next_stmt)
2091     {
2092       if (gimple_assign_rhs2 (next_stmt) == lhs)
2093         {
2094           tree op = gimple_assign_rhs1 (next_stmt);
2095           gimple def_stmt = NULL;
2096
2097           if (TREE_CODE (op) == SSA_NAME)
2098             def_stmt = SSA_NAME_DEF_STMT (op);
2099
2100           /* Check that the other def is either defined in the loop
2101              ("vect_internal_def"), or it's an induction (defined by a
2102              loop-header phi-node).  */
2103           if (def_stmt
2104               && gimple_bb (def_stmt)
2105               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2106               && (is_gimple_assign (def_stmt)
2107                   || is_gimple_call (def_stmt)
2108                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109                            == vect_induction_def
2110                   || (gimple_code (def_stmt) == GIMPLE_PHI
2111                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2112                                   == vect_internal_def
2113                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2114             {
2115               lhs = gimple_assign_lhs (next_stmt);
2116               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2117               continue;
2118             }
2119
2120           return false;
2121         }
2122       else
2123         {
2124           tree op = gimple_assign_rhs2 (next_stmt);
2125           gimple def_stmt = NULL;
2126
2127           if (TREE_CODE (op) == SSA_NAME)
2128             def_stmt = SSA_NAME_DEF_STMT (op);
2129
2130           /* Check that the other def is either defined in the loop
2131             ("vect_internal_def"), or it's an induction (defined by a
2132             loop-header phi-node).  */
2133           if (def_stmt
2134               && gimple_bb (def_stmt)
2135               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2136               && (is_gimple_assign (def_stmt)
2137                   || is_gimple_call (def_stmt)
2138                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2139                               == vect_induction_def
2140                   || (gimple_code (def_stmt) == GIMPLE_PHI
2141                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142                                   == vect_internal_def
2143                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2144             {
2145               if (dump_enabled_p ())
2146                 {
2147                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2148                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2149                   dump_printf (MSG_NOTE, "\n");
2150                 }
2151
2152               swap_ssa_operands (next_stmt,
2153                                  gimple_assign_rhs1_ptr (next_stmt),
2154                                  gimple_assign_rhs2_ptr (next_stmt));
2155               update_stmt (next_stmt);
2156
2157               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2158                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2159             }
2160           else
2161             return false;
2162         }
2163
2164       lhs = gimple_assign_lhs (next_stmt);
2165       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2166     }
2167
2168   /* Save the chain for further analysis in SLP detection.  */
2169   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2170   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2171   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2172
2173   return true;
2174 }
2175
2176
2177 /* Function vect_is_simple_reduction_1
2178
2179    (1) Detect a cross-iteration def-use cycle that represents a simple
2180    reduction computation.  We look for the following pattern:
2181
2182    loop_header:
2183      a1 = phi < a0, a2 >
2184      a3 = ...
2185      a2 = operation (a3, a1)
2186
2187    or
2188
2189    a3 = ...
2190    loop_header:
2191      a1 = phi < a0, a2 >
2192      a2 = operation (a3, a1)
2193
2194    such that:
2195    1. operation is commutative and associative and it is safe to
2196       change the order of the computation (if CHECK_REDUCTION is true)
2197    2. no uses for a2 in the loop (a2 is used out of the loop)
2198    3. no uses of a1 in the loop besides the reduction operation
2199    4. no uses of a1 outside the loop.
2200
2201    Conditions 1,4 are tested here.
2202    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2203
2204    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2205    nested cycles, if CHECK_REDUCTION is false.
2206
2207    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2208    reductions:
2209
2210      a1 = phi < a0, a2 >
2211      inner loop (def of a3)
2212      a2 = phi < a3 >
2213
2214    If MODIFY is true it tries also to rework the code in-place to enable
2215    detection of more reduction patterns.  For the time being we rewrite
2216    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2217 */
2218
2219 static gimple
2220 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2221                             bool check_reduction, bool *double_reduc,
2222                             bool modify)
2223 {
2224   struct loop *loop = (gimple_bb (phi))->loop_father;
2225   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2226   edge latch_e = loop_latch_edge (loop);
2227   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2228   gimple def_stmt, def1 = NULL, def2 = NULL;
2229   enum tree_code orig_code, code;
2230   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2231   tree type;
2232   int nloop_uses;
2233   tree name;
2234   imm_use_iterator imm_iter;
2235   use_operand_p use_p;
2236   bool phi_def;
2237
2238   *double_reduc = false;
2239
2240   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2241      otherwise, we assume outer loop vectorization.  */
2242   gcc_assert ((check_reduction && loop == vect_loop)
2243               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2244
2245   name = PHI_RESULT (phi);
2246   /* ???  If there are no uses of the PHI result the inner loop reduction
2247      won't be detected as possibly double-reduction by vectorizable_reduction
2248      because that tries to walk the PHI arg from the preheader edge which
2249      can be constant.  See PR60382.  */
2250   if (has_zero_uses (name))
2251     return NULL;
2252   nloop_uses = 0;
2253   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2254     {
2255       gimple use_stmt = USE_STMT (use_p);
2256       if (is_gimple_debug (use_stmt))
2257         continue;
2258
2259       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2260         {
2261           if (dump_enabled_p ())
2262             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263                              "intermediate value used outside loop.\n");
2264
2265           return NULL;
2266         }
2267
2268       if (vinfo_for_stmt (use_stmt)
2269           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2270         nloop_uses++;
2271       if (nloop_uses > 1)
2272         {
2273           if (dump_enabled_p ())
2274             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275                              "reduction used in loop.\n");
2276           return NULL;
2277         }
2278     }
2279
2280   if (TREE_CODE (loop_arg) != SSA_NAME)
2281     {
2282       if (dump_enabled_p ())
2283         {
2284           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285                            "reduction: not ssa_name: ");
2286           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2287           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2288         }
2289       return NULL;
2290     }
2291
2292   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2293   if (!def_stmt)
2294     {
2295       if (dump_enabled_p ())
2296         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297                          "reduction: no def_stmt.\n");
2298       return NULL;
2299     }
2300
2301   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2302     {
2303       if (dump_enabled_p ())
2304         {
2305           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2306           dump_printf (MSG_NOTE, "\n");
2307         }
2308       return NULL;
2309     }
2310
2311   if (is_gimple_assign (def_stmt))
2312     {
2313       name = gimple_assign_lhs (def_stmt);
2314       phi_def = false;
2315     }
2316   else
2317     {
2318       name = PHI_RESULT (def_stmt);
2319       phi_def = true;
2320     }
2321
2322   nloop_uses = 0;
2323   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2324     {
2325       gimple use_stmt = USE_STMT (use_p);
2326       if (is_gimple_debug (use_stmt))
2327         continue;
2328       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2329           && vinfo_for_stmt (use_stmt)
2330           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2331         nloop_uses++;
2332       if (nloop_uses > 1)
2333         {
2334           if (dump_enabled_p ())
2335             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336                              "reduction used in loop.\n");
2337           return NULL;
2338         }
2339     }
2340
2341   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2342      defined in the inner loop.  */
2343   if (phi_def)
2344     {
2345       op1 = PHI_ARG_DEF (def_stmt, 0);
2346
2347       if (gimple_phi_num_args (def_stmt) != 1
2348           || TREE_CODE (op1) != SSA_NAME)
2349         {
2350           if (dump_enabled_p ())
2351             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352                              "unsupported phi node definition.\n");
2353
2354           return NULL;
2355         }
2356
2357       def1 = SSA_NAME_DEF_STMT (op1);
2358       if (gimple_bb (def1)
2359           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2360           && loop->inner
2361           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2362           && is_gimple_assign (def1))
2363         {
2364           if (dump_enabled_p ())
2365             report_vect_op (MSG_NOTE, def_stmt,
2366                             "detected double reduction: ");
2367
2368           *double_reduc = true;
2369           return def_stmt;
2370         }
2371
2372       return NULL;
2373     }
2374
2375   code = orig_code = gimple_assign_rhs_code (def_stmt);
2376
2377   /* We can handle "res -= x[i]", which is non-associative by
2378      simply rewriting this into "res += -x[i]".  Avoid changing
2379      gimple instruction for the first simple tests and only do this
2380      if we're allowed to change code at all.  */
2381   if (code == MINUS_EXPR
2382       && modify
2383       && (op1 = gimple_assign_rhs1 (def_stmt))
2384       && TREE_CODE (op1) == SSA_NAME
2385       && SSA_NAME_DEF_STMT (op1) == phi)
2386     code = PLUS_EXPR;
2387
2388   if (check_reduction
2389       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2390     {
2391       if (dump_enabled_p ())
2392         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2393                         "reduction: not commutative/associative: ");
2394       return NULL;
2395     }
2396
2397   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2398     {
2399       if (code != COND_EXPR)
2400         {
2401           if (dump_enabled_p ())
2402             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403                             "reduction: not binary operation: ");
2404
2405           return NULL;
2406         }
2407
2408       op3 = gimple_assign_rhs1 (def_stmt);
2409       if (COMPARISON_CLASS_P (op3))
2410         {
2411           op4 = TREE_OPERAND (op3, 1);
2412           op3 = TREE_OPERAND (op3, 0);
2413         }
2414
2415       op1 = gimple_assign_rhs2 (def_stmt);
2416       op2 = gimple_assign_rhs3 (def_stmt);
2417
2418       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2419         {
2420           if (dump_enabled_p ())
2421             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422                             "reduction: uses not ssa_names: ");
2423
2424           return NULL;
2425         }
2426     }
2427   else
2428     {
2429       op1 = gimple_assign_rhs1 (def_stmt);
2430       op2 = gimple_assign_rhs2 (def_stmt);
2431
2432       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2433         {
2434           if (dump_enabled_p ())
2435             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2436                             "reduction: uses not ssa_names: ");
2437
2438           return NULL;
2439         }
2440    }
2441
2442   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2443   if ((TREE_CODE (op1) == SSA_NAME
2444        && !types_compatible_p (type,TREE_TYPE (op1)))
2445       || (TREE_CODE (op2) == SSA_NAME
2446           && !types_compatible_p (type, TREE_TYPE (op2)))
2447       || (op3 && TREE_CODE (op3) == SSA_NAME
2448           && !types_compatible_p (type, TREE_TYPE (op3)))
2449       || (op4 && TREE_CODE (op4) == SSA_NAME
2450           && !types_compatible_p (type, TREE_TYPE (op4))))
2451     {
2452       if (dump_enabled_p ())
2453         {
2454           dump_printf_loc (MSG_NOTE, vect_location,
2455                            "reduction: multiple types: operation type: ");
2456           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2457           dump_printf (MSG_NOTE, ", operands types: ");
2458           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459                              TREE_TYPE (op1));
2460           dump_printf (MSG_NOTE, ",");
2461           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2462                              TREE_TYPE (op2));
2463           if (op3)
2464             {
2465               dump_printf (MSG_NOTE, ",");
2466               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2467                                  TREE_TYPE (op3));
2468             }
2469
2470           if (op4)
2471             {
2472               dump_printf (MSG_NOTE, ",");
2473               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2474                                  TREE_TYPE (op4));
2475             }
2476           dump_printf (MSG_NOTE, "\n");
2477         }
2478
2479       return NULL;
2480     }
2481
2482   /* Check that it's ok to change the order of the computation.
2483      Generally, when vectorizing a reduction we change the order of the
2484      computation.  This may change the behavior of the program in some
2485      cases, so we need to check that this is ok.  One exception is when
2486      vectorizing an outer-loop: the inner-loop is executed sequentially,
2487      and therefore vectorizing reductions in the inner-loop during
2488      outer-loop vectorization is safe.  */
2489
2490   /* CHECKME: check for !flag_finite_math_only too?  */
2491   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2492       && check_reduction)
2493     {
2494       /* Changing the order of operations changes the semantics.  */
2495       if (dump_enabled_p ())
2496         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2497                         "reduction: unsafe fp math optimization: ");
2498       return NULL;
2499     }
2500   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2501            && check_reduction)
2502     {
2503       /* Changing the order of operations changes the semantics.  */
2504       if (dump_enabled_p ())
2505         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2506                         "reduction: unsafe int math optimization: ");
2507       return NULL;
2508     }
2509   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2510     {
2511       /* Changing the order of operations changes the semantics.  */
2512       if (dump_enabled_p ())
2513         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2514                         "reduction: unsafe fixed-point math optimization: ");
2515       return NULL;
2516     }
2517
2518   /* If we detected "res -= x[i]" earlier, rewrite it into
2519      "res += -x[i]" now.  If this turns out to be useless reassoc
2520      will clean it up again.  */
2521   if (orig_code == MINUS_EXPR)
2522     {
2523       tree rhs = gimple_assign_rhs2 (def_stmt);
2524       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2525       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2526       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2527       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2528                                                           loop_info, NULL));
2529       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2530       gimple_assign_set_rhs2 (def_stmt, negrhs);
2531       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2532       update_stmt (def_stmt);
2533     }
2534
2535   /* Reduction is safe. We're dealing with one of the following:
2536      1) integer arithmetic and no trapv
2537      2) floating point arithmetic, and special flags permit this optimization
2538      3) nested cycle (i.e., outer loop vectorization).  */
2539   if (TREE_CODE (op1) == SSA_NAME)
2540     def1 = SSA_NAME_DEF_STMT (op1);
2541
2542   if (TREE_CODE (op2) == SSA_NAME)
2543     def2 = SSA_NAME_DEF_STMT (op2);
2544
2545   if (code != COND_EXPR
2546       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2547     {
2548       if (dump_enabled_p ())
2549         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2550       return NULL;
2551     }
2552
2553   /* Check that one def is the reduction def, defined by PHI,
2554      the other def is either defined in the loop ("vect_internal_def"),
2555      or it's an induction (defined by a loop-header phi-node).  */
2556
2557   if (def2 && def2 == phi
2558       && (code == COND_EXPR
2559           || !def1 || gimple_nop_p (def1)
2560           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2561           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2562               && (is_gimple_assign (def1)
2563                   || is_gimple_call (def1)
2564                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565                       == vect_induction_def
2566                   || (gimple_code (def1) == GIMPLE_PHI
2567                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2568                           == vect_internal_def
2569                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2570     {
2571       if (dump_enabled_p ())
2572         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2573       return def_stmt;
2574     }
2575
2576   if (def1 && def1 == phi
2577       && (code == COND_EXPR
2578           || !def2 || gimple_nop_p (def2)
2579           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2580           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2581               && (is_gimple_assign (def2)
2582                   || is_gimple_call (def2)
2583                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584                       == vect_induction_def
2585                   || (gimple_code (def2) == GIMPLE_PHI
2586                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2587                           == vect_internal_def
2588                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2589     {
2590       if (check_reduction)
2591         {
2592           /* Swap operands (just for simplicity - so that the rest of the code
2593              can assume that the reduction variable is always the last (second)
2594              argument).  */
2595           if (dump_enabled_p ())
2596             report_vect_op (MSG_NOTE, def_stmt,
2597                             "detected reduction: need to swap operands: ");
2598
2599           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2600                              gimple_assign_rhs2_ptr (def_stmt));
2601
2602           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2603             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2604         }
2605       else
2606         {
2607           if (dump_enabled_p ())
2608             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2609         }
2610
2611       return def_stmt;
2612     }
2613
2614   /* Try to find SLP reduction chain.  */
2615   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2616     {
2617       if (dump_enabled_p ())
2618         report_vect_op (MSG_NOTE, def_stmt,
2619                         "reduction: detected reduction chain: ");
2620
2621       return def_stmt;
2622     }
2623
2624   if (dump_enabled_p ())
2625     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2626                     "reduction: unknown pattern: ");
2627
2628   return NULL;
2629 }
2630
2631 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2632    in-place.  Arguments as there.  */
2633
2634 static gimple
2635 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2636                           bool check_reduction, bool *double_reduc)
2637 {
2638   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2639                                      double_reduc, false);
2640 }
2641
2642 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2643    in-place if it enables detection of more reductions.  Arguments
2644    as there.  */
2645
2646 gimple
2647 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2648                           bool check_reduction, bool *double_reduc)
2649 {
2650   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2651                                      double_reduc, true);
2652 }
2653
2654 /* Calculate the cost of one scalar iteration of the loop.  */
2655 int
2656 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2657                                        stmt_vector_for_cost *scalar_cost_vec)
2658 {
2659   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2660   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2661   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2662   int innerloop_iters, i;
2663
2664   /* Count statements in scalar loop.  Using this as scalar cost for a single
2665      iteration for now.
2666
2667      TODO: Add outer loop support.
2668
2669      TODO: Consider assigning different costs to different scalar
2670      statements.  */
2671
2672   /* FORNOW.  */
2673   innerloop_iters = 1;
2674   if (loop->inner)
2675     innerloop_iters = 50; /* FIXME */
2676
2677   for (i = 0; i < nbbs; i++)
2678     {
2679       gimple_stmt_iterator si;
2680       basic_block bb = bbs[i];
2681
2682       if (bb->loop_father == loop->inner)
2683         factor = innerloop_iters;
2684       else
2685         factor = 1;
2686
2687       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2688         {
2689           gimple stmt = gsi_stmt (si);
2690           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2691
2692           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2693             continue;
2694
2695           /* Skip stmts that are not vectorized inside the loop.  */
2696           if (stmt_info
2697               && !STMT_VINFO_RELEVANT_P (stmt_info)
2698               && (!STMT_VINFO_LIVE_P (stmt_info)
2699                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2700               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2701             continue;
2702
2703           vect_cost_for_stmt kind;
2704           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2705             {
2706               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2707                kind = scalar_load;
2708              else
2709                kind = scalar_store;
2710             }
2711           else
2712             kind = scalar_stmt;
2713
2714           scalar_single_iter_cost
2715             += record_stmt_cost (scalar_cost_vec, factor, kind,
2716                                  NULL, 0, vect_prologue);
2717         }
2718     }
2719   return scalar_single_iter_cost;
2720 }
2721
2722 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2723 int
2724 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2725                              int *peel_iters_epilogue,
2726                              stmt_vector_for_cost *scalar_cost_vec,
2727                              stmt_vector_for_cost *prologue_cost_vec,
2728                              stmt_vector_for_cost *epilogue_cost_vec)
2729 {
2730   int retval = 0;
2731   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2732
2733   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2734     {
2735       *peel_iters_epilogue = vf/2;
2736       if (dump_enabled_p ())
2737         dump_printf_loc (MSG_NOTE, vect_location,
2738                          "cost model: epilogue peel iters set to vf/2 "
2739                          "because loop iterations are unknown .\n");
2740
2741       /* If peeled iterations are known but number of scalar loop
2742          iterations are unknown, count a taken branch per peeled loop.  */
2743       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2744                                  NULL, 0, vect_prologue);
2745       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2746                                  NULL, 0, vect_epilogue);
2747     }
2748   else
2749     {
2750       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2751       peel_iters_prologue = niters < peel_iters_prologue ?
2752                             niters : peel_iters_prologue;
2753       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2754       /* If we need to peel for gaps, but no peeling is required, we have to
2755          peel VF iterations.  */
2756       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2757         *peel_iters_epilogue = vf;
2758     }
2759
2760   stmt_info_for_cost *si;
2761   int j;
2762   if (peel_iters_prologue)
2763     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2764       retval += record_stmt_cost (prologue_cost_vec,
2765                                   si->count * peel_iters_prologue,
2766                                   si->kind, NULL, si->misalign,
2767                                   vect_prologue);
2768   if (*peel_iters_epilogue)
2769     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2770       retval += record_stmt_cost (epilogue_cost_vec,
2771                                   si->count * *peel_iters_epilogue,
2772                                   si->kind, NULL, si->misalign,
2773                                   vect_epilogue);
2774
2775   return retval;
2776 }
2777
2778 /* Function vect_estimate_min_profitable_iters
2779
2780    Return the number of iterations required for the vector version of the
2781    loop to be profitable relative to the cost of the scalar version of the
2782    loop.  */
2783
2784 static void
2785 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2786                                     int *ret_min_profitable_niters,
2787                                     int *ret_min_profitable_estimate)
2788 {
2789   int min_profitable_iters;
2790   int min_profitable_estimate;
2791   int peel_iters_prologue;
2792   int peel_iters_epilogue;
2793   unsigned vec_inside_cost = 0;
2794   int vec_outside_cost = 0;
2795   unsigned vec_prologue_cost = 0;
2796   unsigned vec_epilogue_cost = 0;
2797   int scalar_single_iter_cost = 0;
2798   int scalar_outside_cost = 0;
2799   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2801   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2802
2803   /* Cost model disabled.  */
2804   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2805     {
2806       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2807       *ret_min_profitable_niters = 0;
2808       *ret_min_profitable_estimate = 0;
2809       return;
2810     }
2811
2812   /* Requires loop versioning tests to handle misalignment.  */
2813   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2814     {
2815       /*  FIXME: Make cost depend on complexity of individual check.  */
2816       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2817       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2818                             vect_prologue);
2819       dump_printf (MSG_NOTE,
2820                    "cost model: Adding cost of checks for loop "
2821                    "versioning to treat misalignment.\n");
2822     }
2823
2824   /* Requires loop versioning with alias checks.  */
2825   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2826     {
2827       /*  FIXME: Make cost depend on complexity of individual check.  */
2828       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2829       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2830                             vect_prologue);
2831       dump_printf (MSG_NOTE,
2832                    "cost model: Adding cost of checks for loop "
2833                    "versioning aliasing.\n");
2834     }
2835
2836   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2837       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2838     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2839                           vect_prologue);
2840
2841   /* Count statements in scalar loop.  Using this as scalar cost for a single
2842      iteration for now.
2843
2844      TODO: Add outer loop support.
2845
2846      TODO: Consider assigning different costs to different scalar
2847      statements.  */
2848
2849   auto_vec<stmt_info_for_cost> scalar_cost_vec;
2850   scalar_single_iter_cost
2851      = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2852
2853   /* Add additional cost for the peeled instructions in prologue and epilogue
2854      loop.
2855
2856      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2857      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2858
2859      TODO: Build an expression that represents peel_iters for prologue and
2860      epilogue to be used in a run-time test.  */
2861
2862   if (npeel  < 0)
2863     {
2864       peel_iters_prologue = vf/2;
2865       dump_printf (MSG_NOTE, "cost model: "
2866                    "prologue peel iters set to vf/2.\n");
2867
2868       /* If peeling for alignment is unknown, loop bound of main loop becomes
2869          unknown.  */
2870       peel_iters_epilogue = vf/2;
2871       dump_printf (MSG_NOTE, "cost model: "
2872                    "epilogue peel iters set to vf/2 because "
2873                    "peeling for alignment is unknown.\n");
2874
2875       /* If peeled iterations are unknown, count a taken branch and a not taken
2876          branch per peeled loop. Even if scalar loop iterations are known,
2877          vector iterations are not known since peeled prologue iterations are
2878          not known. Hence guards remain the same.  */
2879       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2880                             NULL, 0, vect_prologue);
2881       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2882                             NULL, 0, vect_prologue);
2883       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2884                             NULL, 0, vect_epilogue);
2885       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2886                             NULL, 0, vect_epilogue);
2887       stmt_info_for_cost *si;
2888       int j;
2889       FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2890         {
2891           struct _stmt_vec_info *stmt_info
2892             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2893           (void) add_stmt_cost (target_cost_data,
2894                                 si->count * peel_iters_prologue,
2895                                 si->kind, stmt_info, si->misalign,
2896                                 vect_prologue);
2897           (void) add_stmt_cost (target_cost_data,
2898                                 si->count * peel_iters_epilogue,
2899                                 si->kind, stmt_info, si->misalign,
2900                                 vect_epilogue);
2901         }
2902     }
2903   else
2904     {
2905       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2906       stmt_info_for_cost *si;
2907       int j;
2908       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2909
2910       prologue_cost_vec.create (2);
2911       epilogue_cost_vec.create (2);
2912       peel_iters_prologue = npeel;
2913
2914       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2915                                           &peel_iters_epilogue,
2916                                           &scalar_cost_vec,
2917                                           &prologue_cost_vec,
2918                                           &epilogue_cost_vec);
2919
2920       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2921         {
2922           struct _stmt_vec_info *stmt_info
2923             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2924           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2925                                 si->misalign, vect_prologue);
2926         }
2927
2928       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2929         {
2930           struct _stmt_vec_info *stmt_info
2931             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2932           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2933                                 si->misalign, vect_epilogue);
2934         }
2935
2936       prologue_cost_vec.release ();
2937       epilogue_cost_vec.release ();
2938     }
2939
2940   /* FORNOW: The scalar outside cost is incremented in one of the
2941      following ways:
2942
2943      1. The vectorizer checks for alignment and aliasing and generates
2944      a condition that allows dynamic vectorization.  A cost model
2945      check is ANDED with the versioning condition.  Hence scalar code
2946      path now has the added cost of the versioning check.
2947
2948        if (cost > th & versioning_check)
2949          jmp to vector code
2950
2951      Hence run-time scalar is incremented by not-taken branch cost.
2952
2953      2. The vectorizer then checks if a prologue is required.  If the
2954      cost model check was not done before during versioning, it has to
2955      be done before the prologue check.
2956
2957        if (cost <= th)
2958          prologue = scalar_iters
2959        if (prologue == 0)
2960          jmp to vector code
2961        else
2962          execute prologue
2963        if (prologue == num_iters)
2964          go to exit
2965
2966      Hence the run-time scalar cost is incremented by a taken branch,
2967      plus a not-taken branch, plus a taken branch cost.
2968
2969      3. The vectorizer then checks if an epilogue is required.  If the
2970      cost model check was not done before during prologue check, it
2971      has to be done with the epilogue check.
2972
2973        if (prologue == 0)
2974          jmp to vector code
2975        else
2976          execute prologue
2977        if (prologue == num_iters)
2978          go to exit
2979        vector code:
2980          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2981            jmp to epilogue
2982
2983      Hence the run-time scalar cost should be incremented by 2 taken
2984      branches.
2985
2986      TODO: The back end may reorder the BBS's differently and reverse
2987      conditions/branch directions.  Change the estimates below to
2988      something more reasonable.  */
2989
2990   /* If the number of iterations is known and we do not do versioning, we can
2991      decide whether to vectorize at compile time.  Hence the scalar version
2992      do not carry cost model guard costs.  */
2993   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2994       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2995       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2996     {
2997       /* Cost model check occurs at versioning.  */
2998       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2999           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3000         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3001       else
3002         {
3003           /* Cost model check occurs at prologue generation.  */
3004           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3005             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3006               + vect_get_stmt_cost (cond_branch_not_taken);
3007           /* Cost model check occurs at epilogue generation.  */
3008           else
3009             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3010         }
3011     }
3012
3013   /* Complete the target-specific cost calculations.  */
3014   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3015                &vec_inside_cost, &vec_epilogue_cost);
3016
3017   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3018
3019   if (dump_enabled_p ())
3020     {
3021       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3022       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3023                    vec_inside_cost);
3024       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3025                    vec_prologue_cost);
3026       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3027                    vec_epilogue_cost);
3028       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3029                    scalar_single_iter_cost);
3030       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3031                    scalar_outside_cost);
3032       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3033                    vec_outside_cost);
3034       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3035                    peel_iters_prologue);
3036       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3037                    peel_iters_epilogue);
3038     }
3039
3040   /* Calculate number of iterations required to make the vector version
3041      profitable, relative to the loop bodies only.  The following condition
3042      must hold true:
3043      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3044      where
3045      SIC = scalar iteration cost, VIC = vector iteration cost,
3046      VOC = vector outside cost, VF = vectorization factor,
3047      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3048      SOC = scalar outside cost for run time cost model check.  */
3049
3050   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3051     {
3052       if (vec_outside_cost <= 0)
3053         min_profitable_iters = 1;
3054       else
3055         {
3056           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3057                                   - vec_inside_cost * peel_iters_prologue
3058                                   - vec_inside_cost * peel_iters_epilogue)
3059                                  / ((scalar_single_iter_cost * vf)
3060                                     - vec_inside_cost);
3061
3062           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3063               <= (((int) vec_inside_cost * min_profitable_iters)
3064                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3065             min_profitable_iters++;
3066         }
3067     }
3068   /* vector version will never be profitable.  */
3069   else
3070     {
3071       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3072         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3073                     "did not happen for a simd loop");
3074
3075       if (dump_enabled_p ())
3076         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3077                          "cost model: the vector iteration cost = %d "
3078                          "divided by the scalar iteration cost = %d "
3079                          "is greater or equal to the vectorization factor = %d"
3080                          ".\n",
3081                          vec_inside_cost, scalar_single_iter_cost, vf);
3082       *ret_min_profitable_niters = -1;
3083       *ret_min_profitable_estimate = -1;
3084       return;
3085     }
3086
3087   dump_printf (MSG_NOTE,
3088                "  Calculated minimum iters for profitability: %d\n",
3089                min_profitable_iters);
3090
3091   min_profitable_iters =
3092         min_profitable_iters < vf ? vf : min_profitable_iters;
3093
3094   /* Because the condition we create is:
3095      if (niters <= min_profitable_iters)
3096        then skip the vectorized loop.  */
3097   min_profitable_iters--;
3098
3099   if (dump_enabled_p ())
3100     dump_printf_loc (MSG_NOTE, vect_location,
3101                      "  Runtime profitability threshold = %d\n",
3102                      min_profitable_iters);
3103
3104   *ret_min_profitable_niters = min_profitable_iters;
3105
3106   /* Calculate number of iterations required to make the vector version
3107      profitable, relative to the loop bodies only.
3108
3109      Non-vectorized variant is SIC * niters and it must win over vector
3110      variant on the expected loop trip count.  The following condition must hold true:
3111      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3112
3113   if (vec_outside_cost <= 0)
3114     min_profitable_estimate = 1;
3115   else
3116     {
3117       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3118                                  - vec_inside_cost * peel_iters_prologue
3119                                  - vec_inside_cost * peel_iters_epilogue)
3120                                  / ((scalar_single_iter_cost * vf)
3121                                    - vec_inside_cost);
3122     }
3123   min_profitable_estimate --;
3124   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3125   if (dump_enabled_p ())
3126     dump_printf_loc (MSG_NOTE, vect_location,
3127                      "  Static estimate profitability threshold = %d\n",
3128                       min_profitable_iters);
3129
3130   *ret_min_profitable_estimate = min_profitable_estimate;
3131 }
3132
3133 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3134    vector elements (not bits) for a vector of mode MODE.  */
3135 static void
3136 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3137                               unsigned char *sel)
3138 {
3139   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3140
3141   for (i = 0; i < nelt; i++)
3142     sel[i] = (i + offset) & (2*nelt - 1);
3143 }
3144
3145 /* Checks whether the target supports whole-vector shifts for vectors of mode
3146    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3147    it supports vec_perm_const with masks for all necessary shift amounts.  */
3148 static bool
3149 have_whole_vector_shift (enum machine_mode mode)
3150 {
3151   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3152     return true;
3153
3154   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3155     return false;
3156
3157   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3158   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3159
3160   for (i = nelt/2; i >= 1; i/=2)
3161     {
3162       calc_vec_perm_mask_for_shift (mode, i, sel);
3163       if (!can_vec_perm_p (mode, false, sel))
3164         return false;
3165     }
3166   return true;
3167 }
3168
3169 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3170    functions. Design better to avoid maintenance issues.  */
3171
3172 /* Function vect_model_reduction_cost.
3173
3174    Models cost for a reduction operation, including the vector ops
3175    generated within the strip-mine loop, the initial definition before
3176    the loop, and the epilogue code that must be generated.  */
3177
3178 static bool
3179 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3180                            int ncopies)
3181 {
3182   int prologue_cost = 0, epilogue_cost = 0;
3183   enum tree_code code;
3184   optab optab;
3185   tree vectype;
3186   gimple stmt, orig_stmt;
3187   tree reduction_op;
3188   machine_mode mode;
3189   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3190   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3191   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3192
3193   /* Cost of reduction op inside loop.  */
3194   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3195                                         stmt_info, 0, vect_body);
3196   stmt = STMT_VINFO_STMT (stmt_info);
3197
3198   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3199     {
3200     case GIMPLE_SINGLE_RHS:
3201       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3202       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3203       break;
3204     case GIMPLE_UNARY_RHS:
3205       reduction_op = gimple_assign_rhs1 (stmt);
3206       break;
3207     case GIMPLE_BINARY_RHS:
3208       reduction_op = gimple_assign_rhs2 (stmt);
3209       break;
3210     case GIMPLE_TERNARY_RHS:
3211       reduction_op = gimple_assign_rhs3 (stmt);
3212       break;
3213     default:
3214       gcc_unreachable ();
3215     }
3216
3217   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3218   if (!vectype)
3219     {
3220       if (dump_enabled_p ())
3221         {
3222           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223                            "unsupported data-type ");
3224           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3225                              TREE_TYPE (reduction_op));
3226           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3227         }
3228       return false;
3229    }
3230
3231   mode = TYPE_MODE (vectype);
3232   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3233
3234   if (!orig_stmt)
3235     orig_stmt = STMT_VINFO_STMT (stmt_info);
3236
3237   code = gimple_assign_rhs_code (orig_stmt);
3238
3239   /* Add in cost for initial definition.  */
3240   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3241                                   stmt_info, 0, vect_prologue);
3242
3243   /* Determine cost of epilogue code.
3244
3245      We have a reduction operator that will reduce the vector in one statement.
3246      Also requires scalar extract.  */
3247
3248   if (!nested_in_vect_loop_p (loop, orig_stmt))
3249     {
3250       if (reduc_code != ERROR_MARK)
3251         {
3252           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3253                                           stmt_info, 0, vect_epilogue);
3254           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3255                                           stmt_info, 0, vect_epilogue);
3256         }
3257       else
3258         {
3259           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3260           tree bitsize =
3261             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3262           int element_bitsize = tree_to_uhwi (bitsize);
3263           int nelements = vec_size_in_bits / element_bitsize;
3264
3265           optab = optab_for_tree_code (code, vectype, optab_default);
3266
3267           /* We have a whole vector shift available.  */
3268           if (VECTOR_MODE_P (mode)
3269               && optab_handler (optab, mode) != CODE_FOR_nothing
3270               && have_whole_vector_shift (mode))
3271             {
3272               /* Final reduction via vector shifts and the reduction operator.
3273                  Also requires scalar extract.  */
3274               epilogue_cost += add_stmt_cost (target_cost_data,
3275                                               exact_log2 (nelements) * 2,
3276                                               vector_stmt, stmt_info, 0,
3277                                               vect_epilogue);
3278               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3279                                               vec_to_scalar, stmt_info, 0,
3280                                               vect_epilogue);
3281             }
3282           else
3283             /* Use extracts and reduction op for final reduction.  For N
3284                elements, we have N extracts and N-1 reduction ops.  */
3285             epilogue_cost += add_stmt_cost (target_cost_data,
3286                                             nelements + nelements - 1,
3287                                             vector_stmt, stmt_info, 0,
3288                                             vect_epilogue);
3289         }
3290     }
3291
3292   if (dump_enabled_p ())
3293     dump_printf (MSG_NOTE,
3294                  "vect_model_reduction_cost: inside_cost = %d, "
3295                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3296                  prologue_cost, epilogue_cost);
3297
3298   return true;
3299 }
3300
3301
3302 /* Function vect_model_induction_cost.
3303
3304    Models cost for induction operations.  */
3305
3306 static void
3307 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3308 {
3309   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3310   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3311   unsigned inside_cost, prologue_cost;
3312
3313   /* loop cost for vec_loop.  */
3314   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3315                                stmt_info, 0, vect_body);
3316
3317   /* prologue cost for vec_init and vec_step.  */
3318   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3319                                  stmt_info, 0, vect_prologue);
3320
3321   if (dump_enabled_p ())
3322     dump_printf_loc (MSG_NOTE, vect_location,
3323                      "vect_model_induction_cost: inside_cost = %d, "
3324                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3325 }
3326
3327
3328 /* Function get_initial_def_for_induction
3329
3330    Input:
3331    STMT - a stmt that performs an induction operation in the loop.
3332    IV_PHI - the initial value of the induction variable
3333
3334    Output:
3335    Return a vector variable, initialized with the first VF values of
3336    the induction variable.  E.g., for an iv with IV_PHI='X' and
3337    evolution S, for a vector of 4 units, we want to return:
3338    [X, X + S, X + 2*S, X + 3*S].  */
3339
3340 static tree
3341 get_initial_def_for_induction (gimple iv_phi)
3342 {
3343   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3344   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3345   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3346   tree vectype;
3347   int nunits;
3348   edge pe = loop_preheader_edge (loop);
3349   struct loop *iv_loop;
3350   basic_block new_bb;
3351   tree new_vec, vec_init, vec_step, t;
3352   tree new_var;
3353   tree new_name;
3354   gimple init_stmt, new_stmt;
3355   gphi *induction_phi;
3356   tree induc_def, vec_def, vec_dest;
3357   tree init_expr, step_expr;
3358   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3359   int i;
3360   int ncopies;
3361   tree expr;
3362   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3363   bool nested_in_vect_loop = false;
3364   gimple_seq stmts = NULL;
3365   imm_use_iterator imm_iter;
3366   use_operand_p use_p;
3367   gimple exit_phi;
3368   edge latch_e;
3369   tree loop_arg;
3370   gimple_stmt_iterator si;
3371   basic_block bb = gimple_bb (iv_phi);
3372   tree stepvectype;
3373   tree resvectype;
3374
3375   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3376   if (nested_in_vect_loop_p (loop, iv_phi))
3377     {
3378       nested_in_vect_loop = true;
3379       iv_loop = loop->inner;
3380     }
3381   else
3382     iv_loop = loop;
3383   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3384
3385   latch_e = loop_latch_edge (iv_loop);
3386   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3387
3388   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3389   gcc_assert (step_expr != NULL_TREE);
3390
3391   pe = loop_preheader_edge (iv_loop);
3392   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3393                                      loop_preheader_edge (iv_loop));
3394
3395   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3396   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3397   gcc_assert (vectype);
3398   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3399   ncopies = vf / nunits;
3400
3401   gcc_assert (phi_info);
3402   gcc_assert (ncopies >= 1);
3403
3404   /* Convert the step to the desired type.  */
3405   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3406                                                   step_expr),
3407                                     &stmts, true, NULL_TREE);
3408   if (stmts)
3409     {
3410       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3411       gcc_assert (!new_bb);
3412     }
3413
3414   /* Find the first insertion point in the BB.  */
3415   si = gsi_after_labels (bb);
3416
3417   /* Create the vector that holds the initial_value of the induction.  */
3418   if (nested_in_vect_loop)
3419     {
3420       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3421          been created during vectorization of previous stmts.  We obtain it
3422          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3423       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3424       /* If the initial value is not of proper type, convert it.  */
3425       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3426         {
3427           new_stmt
3428             = gimple_build_assign (vect_get_new_vect_var (vectype,
3429                                                           vect_simple_var,
3430                                                           "vec_iv_"),
3431                                    VIEW_CONVERT_EXPR,
3432                                    build1 (VIEW_CONVERT_EXPR, vectype,
3433                                            vec_init));
3434           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3435           gimple_assign_set_lhs (new_stmt, vec_init);
3436           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3437                                                  new_stmt);
3438           gcc_assert (!new_bb);
3439           set_vinfo_for_stmt (new_stmt,
3440                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3441         }
3442     }
3443   else
3444     {
3445       vec<constructor_elt, va_gc> *v;
3446
3447       /* iv_loop is the loop to be vectorized. Create:
3448          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3449       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3450                                        vect_scalar_var, "var_");
3451       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3452                                                      init_expr),
3453                                        &stmts, false, new_var);
3454       if (stmts)
3455         {
3456           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3457           gcc_assert (!new_bb);
3458         }
3459
3460       vec_alloc (v, nunits);
3461       bool constant_p = is_gimple_min_invariant (new_name);
3462       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3463       for (i = 1; i < nunits; i++)
3464         {
3465           /* Create: new_name_i = new_name + step_expr  */
3466           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3467                                   new_name, step_expr);
3468           if (!is_gimple_min_invariant (new_name))
3469             {
3470               init_stmt = gimple_build_assign (new_var, new_name);
3471               new_name = make_ssa_name (new_var, init_stmt);
3472               gimple_assign_set_lhs (init_stmt, new_name);
3473               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3474               gcc_assert (!new_bb);
3475               if (dump_enabled_p ())
3476                 {
3477                   dump_printf_loc (MSG_NOTE, vect_location,
3478                                    "created new init_stmt: ");
3479                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3480                   dump_printf (MSG_NOTE, "\n");
3481                 }
3482               constant_p = false;
3483             }
3484           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3485         }
3486       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3487       if (constant_p)
3488         new_vec = build_vector_from_ctor (vectype, v);
3489       else
3490         new_vec = build_constructor (vectype, v);
3491       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3492     }
3493
3494
3495   /* Create the vector that holds the step of the induction.  */
3496   if (nested_in_vect_loop)
3497     /* iv_loop is nested in the loop to be vectorized. Generate:
3498        vec_step = [S, S, S, S]  */
3499     new_name = step_expr;
3500   else
3501     {
3502       /* iv_loop is the loop to be vectorized. Generate:
3503           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3504       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3505         {
3506           expr = build_int_cst (integer_type_node, vf);
3507           expr = fold_convert (TREE_TYPE (step_expr), expr);
3508         }
3509       else
3510         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3511       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3512                               expr, step_expr);
3513       if (TREE_CODE (step_expr) == SSA_NAME)
3514         new_name = vect_init_vector (iv_phi, new_name,
3515                                      TREE_TYPE (step_expr), NULL);
3516     }
3517
3518   t = unshare_expr (new_name);
3519   gcc_assert (CONSTANT_CLASS_P (new_name)
3520               || TREE_CODE (new_name) == SSA_NAME);
3521   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3522   gcc_assert (stepvectype);
3523   new_vec = build_vector_from_val (stepvectype, t);
3524   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3525
3526
3527   /* Create the following def-use cycle:
3528      loop prolog:
3529          vec_init = ...
3530          vec_step = ...
3531      loop:
3532          vec_iv = PHI <vec_init, vec_loop>
3533          ...
3534          STMT
3535          ...
3536          vec_loop = vec_iv + vec_step;  */
3537
3538   /* Create the induction-phi that defines the induction-operand.  */
3539   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3540   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3541   set_vinfo_for_stmt (induction_phi,
3542                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3543   induc_def = PHI_RESULT (induction_phi);
3544
3545   /* Create the iv update inside the loop  */
3546   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3547   vec_def = make_ssa_name (vec_dest, new_stmt);
3548   gimple_assign_set_lhs (new_stmt, vec_def);
3549   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3550   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3551                                                    NULL));
3552
3553   /* Set the arguments of the phi node:  */
3554   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3555   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3556                UNKNOWN_LOCATION);
3557
3558
3559   /* In case that vectorization factor (VF) is bigger than the number
3560      of elements that we can fit in a vectype (nunits), we have to generate
3561      more than one vector stmt - i.e - we need to "unroll" the
3562      vector stmt by a factor VF/nunits.  For more details see documentation
3563      in vectorizable_operation.  */
3564
3565   if (ncopies > 1)
3566     {
3567       stmt_vec_info prev_stmt_vinfo;
3568       /* FORNOW. This restriction should be relaxed.  */
3569       gcc_assert (!nested_in_vect_loop);
3570
3571       /* Create the vector that holds the step of the induction.  */
3572       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3573         {
3574           expr = build_int_cst (integer_type_node, nunits);
3575           expr = fold_convert (TREE_TYPE (step_expr), expr);
3576         }
3577       else
3578         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3579       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3580                               expr, step_expr);
3581       if (TREE_CODE (step_expr) == SSA_NAME)
3582         new_name = vect_init_vector (iv_phi, new_name,
3583                                      TREE_TYPE (step_expr), NULL);
3584       t = unshare_expr (new_name);
3585       gcc_assert (CONSTANT_CLASS_P (new_name)
3586                   || TREE_CODE (new_name) == SSA_NAME);
3587       new_vec = build_vector_from_val (stepvectype, t);
3588       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3589
3590       vec_def = induc_def;
3591       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3592       for (i = 1; i < ncopies; i++)
3593         {
3594           /* vec_i = vec_prev + vec_step  */
3595           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3596                                           vec_def, vec_step);
3597           vec_def = make_ssa_name (vec_dest, new_stmt);
3598           gimple_assign_set_lhs (new_stmt, vec_def);
3599
3600           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3601           if (!useless_type_conversion_p (resvectype, vectype))
3602             {
3603               new_stmt
3604                 = gimple_build_assign
3605                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3606                                                 "vec_iv_"),
3607                          VIEW_CONVERT_EXPR,
3608                          build1 (VIEW_CONVERT_EXPR, resvectype,
3609                                  gimple_assign_lhs (new_stmt)));
3610               gimple_assign_set_lhs (new_stmt,
3611                                      make_ssa_name
3612                                        (gimple_assign_lhs (new_stmt), new_stmt));
3613               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3614             }
3615           set_vinfo_for_stmt (new_stmt,
3616                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3617           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3618           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3619         }
3620     }
3621
3622   if (nested_in_vect_loop)
3623     {
3624       /* Find the loop-closed exit-phi of the induction, and record
3625          the final vector of induction results:  */
3626       exit_phi = NULL;
3627       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3628         {
3629           gimple use_stmt = USE_STMT (use_p);
3630           if (is_gimple_debug (use_stmt))
3631             continue;
3632
3633           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3634             {
3635               exit_phi = use_stmt;
3636               break;
3637             }
3638         }
3639       if (exit_phi)
3640         {
3641           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3642           /* FORNOW. Currently not supporting the case that an inner-loop induction
3643              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3644           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3645                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3646
3647           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3648           if (dump_enabled_p ())
3649             {
3650               dump_printf_loc (MSG_NOTE, vect_location,
3651                                "vector of inductions after inner-loop:");
3652               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3653               dump_printf (MSG_NOTE, "\n");
3654             }
3655         }
3656     }
3657
3658
3659   if (dump_enabled_p ())
3660     {
3661       dump_printf_loc (MSG_NOTE, vect_location,
3662                        "transform induction: created def-use cycle: ");
3663       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3664       dump_printf (MSG_NOTE, "\n");
3665       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3666                         SSA_NAME_DEF_STMT (vec_def), 0);
3667       dump_printf (MSG_NOTE, "\n");
3668     }
3669
3670   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3671   if (!useless_type_conversion_p (resvectype, vectype))
3672     {
3673       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3674                                                              vect_simple_var,
3675                                                              "vec_iv_"),
3676                                       VIEW_CONVERT_EXPR,
3677                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3678                                               induc_def));
3679       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3680       gimple_assign_set_lhs (new_stmt, induc_def);
3681       si = gsi_after_labels (bb);
3682       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3683       set_vinfo_for_stmt (new_stmt,
3684                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3685       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3686         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3687     }
3688
3689   return induc_def;
3690 }
3691
3692
3693 /* Function get_initial_def_for_reduction
3694
3695    Input:
3696    STMT - a stmt that performs a reduction operation in the loop.
3697    INIT_VAL - the initial value of the reduction variable
3698
3699    Output:
3700    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3701         of the reduction (used for adjusting the epilog - see below).
3702    Return a vector variable, initialized according to the operation that STMT
3703         performs. This vector will be used as the initial value of the
3704         vector of partial results.
3705
3706    Option1 (adjust in epilog): Initialize the vector as follows:
3707      add/bit or/xor:    [0,0,...,0,0]
3708      mult/bit and:      [1,1,...,1,1]
3709      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3710    and when necessary (e.g. add/mult case) let the caller know
3711    that it needs to adjust the result by init_val.
3712
3713    Option2: Initialize the vector as follows:
3714      add/bit or/xor:    [init_val,0,0,...,0]
3715      mult/bit and:      [init_val,1,1,...,1]
3716      min/max/cond_expr: [init_val,init_val,...,init_val]
3717    and no adjustments are needed.
3718
3719    For example, for the following code:
3720
3721    s = init_val;
3722    for (i=0;i<n;i++)
3723      s = s + a[i];
3724
3725    STMT is 's = s + a[i]', and the reduction variable is 's'.
3726    For a vector of 4 units, we want to return either [0,0,0,init_val],
3727    or [0,0,0,0] and let the caller know that it needs to adjust
3728    the result at the end by 'init_val'.
3729
3730    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3731    initialization vector is simpler (same element in all entries), if
3732    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3733
3734    A cost model should help decide between these two schemes.  */
3735
3736 tree
3737 get_initial_def_for_reduction (gimple stmt, tree init_val,
3738                                tree *adjustment_def)
3739 {
3740   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3741   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3742   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3743   tree scalar_type = TREE_TYPE (init_val);
3744   tree vectype = get_vectype_for_scalar_type (scalar_type);
3745   int nunits;
3746   enum tree_code code = gimple_assign_rhs_code (stmt);
3747   tree def_for_init;
3748   tree init_def;
3749   tree *elts;
3750   int i;
3751   bool nested_in_vect_loop = false;
3752   tree init_value;
3753   REAL_VALUE_TYPE real_init_val = dconst0;
3754   int int_init_val = 0;
3755   gimple def_stmt = NULL;
3756
3757   gcc_assert (vectype);
3758   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3759
3760   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3761               || SCALAR_FLOAT_TYPE_P (scalar_type));
3762
3763   if (nested_in_vect_loop_p (loop, stmt))
3764     nested_in_vect_loop = true;
3765   else
3766     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3767
3768   /* In case of double reduction we only create a vector variable to be put
3769      in the reduction phi node.  The actual statement creation is done in
3770      vect_create_epilog_for_reduction.  */
3771   if (adjustment_def && nested_in_vect_loop
3772       && TREE_CODE (init_val) == SSA_NAME
3773       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3774       && gimple_code (def_stmt) == GIMPLE_PHI
3775       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3776       && vinfo_for_stmt (def_stmt)
3777       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3778           == vect_double_reduction_def)
3779     {
3780       *adjustment_def = NULL;
3781       return vect_create_destination_var (init_val, vectype);
3782     }
3783
3784   if (TREE_CONSTANT (init_val))
3785     {
3786       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3787         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3788       else
3789         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3790     }
3791   else
3792     init_value = init_val;
3793
3794   switch (code)
3795     {
3796       case WIDEN_SUM_EXPR:
3797       case DOT_PROD_EXPR:
3798       case SAD_EXPR:
3799       case PLUS_EXPR:
3800       case MINUS_EXPR:
3801       case BIT_IOR_EXPR:
3802       case BIT_XOR_EXPR:
3803       case MULT_EXPR:
3804       case BIT_AND_EXPR:
3805         /* ADJUSMENT_DEF is NULL when called from
3806            vect_create_epilog_for_reduction to vectorize double reduction.  */
3807         if (adjustment_def)
3808           {
3809             if (nested_in_vect_loop)
3810               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3811                                                               NULL);
3812             else
3813               *adjustment_def = init_val;
3814           }
3815
3816         if (code == MULT_EXPR)
3817           {
3818             real_init_val = dconst1;
3819             int_init_val = 1;
3820           }
3821
3822         if (code == BIT_AND_EXPR)
3823           int_init_val = -1;
3824
3825         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3826           def_for_init = build_real (scalar_type, real_init_val);
3827         else
3828           def_for_init = build_int_cst (scalar_type, int_init_val);
3829
3830         /* Create a vector of '0' or '1' except the first element.  */
3831         elts = XALLOCAVEC (tree, nunits);
3832         for (i = nunits - 2; i >= 0; --i)
3833           elts[i + 1] = def_for_init;
3834
3835         /* Option1: the first element is '0' or '1' as well.  */
3836         if (adjustment_def)
3837           {
3838             elts[0] = def_for_init;
3839             init_def = build_vector (vectype, elts);
3840             break;
3841           }
3842
3843         /* Option2: the first element is INIT_VAL.  */
3844         elts[0] = init_val;
3845         if (TREE_CONSTANT (init_val))
3846           init_def = build_vector (vectype, elts);
3847         else
3848           {
3849             vec<constructor_elt, va_gc> *v;
3850             vec_alloc (v, nunits);
3851             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3852             for (i = 1; i < nunits; ++i)
3853               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3854             init_def = build_constructor (vectype, v);
3855           }
3856
3857         break;
3858
3859       case MIN_EXPR:
3860       case MAX_EXPR:
3861       case COND_EXPR:
3862         if (adjustment_def)
3863           {
3864             *adjustment_def = NULL_TREE;
3865             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3866             break;
3867           }
3868
3869         init_def = build_vector_from_val (vectype, init_value);
3870         break;
3871
3872       default:
3873         gcc_unreachable ();
3874     }
3875
3876   return init_def;
3877 }
3878
3879 /* Function vect_create_epilog_for_reduction
3880
3881    Create code at the loop-epilog to finalize the result of a reduction
3882    computation.
3883
3884    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3885      reduction statements.
3886    STMT is the scalar reduction stmt that is being vectorized.
3887    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3888      number of elements that we can fit in a vectype (nunits).  In this case
3889      we have to generate more than one vector stmt - i.e - we need to "unroll"
3890      the vector stmt by a factor VF/nunits.  For more details see documentation
3891      in vectorizable_operation.
3892    REDUC_CODE is the tree-code for the epilog reduction.
3893    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3894      computation.
3895    REDUC_INDEX is the index of the operand in the right hand side of the
3896      statement that is defined by REDUCTION_PHI.
3897    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3898    SLP_NODE is an SLP node containing a group of reduction statements. The
3899      first one in this group is STMT.
3900
3901    This function:
3902    1. Creates the reduction def-use cycles: sets the arguments for
3903       REDUCTION_PHIS:
3904       The loop-entry argument is the vectorized initial-value of the reduction.
3905       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3906       sums.
3907    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3908       by applying the operation specified by REDUC_CODE if available, or by
3909       other means (whole-vector shifts or a scalar loop).
3910       The function also creates a new phi node at the loop exit to preserve
3911       loop-closed form, as illustrated below.
3912
3913      The flow at the entry to this function:
3914
3915         loop:
3916           vec_def = phi <null, null>            # REDUCTION_PHI
3917           VECT_DEF = vector_stmt                # vectorized form of STMT
3918           s_loop = scalar_stmt                  # (scalar) STMT
3919         loop_exit:
3920           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3921           use <s_out0>
3922           use <s_out0>
3923
3924      The above is transformed by this function into:
3925
3926         loop:
3927           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3928           VECT_DEF = vector_stmt                # vectorized form of STMT
3929           s_loop = scalar_stmt                  # (scalar) STMT
3930         loop_exit:
3931           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3932           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3933           v_out2 = reduce <v_out1>
3934           s_out3 = extract_field <v_out2, 0>
3935           s_out4 = adjust_result <s_out3>
3936           use <s_out4>
3937           use <s_out4>
3938 */
3939
3940 static void
3941 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3942                                   int ncopies, enum tree_code reduc_code,
3943                                   vec<gimple> reduction_phis,
3944                                   int reduc_index, bool double_reduc,
3945                                   slp_tree slp_node)
3946 {
3947   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3948   stmt_vec_info prev_phi_info;
3949   tree vectype;
3950   machine_mode mode;
3951   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3952   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3953   basic_block exit_bb;
3954   tree scalar_dest;
3955   tree scalar_type;
3956   gimple new_phi = NULL, phi;
3957   gimple_stmt_iterator exit_gsi;
3958   tree vec_dest;
3959   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3960   gimple epilog_stmt = NULL;
3961   enum tree_code code = gimple_assign_rhs_code (stmt);
3962   gimple exit_phi;
3963   tree bitsize;
3964   tree adjustment_def = NULL;
3965   tree vec_initial_def = NULL;
3966   tree reduction_op, expr, def;
3967   tree orig_name, scalar_result;
3968   imm_use_iterator imm_iter, phi_imm_iter;
3969   use_operand_p use_p, phi_use_p;
3970   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3971   bool nested_in_vect_loop = false;
3972   auto_vec<gimple> new_phis;
3973   auto_vec<gimple> inner_phis;
3974   enum vect_def_type dt = vect_unknown_def_type;
3975   int j, i;
3976   auto_vec<tree> scalar_results;
3977   unsigned int group_size = 1, k, ratio;
3978   auto_vec<tree> vec_initial_defs;
3979   auto_vec<gimple> phis;
3980   bool slp_reduc = false;
3981   tree new_phi_result;
3982   gimple inner_phi = NULL;
3983
3984   if (slp_node)
3985     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3986
3987   if (nested_in_vect_loop_p (loop, stmt))
3988     {
3989       outer_loop = loop;
3990       loop = loop->inner;
3991       nested_in_vect_loop = true;
3992       gcc_assert (!slp_node);
3993     }
3994
3995   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3996     {
3997     case GIMPLE_SINGLE_RHS:
3998       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3999                   == ternary_op);
4000       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
4001       break;
4002     case GIMPLE_UNARY_RHS:
4003       reduction_op = gimple_assign_rhs1 (stmt);
4004       break;
4005     case GIMPLE_BINARY_RHS:
4006       reduction_op = reduc_index ?
4007                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
4008       break;
4009     case GIMPLE_TERNARY_RHS:
4010       reduction_op = gimple_op (stmt, reduc_index + 1);
4011       break;
4012     default:
4013       gcc_unreachable ();
4014     }
4015
4016   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4017   gcc_assert (vectype);
4018   mode = TYPE_MODE (vectype);
4019
4020   /* 1. Create the reduction def-use cycle:
4021      Set the arguments of REDUCTION_PHIS, i.e., transform
4022
4023         loop:
4024           vec_def = phi <null, null>            # REDUCTION_PHI
4025           VECT_DEF = vector_stmt                # vectorized form of STMT
4026           ...
4027
4028      into:
4029
4030         loop:
4031           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4032           VECT_DEF = vector_stmt                # vectorized form of STMT
4033           ...
4034
4035      (in case of SLP, do it for all the phis). */
4036
4037   /* Get the loop-entry arguments.  */
4038   if (slp_node)
4039     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4040                        NULL, slp_node, reduc_index);
4041   else
4042     {
4043       vec_initial_defs.create (1);
4044      /* For the case of reduction, vect_get_vec_def_for_operand returns
4045         the scalar def before the loop, that defines the initial value
4046         of the reduction variable.  */
4047       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4048                                                       &adjustment_def);
4049       vec_initial_defs.quick_push (vec_initial_def);
4050     }
4051
4052   /* Set phi nodes arguments.  */
4053   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4054     {
4055       tree vec_init_def, def;
4056       gimple_seq stmts;
4057       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4058                                            true, NULL_TREE);
4059       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4060       def = vect_defs[i];
4061       for (j = 0; j < ncopies; j++)
4062         {
4063           /* Set the loop-entry arg of the reduction-phi.  */
4064           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4065                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4066
4067           /* Set the loop-latch arg for the reduction-phi.  */
4068           if (j > 0)
4069             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4070
4071           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4072                        UNKNOWN_LOCATION);
4073
4074           if (dump_enabled_p ())
4075             {
4076               dump_printf_loc (MSG_NOTE, vect_location,
4077                                "transform reduction: created def-use cycle: ");
4078               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4079               dump_printf (MSG_NOTE, "\n");
4080               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4081               dump_printf (MSG_NOTE, "\n");
4082             }
4083
4084           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4085         }
4086     }
4087
4088   /* 2. Create epilog code.
4089         The reduction epilog code operates across the elements of the vector
4090         of partial results computed by the vectorized loop.
4091         The reduction epilog code consists of:
4092
4093         step 1: compute the scalar result in a vector (v_out2)
4094         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4095         step 3: adjust the scalar result (s_out3) if needed.
4096
4097         Step 1 can be accomplished using one the following three schemes:
4098           (scheme 1) using reduc_code, if available.
4099           (scheme 2) using whole-vector shifts, if available.
4100           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4101                      combined.
4102
4103           The overall epilog code looks like this:
4104
4105           s_out0 = phi <s_loop>         # original EXIT_PHI
4106           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4107           v_out2 = reduce <v_out1>              # step 1
4108           s_out3 = extract_field <v_out2, 0>    # step 2
4109           s_out4 = adjust_result <s_out3>       # step 3
4110
4111           (step 3 is optional, and steps 1 and 2 may be combined).
4112           Lastly, the uses of s_out0 are replaced by s_out4.  */
4113
4114
4115   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4116          v_out1 = phi <VECT_DEF>
4117          Store them in NEW_PHIS.  */
4118
4119   exit_bb = single_exit (loop)->dest;
4120   prev_phi_info = NULL;
4121   new_phis.create (vect_defs.length ());
4122   FOR_EACH_VEC_ELT (vect_defs, i, def)
4123     {
4124       for (j = 0; j < ncopies; j++)
4125         {
4126           tree new_def = copy_ssa_name (def);
4127           phi = create_phi_node (new_def, exit_bb);
4128           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4129           if (j == 0)
4130             new_phis.quick_push (phi);
4131           else
4132             {
4133               def = vect_get_vec_def_for_stmt_copy (dt, def);
4134               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4135             }
4136
4137           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4138           prev_phi_info = vinfo_for_stmt (phi);
4139         }
4140     }
4141
4142   /* The epilogue is created for the outer-loop, i.e., for the loop being
4143      vectorized.  Create exit phis for the outer loop.  */
4144   if (double_reduc)
4145     {
4146       loop = outer_loop;
4147       exit_bb = single_exit (loop)->dest;
4148       inner_phis.create (vect_defs.length ());
4149       FOR_EACH_VEC_ELT (new_phis, i, phi)
4150         {
4151           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4152           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4153           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4154                            PHI_RESULT (phi));
4155           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4156                                                             loop_vinfo, NULL));
4157           inner_phis.quick_push (phi);
4158           new_phis[i] = outer_phi;
4159           prev_phi_info = vinfo_for_stmt (outer_phi);
4160           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4161             {
4162               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4163               new_result = copy_ssa_name (PHI_RESULT (phi));
4164               outer_phi = create_phi_node (new_result, exit_bb);
4165               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4166                                PHI_RESULT (phi));
4167               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4168                                                         loop_vinfo, NULL));
4169               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4170               prev_phi_info = vinfo_for_stmt (outer_phi);
4171             }
4172         }
4173     }
4174
4175   exit_gsi = gsi_after_labels (exit_bb);
4176
4177   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4178          (i.e. when reduc_code is not available) and in the final adjustment
4179          code (if needed).  Also get the original scalar reduction variable as
4180          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4181          represents a reduction pattern), the tree-code and scalar-def are
4182          taken from the original stmt that the pattern-stmt (STMT) replaces.
4183          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4184          are taken from STMT.  */
4185
4186   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4187   if (!orig_stmt)
4188     {
4189       /* Regular reduction  */
4190       orig_stmt = stmt;
4191     }
4192   else
4193     {
4194       /* Reduction pattern  */
4195       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4196       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4197       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4198     }
4199
4200   code = gimple_assign_rhs_code (orig_stmt);
4201   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4202      partial results are added and not subtracted.  */
4203   if (code == MINUS_EXPR)
4204     code = PLUS_EXPR;
4205
4206   scalar_dest = gimple_assign_lhs (orig_stmt);
4207   scalar_type = TREE_TYPE (scalar_dest);
4208   scalar_results.create (group_size);
4209   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4210   bitsize = TYPE_SIZE (scalar_type);
4211
4212   /* In case this is a reduction in an inner-loop while vectorizing an outer
4213      loop - we don't need to extract a single scalar result at the end of the
4214      inner-loop (unless it is double reduction, i.e., the use of reduction is
4215      outside the outer-loop).  The final vector of partial results will be used
4216      in the vectorized outer-loop, or reduced to a scalar result at the end of
4217      the outer-loop.  */
4218   if (nested_in_vect_loop && !double_reduc)
4219     goto vect_finalize_reduction;
4220
4221   /* SLP reduction without reduction chain, e.g.,
4222      # a1 = phi <a2, a0>
4223      # b1 = phi <b2, b0>
4224      a2 = operation (a1)
4225      b2 = operation (b1)  */
4226   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4227
4228   /* In case of reduction chain, e.g.,
4229      # a1 = phi <a3, a0>
4230      a2 = operation (a1)
4231      a3 = operation (a2),
4232
4233      we may end up with more than one vector result.  Here we reduce them to
4234      one vector.  */
4235   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4236     {
4237       tree first_vect = PHI_RESULT (new_phis[0]);
4238       tree tmp;
4239       gassign *new_vec_stmt = NULL;
4240
4241       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4242       for (k = 1; k < new_phis.length (); k++)
4243         {
4244           gimple next_phi = new_phis[k];
4245           tree second_vect = PHI_RESULT (next_phi);
4246
4247           tmp = build2 (code, vectype,  first_vect, second_vect);
4248           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4249           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4250           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4251           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4252         }
4253
4254       new_phi_result = first_vect;
4255       if (new_vec_stmt)
4256         {
4257           new_phis.truncate (0);
4258           new_phis.safe_push (new_vec_stmt);
4259         }
4260     }
4261   else
4262     new_phi_result = PHI_RESULT (new_phis[0]);
4263
4264   /* 2.3 Create the reduction code, using one of the three schemes described
4265          above. In SLP we simply need to extract all the elements from the
4266          vector (without reducing them), so we use scalar shifts.  */
4267   if (reduc_code != ERROR_MARK && !slp_reduc)
4268     {
4269       tree tmp;
4270       tree vec_elem_type;
4271
4272       /*** Case 1:  Create:
4273            v_out2 = reduc_expr <v_out1>  */
4274
4275       if (dump_enabled_p ())
4276         dump_printf_loc (MSG_NOTE, vect_location,
4277                          "Reduce using direct vector reduction.\n");
4278
4279       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4280       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4281         {
4282           tree tmp_dest =
4283               vect_create_destination_var (scalar_dest, vec_elem_type);
4284           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4285           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4286           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4287           gimple_assign_set_lhs (epilog_stmt, new_temp);
4288           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4289
4290           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4291         }
4292       else
4293         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4294       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4295       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4296       gimple_assign_set_lhs (epilog_stmt, new_temp);
4297       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4298       scalar_results.safe_push (new_temp);
4299     }
4300   else
4301     {
4302       bool reduce_with_shift = have_whole_vector_shift (mode);
4303       int element_bitsize = tree_to_uhwi (bitsize);
4304       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4305       tree vec_temp;
4306
4307       /* Regardless of whether we have a whole vector shift, if we're
4308          emulating the operation via tree-vect-generic, we don't want
4309          to use it.  Only the first round of the reduction is likely
4310          to still be profitable via emulation.  */
4311       /* ??? It might be better to emit a reduction tree code here, so that
4312          tree-vect-generic can expand the first round via bit tricks.  */
4313       if (!VECTOR_MODE_P (mode))
4314         reduce_with_shift = false;
4315       else
4316         {
4317           optab optab = optab_for_tree_code (code, vectype, optab_default);
4318           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4319             reduce_with_shift = false;
4320         }
4321
4322       if (reduce_with_shift && !slp_reduc)
4323         {
4324           int nelements = vec_size_in_bits / element_bitsize;
4325           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4326
4327           int elt_offset;
4328
4329           tree zero_vec = build_zero_cst (vectype);
4330           /*** Case 2: Create:
4331              for (offset = nelements/2; offset >= 1; offset/=2)
4332                 {
4333                   Create:  va' = vec_shift <va, offset>
4334                   Create:  va = vop <va, va'>
4335                 }  */
4336
4337           tree rhs;
4338
4339           if (dump_enabled_p ())
4340             dump_printf_loc (MSG_NOTE, vect_location,
4341                              "Reduce using vector shifts\n");
4342
4343           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4344           new_temp = new_phi_result;
4345           for (elt_offset = nelements / 2;
4346                elt_offset >= 1;
4347                elt_offset /= 2)
4348             {
4349               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4350               tree mask = vect_gen_perm_mask_any (vectype, sel);
4351               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4352                                                  new_temp, zero_vec, mask);
4353               new_name = make_ssa_name (vec_dest, epilog_stmt);
4354               gimple_assign_set_lhs (epilog_stmt, new_name);
4355               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4356
4357               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4358                                                  new_temp);
4359               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4360               gimple_assign_set_lhs (epilog_stmt, new_temp);
4361               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4362             }
4363
4364           /* 2.4  Extract the final scalar result.  Create:
4365              s_out3 = extract_field <v_out2, bitpos>  */
4366
4367           if (dump_enabled_p ())
4368             dump_printf_loc (MSG_NOTE, vect_location,
4369                              "extract scalar result\n");
4370
4371           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4372                         bitsize, bitsize_zero_node);
4373           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4374           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4375           gimple_assign_set_lhs (epilog_stmt, new_temp);
4376           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4377           scalar_results.safe_push (new_temp);
4378         }
4379       else
4380         {
4381           /*** Case 3: Create:
4382              s = extract_field <v_out2, 0>
4383              for (offset = element_size;
4384                   offset < vector_size;
4385                   offset += element_size;)
4386                {
4387                  Create:  s' = extract_field <v_out2, offset>
4388                  Create:  s = op <s, s'>  // For non SLP cases
4389                }  */
4390
4391           if (dump_enabled_p ())
4392             dump_printf_loc (MSG_NOTE, vect_location,
4393                              "Reduce using scalar code.\n");
4394
4395           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4396           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4397             {
4398               int bit_offset;
4399               if (gimple_code (new_phi) == GIMPLE_PHI)
4400                 vec_temp = PHI_RESULT (new_phi);
4401               else
4402                 vec_temp = gimple_assign_lhs (new_phi);
4403               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4404                             bitsize_zero_node);
4405               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4406               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4407               gimple_assign_set_lhs (epilog_stmt, new_temp);
4408               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4409
4410               /* In SLP we don't need to apply reduction operation, so we just
4411                  collect s' values in SCALAR_RESULTS.  */
4412               if (slp_reduc)
4413                 scalar_results.safe_push (new_temp);
4414
4415               for (bit_offset = element_bitsize;
4416                    bit_offset < vec_size_in_bits;
4417                    bit_offset += element_bitsize)
4418                 {
4419                   tree bitpos = bitsize_int (bit_offset);
4420                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4421                                      bitsize, bitpos);
4422
4423                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4424                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4425                   gimple_assign_set_lhs (epilog_stmt, new_name);
4426                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4427
4428                   if (slp_reduc)
4429                     {
4430                       /* In SLP we don't need to apply reduction operation, so
4431                          we just collect s' values in SCALAR_RESULTS.  */
4432                       new_temp = new_name;
4433                       scalar_results.safe_push (new_name);
4434                     }
4435                   else
4436                     {
4437                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4438                                                          new_name, new_temp);
4439                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4440                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4441                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4442                     }
4443                 }
4444             }
4445
4446           /* The only case where we need to reduce scalar results in SLP, is
4447              unrolling.  If the size of SCALAR_RESULTS is greater than
4448              GROUP_SIZE, we reduce them combining elements modulo
4449              GROUP_SIZE.  */
4450           if (slp_reduc)
4451             {
4452               tree res, first_res, new_res;
4453               gimple new_stmt;
4454
4455               /* Reduce multiple scalar results in case of SLP unrolling.  */
4456               for (j = group_size; scalar_results.iterate (j, &res);
4457                    j++)
4458                 {
4459                   first_res = scalar_results[j % group_size];
4460                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4461                                                   first_res, res);
4462                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4463                   gimple_assign_set_lhs (new_stmt, new_res);
4464                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4465                   scalar_results[j % group_size] = new_res;
4466                 }
4467             }
4468           else
4469             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4470             scalar_results.safe_push (new_temp);
4471         }
4472     }
4473
4474 vect_finalize_reduction:
4475
4476   if (double_reduc)
4477     loop = loop->inner;
4478
4479   /* 2.5 Adjust the final result by the initial value of the reduction
4480          variable. (When such adjustment is not needed, then
4481          'adjustment_def' is zero).  For example, if code is PLUS we create:
4482          new_temp = loop_exit_def + adjustment_def  */
4483
4484   if (adjustment_def)
4485     {
4486       gcc_assert (!slp_reduc);
4487       if (nested_in_vect_loop)
4488         {
4489           new_phi = new_phis[0];
4490           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4491           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4492           new_dest = vect_create_destination_var (scalar_dest, vectype);
4493         }
4494       else
4495         {
4496           new_temp = scalar_results[0];
4497           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4498           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4499           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4500         }
4501
4502       epilog_stmt = gimple_build_assign (new_dest, expr);
4503       new_temp = make_ssa_name (new_dest, epilog_stmt);
4504       gimple_assign_set_lhs (epilog_stmt, new_temp);
4505       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4506       if (nested_in_vect_loop)
4507         {
4508           set_vinfo_for_stmt (epilog_stmt,
4509                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4510                                                  NULL));
4511           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4512                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4513
4514           if (!double_reduc)
4515             scalar_results.quick_push (new_temp);
4516           else
4517             scalar_results[0] = new_temp;
4518         }
4519       else
4520         scalar_results[0] = new_temp;
4521
4522       new_phis[0] = epilog_stmt;
4523     }
4524
4525   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4526           phis with new adjusted scalar results, i.e., replace use <s_out0>
4527           with use <s_out4>.
4528
4529      Transform:
4530         loop_exit:
4531           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4532           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4533           v_out2 = reduce <v_out1>
4534           s_out3 = extract_field <v_out2, 0>
4535           s_out4 = adjust_result <s_out3>
4536           use <s_out0>
4537           use <s_out0>
4538
4539      into:
4540
4541         loop_exit:
4542           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4543           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4544           v_out2 = reduce <v_out1>
4545           s_out3 = extract_field <v_out2, 0>
4546           s_out4 = adjust_result <s_out3>
4547           use <s_out4>
4548           use <s_out4> */
4549
4550
4551   /* In SLP reduction chain we reduce vector results into one vector if
4552      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4553      the last stmt in the reduction chain, since we are looking for the loop
4554      exit phi node.  */
4555   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4556     {
4557       scalar_dest = gimple_assign_lhs (
4558                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4559       group_size = 1;
4560     }
4561
4562   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4563      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4564      need to match SCALAR_RESULTS with corresponding statements.  The first
4565      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4566      the first vector stmt, etc.
4567      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4568   if (group_size > new_phis.length ())
4569     {
4570       ratio = group_size / new_phis.length ();
4571       gcc_assert (!(group_size % new_phis.length ()));
4572     }
4573   else
4574     ratio = 1;
4575
4576   for (k = 0; k < group_size; k++)
4577     {
4578       if (k % ratio == 0)
4579         {
4580           epilog_stmt = new_phis[k / ratio];
4581           reduction_phi = reduction_phis[k / ratio];
4582           if (double_reduc)
4583             inner_phi = inner_phis[k / ratio];
4584         }
4585
4586       if (slp_reduc)
4587         {
4588           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4589
4590           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4591           /* SLP statements can't participate in patterns.  */
4592           gcc_assert (!orig_stmt);
4593           scalar_dest = gimple_assign_lhs (current_stmt);
4594         }
4595
4596       phis.create (3);
4597       /* Find the loop-closed-use at the loop exit of the original scalar
4598          result.  (The reduction result is expected to have two immediate uses -
4599          one at the latch block, and one at the loop exit).  */
4600       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4601         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4602             && !is_gimple_debug (USE_STMT (use_p)))
4603           phis.safe_push (USE_STMT (use_p));
4604
4605       /* While we expect to have found an exit_phi because of loop-closed-ssa
4606          form we can end up without one if the scalar cycle is dead.  */
4607
4608       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4609         {
4610           if (outer_loop)
4611             {
4612               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4613               gphi *vect_phi;
4614
4615               /* FORNOW. Currently not supporting the case that an inner-loop
4616                  reduction is not used in the outer-loop (but only outside the
4617                  outer-loop), unless it is double reduction.  */
4618               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4619                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4620                           || double_reduc);
4621
4622               if (double_reduc)
4623                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4624               else
4625                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4626               if (!double_reduc
4627                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4628                       != vect_double_reduction_def)
4629                 continue;
4630
4631               /* Handle double reduction:
4632
4633                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4634                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4635                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4636                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4637
4638                  At that point the regular reduction (stmt2 and stmt3) is
4639                  already vectorized, as well as the exit phi node, stmt4.
4640                  Here we vectorize the phi node of double reduction, stmt1, and
4641                  update all relevant statements.  */
4642
4643               /* Go through all the uses of s2 to find double reduction phi
4644                  node, i.e., stmt1 above.  */
4645               orig_name = PHI_RESULT (exit_phi);
4646               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4647                 {
4648                   stmt_vec_info use_stmt_vinfo;
4649                   stmt_vec_info new_phi_vinfo;
4650                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4651                   basic_block bb = gimple_bb (use_stmt);
4652                   gimple use;
4653
4654                   /* Check that USE_STMT is really double reduction phi
4655                      node.  */
4656                   if (gimple_code (use_stmt) != GIMPLE_PHI
4657                       || gimple_phi_num_args (use_stmt) != 2
4658                       || bb->loop_father != outer_loop)
4659                     continue;
4660                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4661                   if (!use_stmt_vinfo
4662                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4663                           != vect_double_reduction_def)
4664                     continue;
4665
4666                   /* Create vector phi node for double reduction:
4667                      vs1 = phi <vs0, vs2>
4668                      vs1 was created previously in this function by a call to
4669                        vect_get_vec_def_for_operand and is stored in
4670                        vec_initial_def;
4671                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4672                      vs0 is created here.  */
4673
4674                   /* Create vector phi node.  */
4675                   vect_phi = create_phi_node (vec_initial_def, bb);
4676                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4677                                     loop_vec_info_for_loop (outer_loop), NULL);
4678                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4679
4680                   /* Create vs0 - initial def of the double reduction phi.  */
4681                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4682                                              loop_preheader_edge (outer_loop));
4683                   init_def = get_initial_def_for_reduction (stmt,
4684                                                           preheader_arg, NULL);
4685                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4686                                                     vectype, NULL);
4687
4688                   /* Update phi node arguments with vs0 and vs2.  */
4689                   add_phi_arg (vect_phi, vect_phi_init,
4690                                loop_preheader_edge (outer_loop),
4691                                UNKNOWN_LOCATION);
4692                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4693                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4694                   if (dump_enabled_p ())
4695                     {
4696                       dump_printf_loc (MSG_NOTE, vect_location,
4697                                        "created double reduction phi node: ");
4698                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4699                       dump_printf (MSG_NOTE, "\n");
4700                     }
4701
4702                   vect_phi_res = PHI_RESULT (vect_phi);
4703
4704                   /* Replace the use, i.e., set the correct vs1 in the regular
4705                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4706                      loop is redundant.  */
4707                   use = reduction_phi;
4708                   for (j = 0; j < ncopies; j++)
4709                     {
4710                       edge pr_edge = loop_preheader_edge (loop);
4711                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4712                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4713                     }
4714                 }
4715             }
4716         }
4717
4718       phis.release ();
4719       if (nested_in_vect_loop)
4720         {
4721           if (double_reduc)
4722             loop = outer_loop;
4723           else
4724             continue;
4725         }
4726
4727       phis.create (3);
4728       /* Find the loop-closed-use at the loop exit of the original scalar
4729          result.  (The reduction result is expected to have two immediate uses,
4730          one at the latch block, and one at the loop exit).  For double
4731          reductions we are looking for exit phis of the outer loop.  */
4732       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4733         {
4734           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4735             {
4736               if (!is_gimple_debug (USE_STMT (use_p)))
4737                 phis.safe_push (USE_STMT (use_p));
4738             }
4739           else
4740             {
4741               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4742                 {
4743                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4744
4745                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4746                     {
4747                       if (!flow_bb_inside_loop_p (loop,
4748                                              gimple_bb (USE_STMT (phi_use_p)))
4749                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4750                         phis.safe_push (USE_STMT (phi_use_p));
4751                     }
4752                 }
4753             }
4754         }
4755
4756       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4757         {
4758           /* Replace the uses:  */
4759           orig_name = PHI_RESULT (exit_phi);
4760           scalar_result = scalar_results[k];
4761           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4762             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4763               SET_USE (use_p, scalar_result);
4764         }
4765
4766       phis.release ();
4767     }
4768 }
4769
4770
4771 /* Function vectorizable_reduction.
4772
4773    Check if STMT performs a reduction operation that can be vectorized.
4774    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4775    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4776    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4777
4778    This function also handles reduction idioms (patterns) that have been
4779    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4780    of this form:
4781      X = pattern_expr (arg0, arg1, ..., X)
4782    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4783    sequence that had been detected and replaced by the pattern-stmt (STMT).
4784
4785    In some cases of reduction patterns, the type of the reduction variable X is
4786    different than the type of the other arguments of STMT.
4787    In such cases, the vectype that is used when transforming STMT into a vector
4788    stmt is different than the vectype that is used to determine the
4789    vectorization factor, because it consists of a different number of elements
4790    than the actual number of elements that are being operated upon in parallel.
4791
4792    For example, consider an accumulation of shorts into an int accumulator.
4793    On some targets it's possible to vectorize this pattern operating on 8
4794    shorts at a time (hence, the vectype for purposes of determining the
4795    vectorization factor should be V8HI); on the other hand, the vectype that
4796    is used to create the vector form is actually V4SI (the type of the result).
4797
4798    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4799    indicates what is the actual level of parallelism (V8HI in the example), so
4800    that the right vectorization factor would be derived.  This vectype
4801    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4802    be used to create the vectorized stmt.  The right vectype for the vectorized
4803    stmt is obtained from the type of the result X:
4804         get_vectype_for_scalar_type (TREE_TYPE (X))
4805
4806    This means that, contrary to "regular" reductions (or "regular" stmts in
4807    general), the following equation:
4808       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4809    does *NOT* necessarily hold for reduction patterns.  */
4810
4811 bool
4812 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4813                         gimple *vec_stmt, slp_tree slp_node)
4814 {
4815   tree vec_dest;
4816   tree scalar_dest;
4817   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4818   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4819   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4820   tree vectype_in = NULL_TREE;
4821   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4822   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4823   enum tree_code code, orig_code, epilog_reduc_code;
4824   machine_mode vec_mode;
4825   int op_type;
4826   optab optab, reduc_optab;
4827   tree new_temp = NULL_TREE;
4828   tree def;
4829   gimple def_stmt;
4830   enum vect_def_type dt;
4831   gphi *new_phi = NULL;
4832   tree scalar_type;
4833   bool is_simple_use;
4834   gimple orig_stmt;
4835   stmt_vec_info orig_stmt_info;
4836   tree expr = NULL_TREE;
4837   int i;
4838   int ncopies;
4839   int epilog_copies;
4840   stmt_vec_info prev_stmt_info, prev_phi_info;
4841   bool single_defuse_cycle = false;
4842   tree reduc_def = NULL_TREE;
4843   gimple new_stmt = NULL;
4844   int j;
4845   tree ops[3];
4846   bool nested_cycle = false, found_nested_cycle_def = false;
4847   gimple reduc_def_stmt = NULL;
4848   /* The default is that the reduction variable is the last in statement.  */
4849   int reduc_index = 2;
4850   bool double_reduc = false, dummy;
4851   basic_block def_bb;
4852   struct loop * def_stmt_loop, *outer_loop = NULL;
4853   tree def_arg;
4854   gimple def_arg_stmt;
4855   auto_vec<tree> vec_oprnds0;
4856   auto_vec<tree> vec_oprnds1;
4857   auto_vec<tree> vect_defs;
4858   auto_vec<gimple> phis;
4859   int vec_num;
4860   tree def0, def1, tem, op0, op1 = NULL_TREE;
4861
4862   /* In case of reduction chain we switch to the first stmt in the chain, but
4863      we don't update STMT_INFO, since only the last stmt is marked as reduction
4864      and has reduction properties.  */
4865   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4866     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4867
4868   if (nested_in_vect_loop_p (loop, stmt))
4869     {
4870       outer_loop = loop;
4871       loop = loop->inner;
4872       nested_cycle = true;
4873     }
4874
4875   /* 1. Is vectorizable reduction?  */
4876   /* Not supportable if the reduction variable is used in the loop, unless
4877      it's a reduction chain.  */
4878   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4879       && !GROUP_FIRST_ELEMENT (stmt_info))
4880     return false;
4881
4882   /* Reductions that are not used even in an enclosing outer-loop,
4883      are expected to be "live" (used out of the loop).  */
4884   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4885       && !STMT_VINFO_LIVE_P (stmt_info))
4886     return false;
4887
4888   /* Make sure it was already recognized as a reduction computation.  */
4889   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4890       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4891     return false;
4892
4893   /* 2. Has this been recognized as a reduction pattern?
4894
4895      Check if STMT represents a pattern that has been recognized
4896      in earlier analysis stages.  For stmts that represent a pattern,
4897      the STMT_VINFO_RELATED_STMT field records the last stmt in
4898      the original sequence that constitutes the pattern.  */
4899
4900   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4901   if (orig_stmt)
4902     {
4903       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4904       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4905       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4906     }
4907
4908   /* 3. Check the operands of the operation.  The first operands are defined
4909         inside the loop body. The last operand is the reduction variable,
4910         which is defined by the loop-header-phi.  */
4911
4912   gcc_assert (is_gimple_assign (stmt));
4913
4914   /* Flatten RHS.  */
4915   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4916     {
4917     case GIMPLE_SINGLE_RHS:
4918       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4919       if (op_type == ternary_op)
4920         {
4921           tree rhs = gimple_assign_rhs1 (stmt);
4922           ops[0] = TREE_OPERAND (rhs, 0);
4923           ops[1] = TREE_OPERAND (rhs, 1);
4924           ops[2] = TREE_OPERAND (rhs, 2);
4925           code = TREE_CODE (rhs);
4926         }
4927       else
4928         return false;
4929       break;
4930
4931     case GIMPLE_BINARY_RHS:
4932       code = gimple_assign_rhs_code (stmt);
4933       op_type = TREE_CODE_LENGTH (code);
4934       gcc_assert (op_type == binary_op);
4935       ops[0] = gimple_assign_rhs1 (stmt);
4936       ops[1] = gimple_assign_rhs2 (stmt);
4937       break;
4938
4939     case GIMPLE_TERNARY_RHS:
4940       code = gimple_assign_rhs_code (stmt);
4941       op_type = TREE_CODE_LENGTH (code);
4942       gcc_assert (op_type == ternary_op);
4943       ops[0] = gimple_assign_rhs1 (stmt);
4944       ops[1] = gimple_assign_rhs2 (stmt);
4945       ops[2] = gimple_assign_rhs3 (stmt);
4946       break;
4947
4948     case GIMPLE_UNARY_RHS:
4949       return false;
4950
4951     default:
4952       gcc_unreachable ();
4953     }
4954
4955   if (code == COND_EXPR && slp_node)
4956     return false;
4957
4958   scalar_dest = gimple_assign_lhs (stmt);
4959   scalar_type = TREE_TYPE (scalar_dest);
4960   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4961       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4962     return false;
4963
4964   /* Do not try to vectorize bit-precision reductions.  */
4965   if ((TYPE_PRECISION (scalar_type)
4966        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4967     return false;
4968
4969   /* All uses but the last are expected to be defined in the loop.
4970      The last use is the reduction variable.  In case of nested cycle this
4971      assumption is not true: we use reduc_index to record the index of the
4972      reduction variable.  */
4973   for (i = 0; i < op_type - 1; i++)
4974     {
4975       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4976       if (i == 0 && code == COND_EXPR)
4977         continue;
4978
4979       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4980                                             &def_stmt, &def, &dt, &tem);
4981       if (!vectype_in)
4982         vectype_in = tem;
4983       gcc_assert (is_simple_use);
4984
4985       if (dt != vect_internal_def
4986           && dt != vect_external_def
4987           && dt != vect_constant_def
4988           && dt != vect_induction_def
4989           && !(dt == vect_nested_cycle && nested_cycle))
4990         return false;
4991
4992       if (dt == vect_nested_cycle)
4993         {
4994           found_nested_cycle_def = true;
4995           reduc_def_stmt = def_stmt;
4996           reduc_index = i;
4997         }
4998     }
4999
5000   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5001                                         &def_stmt, &def, &dt, &tem);
5002   if (!vectype_in)
5003     vectype_in = tem;
5004   gcc_assert (is_simple_use);
5005   if (!found_nested_cycle_def)
5006     reduc_def_stmt = def_stmt;
5007
5008   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5009     return false;
5010
5011   if (!(dt == vect_reduction_def
5012         || dt == vect_nested_cycle
5013         || ((dt == vect_internal_def || dt == vect_external_def
5014              || dt == vect_constant_def || dt == vect_induction_def)
5015             && nested_cycle && found_nested_cycle_def)))
5016     {
5017       /* For pattern recognized stmts, orig_stmt might be a reduction,
5018          but some helper statements for the pattern might not, or
5019          might be COND_EXPRs with reduction uses in the condition.  */
5020       gcc_assert (orig_stmt);
5021       return false;
5022     }
5023
5024   if (orig_stmt)
5025     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5026                                                        reduc_def_stmt,
5027                                                        !nested_cycle,
5028                                                        &dummy));
5029   else
5030     {
5031       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5032                                              !nested_cycle, &dummy);
5033       /* We changed STMT to be the first stmt in reduction chain, hence we
5034          check that in this case the first element in the chain is STMT.  */
5035       gcc_assert (stmt == tmp
5036                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5037     }
5038
5039   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5040     return false;
5041
5042   if (slp_node || PURE_SLP_STMT (stmt_info))
5043     ncopies = 1;
5044   else
5045     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5046                / TYPE_VECTOR_SUBPARTS (vectype_in));
5047
5048   gcc_assert (ncopies >= 1);
5049
5050   vec_mode = TYPE_MODE (vectype_in);
5051
5052   if (code == COND_EXPR)
5053     {
5054       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5055         {
5056           if (dump_enabled_p ())
5057             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5058                              "unsupported condition in reduction\n");
5059
5060             return false;
5061         }
5062     }
5063   else
5064     {
5065       /* 4. Supportable by target?  */
5066
5067       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5068           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5069         {
5070           /* Shifts and rotates are only supported by vectorizable_shifts,
5071              not vectorizable_reduction.  */
5072           if (dump_enabled_p ())
5073             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5074                              "unsupported shift or rotation.\n");
5075           return false;
5076         }
5077
5078       /* 4.1. check support for the operation in the loop  */
5079       optab = optab_for_tree_code (code, vectype_in, optab_default);
5080       if (!optab)
5081         {
5082           if (dump_enabled_p ())
5083             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5084                              "no optab.\n");
5085
5086           return false;
5087         }
5088
5089       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5090         {
5091           if (dump_enabled_p ())
5092             dump_printf (MSG_NOTE, "op not supported by target.\n");
5093
5094           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5095               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5096                   < vect_min_worthwhile_factor (code))
5097             return false;
5098
5099           if (dump_enabled_p ())
5100             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5101         }
5102
5103       /* Worthwhile without SIMD support?  */
5104       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5105           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5106              < vect_min_worthwhile_factor (code))
5107         {
5108           if (dump_enabled_p ())
5109             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5110                              "not worthwhile without SIMD support.\n");
5111
5112           return false;
5113         }
5114     }
5115
5116   /* 4.2. Check support for the epilog operation.
5117
5118           If STMT represents a reduction pattern, then the type of the
5119           reduction variable may be different than the type of the rest
5120           of the arguments.  For example, consider the case of accumulation
5121           of shorts into an int accumulator; The original code:
5122                         S1: int_a = (int) short_a;
5123           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5124
5125           was replaced with:
5126                         STMT: int_acc = widen_sum <short_a, int_acc>
5127
5128           This means that:
5129           1. The tree-code that is used to create the vector operation in the
5130              epilog code (that reduces the partial results) is not the
5131              tree-code of STMT, but is rather the tree-code of the original
5132              stmt from the pattern that STMT is replacing.  I.e, in the example
5133              above we want to use 'widen_sum' in the loop, but 'plus' in the
5134              epilog.
5135           2. The type (mode) we use to check available target support
5136              for the vector operation to be created in the *epilog*, is
5137              determined by the type of the reduction variable (in the example
5138              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5139              However the type (mode) we use to check available target support
5140              for the vector operation to be created *inside the loop*, is
5141              determined by the type of the other arguments to STMT (in the
5142              example we'd check this: optab_handler (widen_sum_optab,
5143              vect_short_mode)).
5144
5145           This is contrary to "regular" reductions, in which the types of all
5146           the arguments are the same as the type of the reduction variable.
5147           For "regular" reductions we can therefore use the same vector type
5148           (and also the same tree-code) when generating the epilog code and
5149           when generating the code inside the loop.  */
5150
5151   if (orig_stmt)
5152     {
5153       /* This is a reduction pattern: get the vectype from the type of the
5154          reduction variable, and get the tree-code from orig_stmt.  */
5155       orig_code = gimple_assign_rhs_code (orig_stmt);
5156       gcc_assert (vectype_out);
5157       vec_mode = TYPE_MODE (vectype_out);
5158     }
5159   else
5160     {
5161       /* Regular reduction: use the same vectype and tree-code as used for
5162          the vector code inside the loop can be used for the epilog code. */
5163       orig_code = code;
5164     }
5165
5166   if (nested_cycle)
5167     {
5168       def_bb = gimple_bb (reduc_def_stmt);
5169       def_stmt_loop = def_bb->loop_father;
5170       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5171                                        loop_preheader_edge (def_stmt_loop));
5172       if (TREE_CODE (def_arg) == SSA_NAME
5173           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5174           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5175           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5176           && vinfo_for_stmt (def_arg_stmt)
5177           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5178               == vect_double_reduction_def)
5179         double_reduc = true;
5180     }
5181
5182   epilog_reduc_code = ERROR_MARK;
5183   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5184     {
5185       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5186                                          optab_default);
5187       if (!reduc_optab)
5188         {
5189           if (dump_enabled_p ())
5190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5191                              "no optab for reduction.\n");
5192
5193           epilog_reduc_code = ERROR_MARK;
5194         }
5195       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5196         {
5197           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5198           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5199             {
5200               if (dump_enabled_p ())
5201                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5202                                  "reduc op not supported by target.\n");
5203
5204               epilog_reduc_code = ERROR_MARK;
5205             }
5206         }
5207     }
5208   else
5209     {
5210       if (!nested_cycle || double_reduc)
5211         {
5212           if (dump_enabled_p ())
5213             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5214                              "no reduc code for scalar code.\n");
5215
5216           return false;
5217         }
5218     }
5219
5220   if (double_reduc && ncopies > 1)
5221     {
5222       if (dump_enabled_p ())
5223         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5224                          "multiple types in double reduction\n");
5225
5226       return false;
5227     }
5228
5229   /* In case of widenning multiplication by a constant, we update the type
5230      of the constant to be the type of the other operand.  We check that the
5231      constant fits the type in the pattern recognition pass.  */
5232   if (code == DOT_PROD_EXPR
5233       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5234     {
5235       if (TREE_CODE (ops[0]) == INTEGER_CST)
5236         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5237       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5238         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5239       else
5240         {
5241           if (dump_enabled_p ())
5242             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5243                              "invalid types in dot-prod\n");
5244
5245           return false;
5246         }
5247     }
5248
5249   if (!vec_stmt) /* transformation not required.  */
5250     {
5251       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5252         return false;
5253       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5254       return true;
5255     }
5256
5257   /** Transform.  **/
5258
5259   if (dump_enabled_p ())
5260     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5261
5262   /* FORNOW: Multiple types are not supported for condition.  */
5263   if (code == COND_EXPR)
5264     gcc_assert (ncopies == 1);
5265
5266   /* Create the destination vector  */
5267   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5268
5269   /* In case the vectorization factor (VF) is bigger than the number
5270      of elements that we can fit in a vectype (nunits), we have to generate
5271      more than one vector stmt - i.e - we need to "unroll" the
5272      vector stmt by a factor VF/nunits.  For more details see documentation
5273      in vectorizable_operation.  */
5274
5275   /* If the reduction is used in an outer loop we need to generate
5276      VF intermediate results, like so (e.g. for ncopies=2):
5277         r0 = phi (init, r0)
5278         r1 = phi (init, r1)
5279         r0 = x0 + r0;
5280         r1 = x1 + r1;
5281     (i.e. we generate VF results in 2 registers).
5282     In this case we have a separate def-use cycle for each copy, and therefore
5283     for each copy we get the vector def for the reduction variable from the
5284     respective phi node created for this copy.
5285
5286     Otherwise (the reduction is unused in the loop nest), we can combine
5287     together intermediate results, like so (e.g. for ncopies=2):
5288         r = phi (init, r)
5289         r = x0 + r;
5290         r = x1 + r;
5291    (i.e. we generate VF/2 results in a single register).
5292    In this case for each copy we get the vector def for the reduction variable
5293    from the vectorized reduction operation generated in the previous iteration.
5294   */
5295
5296   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5297     {
5298       single_defuse_cycle = true;
5299       epilog_copies = 1;
5300     }
5301   else
5302     epilog_copies = ncopies;
5303
5304   prev_stmt_info = NULL;
5305   prev_phi_info = NULL;
5306   if (slp_node)
5307     {
5308       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5309       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5310                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5311     }
5312   else
5313     {
5314       vec_num = 1;
5315       vec_oprnds0.create (1);
5316       if (op_type == ternary_op)
5317         vec_oprnds1.create (1);
5318     }
5319
5320   phis.create (vec_num);
5321   vect_defs.create (vec_num);
5322   if (!slp_node)
5323     vect_defs.quick_push (NULL_TREE);
5324
5325   for (j = 0; j < ncopies; j++)
5326     {
5327       if (j == 0 || !single_defuse_cycle)
5328         {
5329           for (i = 0; i < vec_num; i++)
5330             {
5331               /* Create the reduction-phi that defines the reduction
5332                  operand.  */
5333               new_phi = create_phi_node (vec_dest, loop->header);
5334               set_vinfo_for_stmt (new_phi,
5335                                   new_stmt_vec_info (new_phi, loop_vinfo,
5336                                                      NULL));
5337                if (j == 0 || slp_node)
5338                  phis.quick_push (new_phi);
5339             }
5340         }
5341
5342       if (code == COND_EXPR)
5343         {
5344           gcc_assert (!slp_node);
5345           vectorizable_condition (stmt, gsi, vec_stmt,
5346                                   PHI_RESULT (phis[0]),
5347                                   reduc_index, NULL);
5348           /* Multiple types are not supported for condition.  */
5349           break;
5350         }
5351
5352       /* Handle uses.  */
5353       if (j == 0)
5354         {
5355           op0 = ops[!reduc_index];
5356           if (op_type == ternary_op)
5357             {
5358               if (reduc_index == 0)
5359                 op1 = ops[2];
5360               else
5361                 op1 = ops[1];
5362             }
5363
5364           if (slp_node)
5365             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5366                                slp_node, -1);
5367           else
5368             {
5369               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5370                                                             stmt, NULL);
5371               vec_oprnds0.quick_push (loop_vec_def0);
5372               if (op_type == ternary_op)
5373                {
5374                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5375                                                                NULL);
5376                  vec_oprnds1.quick_push (loop_vec_def1);
5377                }
5378             }
5379         }
5380       else
5381         {
5382           if (!slp_node)
5383             {
5384               enum vect_def_type dt;
5385               gimple dummy_stmt;
5386               tree dummy;
5387
5388               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5389                                   &dummy_stmt, &dummy, &dt);
5390               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5391                                                               loop_vec_def0);
5392               vec_oprnds0[0] = loop_vec_def0;
5393               if (op_type == ternary_op)
5394                 {
5395                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5396                                       &dummy, &dt);
5397                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5398                                                                 loop_vec_def1);
5399                   vec_oprnds1[0] = loop_vec_def1;
5400                 }
5401             }
5402
5403           if (single_defuse_cycle)
5404             reduc_def = gimple_assign_lhs (new_stmt);
5405
5406           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5407         }
5408
5409       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5410         {
5411           if (slp_node)
5412             reduc_def = PHI_RESULT (phis[i]);
5413           else
5414             {
5415               if (!single_defuse_cycle || j == 0)
5416                 reduc_def = PHI_RESULT (new_phi);
5417             }
5418
5419           def1 = ((op_type == ternary_op)
5420                   ? vec_oprnds1[i] : NULL);
5421           if (op_type == binary_op)
5422             {
5423               if (reduc_index == 0)
5424                 expr = build2 (code, vectype_out, reduc_def, def0);
5425               else
5426                 expr = build2 (code, vectype_out, def0, reduc_def);
5427             }
5428           else
5429             {
5430               if (reduc_index == 0)
5431                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5432               else
5433                 {
5434                   if (reduc_index == 1)
5435                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5436                   else
5437                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5438                 }
5439             }
5440
5441           new_stmt = gimple_build_assign (vec_dest, expr);
5442           new_temp = make_ssa_name (vec_dest, new_stmt);
5443           gimple_assign_set_lhs (new_stmt, new_temp);
5444           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5445
5446           if (slp_node)
5447             {
5448               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5449               vect_defs.quick_push (new_temp);
5450             }
5451           else
5452             vect_defs[0] = new_temp;
5453         }
5454
5455       if (slp_node)
5456         continue;
5457
5458       if (j == 0)
5459         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5460       else
5461         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5462
5463       prev_stmt_info = vinfo_for_stmt (new_stmt);
5464       prev_phi_info = vinfo_for_stmt (new_phi);
5465     }
5466
5467   /* Finalize the reduction-phi (set its arguments) and create the
5468      epilog reduction code.  */
5469   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5470     {
5471       new_temp = gimple_assign_lhs (*vec_stmt);
5472       vect_defs[0] = new_temp;
5473     }
5474
5475   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5476                                     epilog_reduc_code, phis, reduc_index,
5477                                     double_reduc, slp_node);
5478
5479   return true;
5480 }
5481
5482 /* Function vect_min_worthwhile_factor.
5483
5484    For a loop where we could vectorize the operation indicated by CODE,
5485    return the minimum vectorization factor that makes it worthwhile
5486    to use generic vectors.  */
5487 int
5488 vect_min_worthwhile_factor (enum tree_code code)
5489 {
5490   switch (code)
5491     {
5492     case PLUS_EXPR:
5493     case MINUS_EXPR:
5494     case NEGATE_EXPR:
5495       return 4;
5496
5497     case BIT_AND_EXPR:
5498     case BIT_IOR_EXPR:
5499     case BIT_XOR_EXPR:
5500     case BIT_NOT_EXPR:
5501       return 2;
5502
5503     default:
5504       return INT_MAX;
5505     }
5506 }
5507
5508
5509 /* Function vectorizable_induction
5510
5511    Check if PHI performs an induction computation that can be vectorized.
5512    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5513    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5514    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5515
5516 bool
5517 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5518                         gimple *vec_stmt)
5519 {
5520   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5521   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5522   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5523   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5524   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5525   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5526   tree vec_def;
5527
5528   gcc_assert (ncopies >= 1);
5529   /* FORNOW. These restrictions should be relaxed.  */
5530   if (nested_in_vect_loop_p (loop, phi))
5531     {
5532       imm_use_iterator imm_iter;
5533       use_operand_p use_p;
5534       gimple exit_phi;
5535       edge latch_e;
5536       tree loop_arg;
5537
5538       if (ncopies > 1)
5539         {
5540           if (dump_enabled_p ())
5541             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5542                              "multiple types in nested loop.\n");
5543           return false;
5544         }
5545
5546       exit_phi = NULL;
5547       latch_e = loop_latch_edge (loop->inner);
5548       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5549       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5550         {
5551           gimple use_stmt = USE_STMT (use_p);
5552           if (is_gimple_debug (use_stmt))
5553             continue;
5554
5555           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5556             {
5557               exit_phi = use_stmt;
5558               break;
5559             }
5560         }
5561       if (exit_phi)
5562         {
5563           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5564           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5565                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5566             {
5567               if (dump_enabled_p ())
5568                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5569                                  "inner-loop induction only used outside "
5570                                  "of the outer vectorized loop.\n");
5571               return false;
5572             }
5573         }
5574     }
5575
5576   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5577     return false;
5578
5579   /* FORNOW: SLP not supported.  */
5580   if (STMT_SLP_TYPE (stmt_info))
5581     return false;
5582
5583   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5584
5585   if (gimple_code (phi) != GIMPLE_PHI)
5586     return false;
5587
5588   if (!vec_stmt) /* transformation not required.  */
5589     {
5590       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5591       if (dump_enabled_p ())
5592         dump_printf_loc (MSG_NOTE, vect_location,
5593                          "=== vectorizable_induction ===\n");
5594       vect_model_induction_cost (stmt_info, ncopies);
5595       return true;
5596     }
5597
5598   /** Transform.  **/
5599
5600   if (dump_enabled_p ())
5601     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5602
5603   vec_def = get_initial_def_for_induction (phi);
5604   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5605   return true;
5606 }
5607
5608 /* Function vectorizable_live_operation.
5609
5610    STMT computes a value that is used outside the loop.  Check if
5611    it can be supported.  */
5612
5613 bool
5614 vectorizable_live_operation (gimple stmt,
5615                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5616                              gimple *vec_stmt)
5617 {
5618   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5619   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5620   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5621   int i;
5622   int op_type;
5623   tree op;
5624   tree def;
5625   gimple def_stmt;
5626   enum vect_def_type dt;
5627   enum tree_code code;
5628   enum gimple_rhs_class rhs_class;
5629
5630   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5631
5632   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5633     return false;
5634
5635   if (!is_gimple_assign (stmt))
5636     {
5637       if (gimple_call_internal_p (stmt)
5638           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5639           && gimple_call_lhs (stmt)
5640           && loop->simduid
5641           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5642           && loop->simduid
5643              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5644         {
5645           edge e = single_exit (loop);
5646           basic_block merge_bb = e->dest;
5647           imm_use_iterator imm_iter;
5648           use_operand_p use_p;
5649           tree lhs = gimple_call_lhs (stmt);
5650
5651           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5652             {
5653               gimple use_stmt = USE_STMT (use_p);
5654               if (gimple_code (use_stmt) == GIMPLE_PHI
5655                   && gimple_bb (use_stmt) == merge_bb)
5656                 {
5657                   if (vec_stmt)
5658                     {
5659                       tree vfm1
5660                         = build_int_cst (unsigned_type_node,
5661                                          loop_vinfo->vectorization_factor - 1);
5662                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5663                     }
5664                   return true;
5665                 }
5666             }
5667         }
5668
5669       return false;
5670     }
5671
5672   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5673     return false;
5674
5675   /* FORNOW. CHECKME. */
5676   if (nested_in_vect_loop_p (loop, stmt))
5677     return false;
5678
5679   code = gimple_assign_rhs_code (stmt);
5680   op_type = TREE_CODE_LENGTH (code);
5681   rhs_class = get_gimple_rhs_class (code);
5682   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5683   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5684
5685   /* FORNOW: support only if all uses are invariant.  This means
5686      that the scalar operations can remain in place, unvectorized.
5687      The original last scalar value that they compute will be used.  */
5688
5689   for (i = 0; i < op_type; i++)
5690     {
5691       if (rhs_class == GIMPLE_SINGLE_RHS)
5692         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5693       else
5694         op = gimple_op (stmt, i + 1);
5695       if (op
5696           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5697                                   &dt))
5698         {
5699           if (dump_enabled_p ())
5700             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5701                              "use not simple.\n");
5702           return false;
5703         }
5704
5705       if (dt != vect_external_def && dt != vect_constant_def)
5706         return false;
5707     }
5708
5709   /* No transformation is required for the cases we currently support.  */
5710   return true;
5711 }
5712
5713 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5714
5715 static void
5716 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5717 {
5718   ssa_op_iter op_iter;
5719   imm_use_iterator imm_iter;
5720   def_operand_p def_p;
5721   gimple ustmt;
5722
5723   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5724     {
5725       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5726         {
5727           basic_block bb;
5728
5729           if (!is_gimple_debug (ustmt))
5730             continue;
5731
5732           bb = gimple_bb (ustmt);
5733
5734           if (!flow_bb_inside_loop_p (loop, bb))
5735             {
5736               if (gimple_debug_bind_p (ustmt))
5737                 {
5738                   if (dump_enabled_p ())
5739                     dump_printf_loc (MSG_NOTE, vect_location,
5740                                      "killing debug use\n");
5741
5742                   gimple_debug_bind_reset_value (ustmt);
5743                   update_stmt (ustmt);
5744                 }
5745               else
5746                 gcc_unreachable ();
5747             }
5748         }
5749     }
5750 }
5751
5752
5753 /* This function builds ni_name = number of iterations.  Statements
5754    are emitted on the loop preheader edge.  */
5755
5756 static tree
5757 vect_build_loop_niters (loop_vec_info loop_vinfo)
5758 {
5759   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5760   if (TREE_CODE (ni) == INTEGER_CST)
5761     return ni;
5762   else
5763     {
5764       tree ni_name, var;
5765       gimple_seq stmts = NULL;
5766       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5767
5768       var = create_tmp_var (TREE_TYPE (ni), "niters");
5769       ni_name = force_gimple_operand (ni, &stmts, false, var);
5770       if (stmts)
5771         gsi_insert_seq_on_edge_immediate (pe, stmts);
5772
5773       return ni_name;
5774     }
5775 }
5776
5777
5778 /* This function generates the following statements:
5779
5780    ni_name = number of iterations loop executes
5781    ratio = ni_name / vf
5782    ratio_mult_vf_name = ratio * vf
5783
5784    and places them on the loop preheader edge.  */
5785
5786 static void
5787 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5788                                  tree ni_name,
5789                                  tree *ratio_mult_vf_name_ptr,
5790                                  tree *ratio_name_ptr)
5791 {
5792   tree ni_minus_gap_name;
5793   tree var;
5794   tree ratio_name;
5795   tree ratio_mult_vf_name;
5796   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5797   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5798   tree log_vf;
5799
5800   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5801
5802   /* If epilogue loop is required because of data accesses with gaps, we
5803      subtract one iteration from the total number of iterations here for
5804      correct calculation of RATIO.  */
5805   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5806     {
5807       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5808                                        ni_name,
5809                                        build_one_cst (TREE_TYPE (ni_name)));
5810       if (!is_gimple_val (ni_minus_gap_name))
5811         {
5812           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5813           gimple stmts = NULL;
5814           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5815                                                     true, var);
5816           gsi_insert_seq_on_edge_immediate (pe, stmts);
5817         }
5818     }
5819   else
5820     ni_minus_gap_name = ni_name;
5821
5822   /* Create: ratio = ni >> log2(vf) */
5823   /* ???  As we have ni == number of latch executions + 1, ni could
5824      have overflown to zero.  So avoid computing ratio based on ni
5825      but compute it using the fact that we know ratio will be at least
5826      one, thus via (ni - vf) >> log2(vf) + 1.  */
5827   ratio_name
5828     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5829                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5830                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5831                                              ni_minus_gap_name,
5832                                              build_int_cst
5833                                                (TREE_TYPE (ni_name), vf)),
5834                                 log_vf),
5835                    build_int_cst (TREE_TYPE (ni_name), 1));
5836   if (!is_gimple_val (ratio_name))
5837     {
5838       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5839       gimple stmts = NULL;
5840       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5841       gsi_insert_seq_on_edge_immediate (pe, stmts);
5842     }
5843   *ratio_name_ptr = ratio_name;
5844
5845   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5846
5847   if (ratio_mult_vf_name_ptr)
5848     {
5849       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5850                                         ratio_name, log_vf);
5851       if (!is_gimple_val (ratio_mult_vf_name))
5852         {
5853           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5854           gimple stmts = NULL;
5855           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5856                                                      true, var);
5857           gsi_insert_seq_on_edge_immediate (pe, stmts);
5858         }
5859       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5860     }
5861
5862   return;
5863 }
5864
5865
5866 /* Function vect_transform_loop.
5867
5868    The analysis phase has determined that the loop is vectorizable.
5869    Vectorize the loop - created vectorized stmts to replace the scalar
5870    stmts in the loop, and update the loop exit condition.  */
5871
5872 void
5873 vect_transform_loop (loop_vec_info loop_vinfo)
5874 {
5875   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5876   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5877   int nbbs = loop->num_nodes;
5878   int i;
5879   tree ratio = NULL;
5880   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5881   bool grouped_store;
5882   bool slp_scheduled = false;
5883   gimple stmt, pattern_stmt;
5884   gimple_seq pattern_def_seq = NULL;
5885   gimple_stmt_iterator pattern_def_si = gsi_none ();
5886   bool transform_pattern_stmt = false;
5887   bool check_profitability = false;
5888   int th;
5889   /* Record number of iterations before we started tampering with the profile. */
5890   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5891
5892   if (dump_enabled_p ())
5893     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5894
5895   /* If profile is inprecise, we have chance to fix it up.  */
5896   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5897     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5898
5899   /* Use the more conservative vectorization threshold.  If the number
5900      of iterations is constant assume the cost check has been performed
5901      by our caller.  If the threshold makes all loops profitable that
5902      run at least the vectorization factor number of times checking
5903      is pointless, too.  */
5904   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5905   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5906       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5907     {
5908       if (dump_enabled_p ())
5909         dump_printf_loc (MSG_NOTE, vect_location,
5910                          "Profitability threshold is %d loop iterations.\n",
5911                          th);
5912       check_profitability = true;
5913     }
5914
5915   /* Version the loop first, if required, so the profitability check
5916      comes first.  */
5917
5918   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5919       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5920     {
5921       vect_loop_versioning (loop_vinfo, th, check_profitability);
5922       check_profitability = false;
5923     }
5924
5925   tree ni_name = vect_build_loop_niters (loop_vinfo);
5926   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5927
5928   /* Peel the loop if there are data refs with unknown alignment.
5929      Only one data ref with unknown store is allowed.  */
5930
5931   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5932     {
5933       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5934                                      th, check_profitability);
5935       check_profitability = false;
5936       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5937          be re-computed.  */
5938       ni_name = NULL_TREE;
5939     }
5940
5941   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5942      compile time constant), or it is a constant that doesn't divide by the
5943      vectorization factor, then an epilog loop needs to be created.
5944      We therefore duplicate the loop: the original loop will be vectorized,
5945      and will compute the first (n/VF) iterations.  The second copy of the loop
5946      will remain scalar and will compute the remaining (n%VF) iterations.
5947      (VF is the vectorization factor).  */
5948
5949   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5950       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5951     {
5952       tree ratio_mult_vf;
5953       if (!ni_name)
5954         ni_name = vect_build_loop_niters (loop_vinfo);
5955       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5956                                        &ratio);
5957       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5958                                       th, check_profitability);
5959     }
5960   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5961     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5962                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5963   else
5964     {
5965       if (!ni_name)
5966         ni_name = vect_build_loop_niters (loop_vinfo);
5967       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5968     }
5969
5970   /* 1) Make sure the loop header has exactly two entries
5971      2) Make sure we have a preheader basic block.  */
5972
5973   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5974
5975   split_edge (loop_preheader_edge (loop));
5976
5977   /* FORNOW: the vectorizer supports only loops which body consist
5978      of one basic block (header + empty latch). When the vectorizer will
5979      support more involved loop forms, the order by which the BBs are
5980      traversed need to be reconsidered.  */
5981
5982   for (i = 0; i < nbbs; i++)
5983     {
5984       basic_block bb = bbs[i];
5985       stmt_vec_info stmt_info;
5986
5987       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5988            gsi_next (&si))
5989         {
5990           gphi *phi = si.phi ();
5991           if (dump_enabled_p ())
5992             {
5993               dump_printf_loc (MSG_NOTE, vect_location,
5994                                "------>vectorizing phi: ");
5995               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5996               dump_printf (MSG_NOTE, "\n");
5997             }
5998           stmt_info = vinfo_for_stmt (phi);
5999           if (!stmt_info)
6000             continue;
6001
6002           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6003             vect_loop_kill_debug_uses (loop, phi);
6004
6005           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6006               && !STMT_VINFO_LIVE_P (stmt_info))
6007             continue;
6008
6009           if (STMT_VINFO_VECTYPE (stmt_info)
6010               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6011                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6012               && dump_enabled_p ())
6013             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6014
6015           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6016             {
6017               if (dump_enabled_p ())
6018                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6019               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6020             }
6021         }
6022
6023       pattern_stmt = NULL;
6024       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6025            !gsi_end_p (si) || transform_pattern_stmt;)
6026         {
6027           bool is_store;
6028
6029           if (transform_pattern_stmt)
6030             stmt = pattern_stmt;
6031           else
6032             {
6033               stmt = gsi_stmt (si);
6034               /* During vectorization remove existing clobber stmts.  */
6035               if (gimple_clobber_p (stmt))
6036                 {
6037                   unlink_stmt_vdef (stmt);
6038                   gsi_remove (&si, true);
6039                   release_defs (stmt);
6040                   continue;
6041                 }
6042             }
6043
6044           if (dump_enabled_p ())
6045             {
6046               dump_printf_loc (MSG_NOTE, vect_location,
6047                                "------>vectorizing statement: ");
6048               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6049               dump_printf (MSG_NOTE, "\n");
6050             }
6051
6052           stmt_info = vinfo_for_stmt (stmt);
6053
6054           /* vector stmts created in the outer-loop during vectorization of
6055              stmts in an inner-loop may not have a stmt_info, and do not
6056              need to be vectorized.  */
6057           if (!stmt_info)
6058             {
6059               gsi_next (&si);
6060               continue;
6061             }
6062
6063           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6064             vect_loop_kill_debug_uses (loop, stmt);
6065
6066           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6067               && !STMT_VINFO_LIVE_P (stmt_info))
6068             {
6069               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6070                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6071                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6072                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6073                 {
6074                   stmt = pattern_stmt;
6075                   stmt_info = vinfo_for_stmt (stmt);
6076                 }
6077               else
6078                 {
6079                   gsi_next (&si);
6080                   continue;
6081                 }
6082             }
6083           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6084                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6085                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6086                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6087             transform_pattern_stmt = true;
6088
6089           /* If pattern statement has def stmts, vectorize them too.  */
6090           if (is_pattern_stmt_p (stmt_info))
6091             {
6092               if (pattern_def_seq == NULL)
6093                 {
6094                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6095                   pattern_def_si = gsi_start (pattern_def_seq);
6096                 }
6097               else if (!gsi_end_p (pattern_def_si))
6098                 gsi_next (&pattern_def_si);
6099               if (pattern_def_seq != NULL)
6100                 {
6101                   gimple pattern_def_stmt = NULL;
6102                   stmt_vec_info pattern_def_stmt_info = NULL;
6103
6104                   while (!gsi_end_p (pattern_def_si))
6105                     {
6106                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6107                       pattern_def_stmt_info
6108                         = vinfo_for_stmt (pattern_def_stmt);
6109                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6110                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6111                         break;
6112                       gsi_next (&pattern_def_si);
6113                     }
6114
6115                   if (!gsi_end_p (pattern_def_si))
6116                     {
6117                       if (dump_enabled_p ())
6118                         {
6119                           dump_printf_loc (MSG_NOTE, vect_location,
6120                                            "==> vectorizing pattern def "
6121                                            "stmt: ");
6122                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6123                                             pattern_def_stmt, 0);
6124                           dump_printf (MSG_NOTE, "\n");
6125                         }
6126
6127                       stmt = pattern_def_stmt;
6128                       stmt_info = pattern_def_stmt_info;
6129                     }
6130                   else
6131                     {
6132                       pattern_def_si = gsi_none ();
6133                       transform_pattern_stmt = false;
6134                     }
6135                 }
6136               else
6137                 transform_pattern_stmt = false;
6138             }
6139
6140           if (STMT_VINFO_VECTYPE (stmt_info))
6141             {
6142               unsigned int nunits
6143                 = (unsigned int)
6144                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6145               if (!STMT_SLP_TYPE (stmt_info)
6146                   && nunits != (unsigned int) vectorization_factor
6147                   && dump_enabled_p ())
6148                   /* For SLP VF is set according to unrolling factor, and not
6149                      to vector size, hence for SLP this print is not valid.  */
6150                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6151             }
6152
6153           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6154              reached.  */
6155           if (STMT_SLP_TYPE (stmt_info))
6156             {
6157               if (!slp_scheduled)
6158                 {
6159                   slp_scheduled = true;
6160
6161                   if (dump_enabled_p ())
6162                     dump_printf_loc (MSG_NOTE, vect_location,
6163                                      "=== scheduling SLP instances ===\n");
6164
6165                   vect_schedule_slp (loop_vinfo, NULL);
6166                 }
6167
6168               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6169               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6170                 {
6171                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6172                     {
6173                       pattern_def_seq = NULL;
6174                       gsi_next (&si);
6175                     }
6176                   continue;
6177                 }
6178             }
6179
6180           /* -------- vectorize statement ------------ */
6181           if (dump_enabled_p ())
6182             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6183
6184           grouped_store = false;
6185           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6186           if (is_store)
6187             {
6188               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6189                 {
6190                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6191                      interleaving chain was completed - free all the stores in
6192                      the chain.  */
6193                   gsi_next (&si);
6194                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6195                 }
6196               else
6197                 {
6198                   /* Free the attached stmt_vec_info and remove the stmt.  */
6199                   gimple store = gsi_stmt (si);
6200                   free_stmt_vec_info (store);
6201                   unlink_stmt_vdef (store);
6202                   gsi_remove (&si, true);
6203                   release_defs (store);
6204                 }
6205
6206               /* Stores can only appear at the end of pattern statements.  */
6207               gcc_assert (!transform_pattern_stmt);
6208               pattern_def_seq = NULL;
6209             }
6210           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6211             {
6212               pattern_def_seq = NULL;
6213               gsi_next (&si);
6214             }
6215         }                       /* stmts in BB */
6216     }                           /* BBs in loop */
6217
6218   slpeel_make_loop_iterate_ntimes (loop, ratio);
6219
6220   /* Reduce loop iterations by the vectorization factor.  */
6221   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6222                       expected_iterations / vectorization_factor);
6223   loop->nb_iterations_upper_bound
6224     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6225   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6226       && loop->nb_iterations_upper_bound != 0)
6227     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6228   if (loop->any_estimate)
6229     {
6230       loop->nb_iterations_estimate
6231         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6232        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6233            && loop->nb_iterations_estimate != 0)
6234          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6235     }
6236
6237   if (dump_enabled_p ())
6238     {
6239       dump_printf_loc (MSG_NOTE, vect_location,
6240                        "LOOP VECTORIZED\n");
6241       if (loop->inner)
6242         dump_printf_loc (MSG_NOTE, vect_location,
6243                          "OUTER LOOP VECTORIZED\n");
6244       dump_printf (MSG_NOTE, "\n");
6245     }
6246 }