gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "hash-set.h"
  28 #include "machmode.h"
  29 #include "vec.h"
  30 #include "double-int.h"
  31 #include "input.h"
  32 #include "alias.h"
  33 #include "symtab.h"
  34 #include "wide-int.h"
  35 #include "inchash.h"
  36 #include "tree.h"
  37 #include "fold-const.h"
  38 #include "stor-layout.h"
  39 #include "predict.h"
  40 #include "hard-reg-set.h"
  41 #include "function.h"
  42 #include "dominance.h"
  43 #include "cfg.h"
  44 #include "cfganal.h"
  45 #include "basic-block.h"
  46 #include "gimple-pretty-print.h"
  47 #include "tree-ssa-alias.h"
  48 #include "internal-fn.h"
  49 #include "gimple-expr.h"
  50 #include "is-a.h"
  51 #include "gimple.h"
  52 #include "gimplify.h"
  53 #include "gimple-iterator.h"
  54 #include "gimplify-me.h"
  55 #include "gimple-ssa.h"
  56 #include "tree-phinodes.h"
  57 #include "ssa-iterators.h"
  58 #include "stringpool.h"
  59 #include "tree-ssanames.h"
  60 #include "tree-ssa-loop-ivopts.h"
  61 #include "tree-ssa-loop-manip.h"
  62 #include "tree-ssa-loop-niter.h"
  63 #include "tree-pass.h"
  64 #include "cfgloop.h"
  65 #include "hashtab.h"
  66 #include "rtl.h"
  67 #include "flags.h"
  68 #include "statistics.h"
  69 #include "real.h"
  70 #include "fixed-value.h"
  71 #include "insn-config.h"
  72 #include "expmed.h"
  73 #include "dojump.h"
  74 #include "explow.h"
  75 #include "calls.h"
  76 #include "emit-rtl.h"
  77 #include "varasm.h"
  78 #include "stmt.h"
  79 #include "expr.h"
  80 #include "recog.h"
  81 #include "insn-codes.h"
  82 #include "optabs.h"
  83 #include "params.h"
  84 #include "diagnostic-core.h"
  85 #include "tree-chrec.h"
  86 #include "tree-scalar-evolution.h"
  87 #include "tree-vectorizer.h"
  88 #include "target.h"
  89
  90 /* Loop Vectorization Pass.
  91
  92    This pass tries to vectorize loops.
  93
  94    For example, the vectorizer transforms the following simple loop:
  95
  96         short a[N]; short b[N]; short c[N]; int i;
  97
  98         for (i=0; i<N; i++){
  99           a[i] = b[i] + c[i];
 100         }
 101
 102    as if it was manually vectorized by rewriting the source code into:
 103
 104         typedef int __attribute__((mode(V8HI))) v8hi;
 105         short a[N];  short b[N]; short c[N];   int i;
 106         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
 107         v8hi va, vb, vc;
 108
 109         for (i=0; i<N/8; i++){
 110           vb = pb[i];
 111           vc = pc[i];
 112           va = vb + vc;
 113           pa[i] = va;
 114         }
 115
 116         The main entry to this pass is vectorize_loops(), in which
 117    the vectorizer applies a set of analyses on a given set of loops,
 118    followed by the actual vectorization transformation for the loops that
 119    had successfully passed the analysis phase.
 120         Throughout this pass we make a distinction between two types of
 121    data: scalars (which are represented by SSA_NAMES), and memory references
 122    ("data-refs").  These two types of data require different handling both
  123    during analysis and transformation.  The types of data-refs that the
  124    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
  125    (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
  126    accesses are required to have a simple (consecutive) access pattern.
 127
 128    Analysis phase:
 129    ===============
 130         The driver for the analysis phase is vect_analyze_loop().
 131    It applies a set of analyses, some of which rely on the scalar evolution
 132    analyzer (scev) developed by Sebastian Pop.
 133
 134         During the analysis phase the vectorizer records some information
 135    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 136    loop, as well as general information about the loop as a whole, which is
 137    recorded in a "loop_vec_info" struct attached to each loop.
 138
 139    Transformation phase:
 140    =====================
 141         The loop transformation phase scans all the stmts in the loop, and
 142    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 143    the loop that needs to be vectorized.  It inserts the vector code sequence
 144    just before the scalar stmt S, and records a pointer to the vector code
 145    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 146    attached to S).  This pointer will be used for the vectorization of following
 147    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 148    otherwise, we rely on dead code elimination for removing it.
 149
 150         For example, say stmt S1 was vectorized into stmt VS1:
 151
 152    VS1: vb = px[i];
 153    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 154    S2:  a = b;
 155
 156    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 157    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 158    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 159    resulting sequence would be:
 160
 161    VS1: vb = px[i];
 162    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 163    VS2: va = vb;
 164    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 165
  166         Operands that are not SSA_NAMEs are data-refs that appear in
 167    load/store operations (like 'x[i]' in S1), and are handled differently.
 168
 169    Target modeling:
 170    =================
 171         Currently the only target specific information that is used is the
 172    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 173    Targets that can support different sizes of vectors, for now will need
 174    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 175    flexibility will be added in the future.
 176
  177         Since we only vectorize operations whose vector form can be
  178    expressed using existing tree codes, to verify that an operation is
  179    supported, the vectorizer checks the relevant optab at the relevant
  180    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 181    the value found is CODE_FOR_nothing, then there's no target support, and
 182    we can't vectorize the stmt.
 183
 184    For additional information on this project see:
 185    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 186 */
 187
 188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 189
  190 /* Function vect_determine_vectorization_factor
  191
  192    Determine the vectorization factor (VF).  VF is the number of data elements
  193    that are operated upon in parallel in a single iteration of the vectorized
  194    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  195    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  196    elements can fit in a single vector register.
  197
  198    VF is the largest TYPE_VECTOR_SUBPARTS among the vector types chosen for
  199    the relevant PHIs and the relevant or live stmts of the loop; on success it
  200    is stored in LOOP_VINFO_VECT_FACTOR and true is returned.  False is returned
  201    when a stmt cannot be handled, e.g. if its vector types differ in mode size.
  202
  203    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  204    original loop:
  205         for (i=0; i<N; i++){
  206           a[i] = b[i] + c[i];
  207         }
  208
  209    vectorized loop:
  210         for (i=0; i<N; i+=VF){
  211           a[i:VF] = b[i:VF] + c[i:VF];
  212         }
  213 */
  214
  215 static bool
  216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  217 {
  218   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  219   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  220   int nbbs = loop->num_nodes;
        /* Running maximum of the units per vector seen so far; 0 until the
           first PHI or stmt contributes.  */
  221   unsigned int vectorization_factor = 0;
  222   tree scalar_type;
  223   gphi *phi;
  224   tree vectype;
  225   unsigned int nunits;
  226   stmt_vec_info stmt_info;
  227   int i;
        /* Receives the two outputs of vect_get_smallest_scalar_type; never
           read in this function.  */
  228   HOST_WIDE_INT dummy;
  229   gimple stmt, pattern_stmt = NULL;
        /* State of the walk over the stmt loop below.  PATTERN_DEF_SEQ and
           PATTERN_DEF_SI track the def sequence of the pattern stmt being
           visited; ANALYZE_PATTERN_STMT requests that PATTERN_STMT be
           analyzed on the next iteration instead of the stmt at SI.  */
  230   gimple_seq pattern_def_seq = NULL;
  231   gimple_stmt_iterator pattern_def_si = gsi_none ();
  232   bool analyze_pattern_stmt = false;
  233
  234   if (dump_enabled_p ())
  235     dump_printf_loc (MSG_NOTE, vect_location,
  236                      "=== vect_determine_vectorization_factor ===\n");
  237
  238   for (i = 0; i < nbbs; i++)
  239     {
  240       basic_block bb = bbs[i];
  241
            /* PHIs: every relevant PHI gets the vectype of its result type
               and contributes that vectype's number of units to VF.  */
  242       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
  243            gsi_next (&si))
  244         {
  245           phi = si.phi ();
  246           stmt_info = vinfo_for_stmt (phi);
  247           if (dump_enabled_p ())
  248             {
  249               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  250               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  251               dump_printf (MSG_NOTE, "\n");
  252             }
  253
  254           gcc_assert (stmt_info);
  255
  256           if (STMT_VINFO_RELEVANT_P (stmt_info))
  257             {
  258               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  259               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  260
  261               if (dump_enabled_p ())
  262                 {
  263                   dump_printf_loc (MSG_NOTE, vect_location,
  264                                    "get vectype for scalar type:  ");
  265                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  266                   dump_printf (MSG_NOTE, "\n");
  267                 }
  268
  269               vectype = get_vectype_for_scalar_type (scalar_type);
  270               if (!vectype)
  271                 {
  272                   if (dump_enabled_p ())
  273                     {
  274                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  275                                        "not vectorized: unsupported "
  276                                        "data-type ");
  277                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  278                                          scalar_type);
  279                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  280                     }
  281                   return false;
  282                 }
  283               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  284
  285               if (dump_enabled_p ())
  286                 {
  287                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  288                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  289                   dump_printf (MSG_NOTE, "\n");
  290                 }
  291
  292               nunits = TYPE_VECTOR_SUBPARTS (vectype);
  293               if (dump_enabled_p ())
  294                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
  295                                  nunits);
  296
  297               if (!vectorization_factor
  298                   || (nunits > vectorization_factor))
  299                 vectorization_factor = nunits;
  300             }
  301         }
  302
            /* Stmts.  SI is advanced explicitly inside the body rather than in
               the loop header, so that the same position can be revisited for
               a pattern stmt and for the stmts of its pattern def sequence.
               The loop also keeps running past the end of BB while a pattern
               stmt is still pending.  */
  303       for (gimple_stmt_iterator si = gsi_start_bb (bb);
  304            !gsi_end_p (si) || analyze_pattern_stmt;)
  305         {
  306           tree vf_vectype;
  307
  308           if (analyze_pattern_stmt)
  309             stmt = pattern_stmt;
  310           else
  311             stmt = gsi_stmt (si);
  312
  313           stmt_info = vinfo_for_stmt (stmt);
  314
  315           if (dump_enabled_p ())
  316             {
  317               dump_printf_loc (MSG_NOTE, vect_location,
  318                                "==> examining statement: ");
  319               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  320               dump_printf (MSG_NOTE, "\n");
  321             }
  322
  323           gcc_assert (stmt_info);
  324
  325           /* Skip stmts which do not need to be vectorized.  */
  326           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  327                && !STMT_VINFO_LIVE_P (stmt_info))
  328               || gimple_clobber_p (stmt))
  329             {
                    /* STMT itself is skipped, but if it has a relevant or live
                       pattern stmt, analyze that one in its place.  */
  330               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  331                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  332                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  333                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  334                 {
  335                   stmt = pattern_stmt;
  336                   stmt_info = vinfo_for_stmt (pattern_stmt);
  337                   if (dump_enabled_p ())
  338                     {
  339                       dump_printf_loc (MSG_NOTE, vect_location,
  340                                        "==> examining pattern statement: ");
  341                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  342                       dump_printf (MSG_NOTE, "\n");
  343                     }
  344                 }
  345               else
  346                 {
  347                   if (dump_enabled_p ())
  348                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  349                   gsi_next (&si);
  350                   continue;
  351                 }
  352             }
                /* STMT is analyzed now; its relevant or live pattern stmt is
                   remembered in PATTERN_STMT and picked up at the top of a
                   following iteration.  */
  353           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  354                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  355                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  356                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  357             analyze_pattern_stmt = true;
  358
  359           /* If a pattern statement has def stmts, analyze them too.  */
  360           if (is_pattern_stmt_p (stmt_info))
  361             {
                    /* First visit: start at the head of the def sequence.
                       Later visits: step past the def stmt handled on the
                       previous iteration.  */
  362               if (pattern_def_seq == NULL)
  363                 {
  364                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  365                   pattern_def_si = gsi_start (pattern_def_seq);
  366                 }
  367               else if (!gsi_end_p (pattern_def_si))
  368                 gsi_next (&pattern_def_si);
  369               if (pattern_def_seq != NULL)
  370                 {
  371                   gimple pattern_def_stmt = NULL;
  372                   stmt_vec_info pattern_def_stmt_info = NULL;
  373
                        /* Find the next relevant or live stmt in the def
                           sequence, if any.  */
  374                   while (!gsi_end_p (pattern_def_si))
  375                     {
  376                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  377                       pattern_def_stmt_info
  378                         = vinfo_for_stmt (pattern_def_stmt);
  379                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  380                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  381                         break;
  382                       gsi_next (&pattern_def_si);
  383                     }
  384
  385                   if (!gsi_end_p (pattern_def_si))
  386                     {
  387                       if (dump_enabled_p ())
  388                         {
  389                           dump_printf_loc (MSG_NOTE, vect_location,
  390                                            "==> examining pattern def stmt: ");
  391                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  392                                             pattern_def_stmt, 0);
  393                           dump_printf (MSG_NOTE, "\n");
  394                         }
  395
                            /* Analyze this def stmt on the current iteration
                               instead of the pattern stmt.  */
  396                       stmt = pattern_def_stmt;
  397                       stmt_info = pattern_def_stmt_info;
  398                     }
  399                   else
  400                     {
                            /* Def sequence exhausted: STMT is left unchanged,
                               so the pattern stmt itself is analyzed below.  */
  401                       pattern_def_si = gsi_none ();
  402                       analyze_pattern_stmt = false;
  403                     }
  404                 }
  405               else
  406                 analyze_pattern_stmt = false;
  407             }
  408
  409           if (gimple_get_lhs (stmt) == NULL_TREE
  410               /* MASK_STORE has no lhs, but is ok.  */
  411               && (!is_gimple_call (stmt)
  412                   || !gimple_call_internal_p (stmt)
  413                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
  414             {
  415               if (is_gimple_call (stmt))
  416                 {
  417                   /* Ignore calls with no lhs.  These must be calls to
  418                      #pragma omp simd functions, and what vectorization factor
  419                      it really needs can't be determined until
  420                      vectorizable_simd_clone_call.  */
                        /* Advance SI only when neither a pattern stmt nor a
                           pattern def stmt is still pending for it.  */
  421                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  422                     {
  423                       pattern_def_seq = NULL;
  424                       gsi_next (&si);
  425                     }
  426                   continue;
  427                 }
  428               if (dump_enabled_p ())
  429                 {
  430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  431                                    "not vectorized: irregular stmt.");
  432                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  433                                     0);
  434                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  435                 }
  436               return false;
  437             }
  438
                /* A stmt whose type already has a vector mode is rejected.  */
  439           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  440             {
  441               if (dump_enabled_p ())
  442                 {
  443                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  444                                    "not vectorized: vector stmt in loop:");
  445                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  446                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  447                 }
  448               return false;
  449             }
  450
  451           if (STMT_VINFO_VECTYPE (stmt_info))
  452             {
  453               /* The only case when a vectype had been already set is for stmts
  454                  that contain a dataref, or for "pattern-stmts" (stmts
  455                  generated by the vectorizer to represent/replace a certain
  456                  idiom).  */
  457               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  458                           || is_pattern_stmt_p (stmt_info)
  459                           || !gsi_end_p (pattern_def_si));
  460               vectype = STMT_VINFO_VECTYPE (stmt_info);
  461             }
  462           else
  463             {
                    /* No vectype yet: derive one from the scalar type of the
                       lhs, or of call argument 3 for a MASK_STORE (which has
                       no lhs), and record it on the stmt.  */
  464               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
  465               if (is_gimple_call (stmt)
  466                   && gimple_call_internal_p (stmt)
  467                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
  468                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
  469               else
  470                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  471               if (dump_enabled_p ())
  472                 {
  473                   dump_printf_loc (MSG_NOTE, vect_location,
  474                                    "get vectype for scalar type:  ");
  475                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  476                   dump_printf (MSG_NOTE, "\n");
  477                 }
  478               vectype = get_vectype_for_scalar_type (scalar_type);
  479               if (!vectype)
  480                 {
  481                   if (dump_enabled_p ())
  482                     {
  483                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  484                                        "not vectorized: unsupported "
  485                                        "data-type ");
  486                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  487                                          scalar_type);
  488                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  489                     }
  490                   return false;
  491                 }
  492
  493               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  494
  495               if (dump_enabled_p ())
  496                 {
  497                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  498                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  499                   dump_printf (MSG_NOTE, "\n");
  500                 }
  501             }
  502
  503           /* The vectorization factor is according to the smallest
  504              scalar type (or the largest vector size, but we only
  505              support one vector size per loop).  */
  506           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  507                                                        &dummy);
  508           if (dump_enabled_p ())
  509             {
  510               dump_printf_loc (MSG_NOTE, vect_location,
  511                                "get vectype for scalar type:  ");
  512               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  513               dump_printf (MSG_NOTE, "\n");
  514             }
  515           vf_vectype = get_vectype_for_scalar_type (scalar_type);
  516           if (!vf_vectype)
  517             {
  518               if (dump_enabled_p ())
  519                 {
  520                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  521                                    "not vectorized: unsupported data-type ");
  522                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  523                                      scalar_type);
  524                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  525                 }
  526               return false;
  527             }
  528
                /* VECTYPE (the stmt's own vector type) and VF_VECTYPE (the
                   vector type for its smallest scalar type) must have the
                   same mode size.  */
  529           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
  530                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  531             {
  532               if (dump_enabled_p ())
  533                 {
  534                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  535                                    "not vectorized: different sized vector "
  536                                    "types in statement, ");
  537                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  538                                      vectype);
  539                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  540                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  541                                      vf_vectype);
  542                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  543                 }
  544               return false;
  545             }
  546
  547           if (dump_enabled_p ())
  548             {
  549               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  550               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  551               dump_printf (MSG_NOTE, "\n");
  552             }
  553
                /* VF is the maximum number of units over all analyzed stmts
                   and PHIs.  */
  554           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
  555           if (dump_enabled_p ())
  556             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
  557           if (!vectorization_factor
  558               || (nunits > vectorization_factor))
  559             vectorization_factor = nunits;
  560
                /* Move on to the next stmt of BB only when neither a pattern
                   stmt nor a pattern def stmt is still pending for SI.  */
  561           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  562             {
  563               pattern_def_seq = NULL;
  564               gsi_next (&si);
  565             }
  566         }
  567     }
  568
  569   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  570   if (dump_enabled_p ())
  571     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
  572                      vectorization_factor);
        /* A VF of 0 (no PHI or stmt contributed a vectype) or of 1 is
           rejected.  */
  573   if (vectorization_factor <= 1)
  574     {
  575       if (dump_enabled_p ())
  576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  577                          "not vectorized: unsupported data-type\n");
  578       return false;
  579     }
  580   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  581
  582   return true;
  583 }
 584
 585
 586 /* Function vect_is_simple_iv_evolution.
 587
 588    FORNOW: A simple evolution of an induction variables in the loop is
 589    considered a polynomial evolution.  */
 590
 591 static bool
 592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 593                              tree * step)
 594 {
 595   tree init_expr;
 596   tree step_expr;
 597   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 598   basic_block bb;
 599
 600   /* When there is no evolution in this loop, the evolution function
 601      is not "simple".  */
 602   if (evolution_part == NULL_TREE)
 603     return false;
 604
 605   /* When the evolution is a polynomial of degree >= 2
 606      the evolution function is not "simple".  */
 607   if (tree_is_chrec (evolution_part))
 608     return false;
 609
 610   step_expr = evolution_part;
 611   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 612
 613   if (dump_enabled_p ())
 614     {
 615       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 616       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 617       dump_printf (MSG_NOTE, ",  init: ");
 618       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 619       dump_printf (MSG_NOTE, "\n");
 620     }
 621
 622   *init = init_expr;
 623   *step = step_expr;
 624
 625   if (TREE_CODE (step_expr) != INTEGER_CST
 626       && (TREE_CODE (step_expr) != SSA_NAME
 627           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 628               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 629           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 630               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 631                   || !flag_associative_math)))
 632       && (TREE_CODE (step_expr) != REAL_CST
 633           || !flag_associative_math))
 634     {
 635       if (dump_enabled_p ())
 636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 637                          "step unknown.\n");
 638       return false;
 639     }
 640
 641   return true;
 642 }
 643
 644 /* Function vect_analyze_scalar_cycles_1.
 645
 646    Examine the cross iteration def-use cycles of scalar variables
 647    in LOOP.  LOOP_VINFO represents the loop that is now being
 648    considered for vectorization (can be LOOP, or an outer-loop
 649    enclosing LOOP).  */
 650
 651 static void
 652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 653 {
 654   basic_block bb = loop->header;
 655   tree init, step;
 656   auto_vec<gimple, 64> worklist;
 657   gphi_iterator gsi;
 658   bool double_reduc;
 659
 660   if (dump_enabled_p ())
 661     dump_printf_loc (MSG_NOTE, vect_location,
 662                      "=== vect_analyze_scalar_cycles ===\n");
 663
 664   /* First - identify all inductions.  Reduction detection assumes that all the
 665      inductions have been identified, therefore, this order must not be
 666      changed.  */
 667   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 668     {
 669       gphi *phi = gsi.phi ();
 670       tree access_fn = NULL;
 671       tree def = PHI_RESULT (phi);
 672       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 673
 674       if (dump_enabled_p ())
 675         {
 676           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 677           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 678           dump_printf (MSG_NOTE, "\n");
 679         }
 680
 681       /* Skip virtual phi's.  The data dependences that are associated with
 682          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 683       if (virtual_operand_p (def))
 684         continue;
 685
 686       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 687
 688       /* Analyze the evolution function.  */
 689       access_fn = analyze_scalar_evolution (loop, def);
 690       if (access_fn)
 691         {
 692           STRIP_NOPS (access_fn);
 693           if (dump_enabled_p ())
 694             {
 695               dump_printf_loc (MSG_NOTE, vect_location,
 696                                "Access function of PHI: ");
 697               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 698               dump_printf (MSG_NOTE, "\n");
 699             }
 700           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 701             = evolution_part_in_loop_num (access_fn, loop->num);
 702         }
 703
 704       if (!access_fn
 705           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 706           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 707               && TREE_CODE (step) != INTEGER_CST))
 708         {
 709           worklist.safe_push (phi);
 710           continue;
 711         }
 712
 713       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 714
 715       if (dump_enabled_p ())
 716         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 717       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 718     }
 719
 720
 721   /* Second - identify all reductions and nested cycles.  */
 722   while (worklist.length () > 0)
 723     {
 724       gimple phi = worklist.pop ();
 725       tree def = PHI_RESULT (phi);
 726       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 727       gimple reduc_stmt;
 728       bool nested_cycle;
 729
 730       if (dump_enabled_p ())
 731         {
 732           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 733           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 734           dump_printf (MSG_NOTE, "\n");
 735         }
 736
 737       gcc_assert (!virtual_operand_p (def)
 738                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 739
 740       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 741       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 742                                                 &double_reduc);
 743       if (reduc_stmt)
 744         {
 745           if (double_reduc)
 746             {
 747               if (dump_enabled_p ())
 748                 dump_printf_loc (MSG_NOTE, vect_location,
 749                                  "Detected double reduction.\n");
 750
 751               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 752               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 753                                                     vect_double_reduction_def;
 754             }
 755           else
 756             {
 757               if (nested_cycle)
 758                 {
 759                   if (dump_enabled_p ())
 760                     dump_printf_loc (MSG_NOTE, vect_location,
 761                                      "Detected vectorizable nested cycle.\n");
 762
 763                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 764                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 765                                                              vect_nested_cycle;
 766                 }
 767               else
 768                 {
 769                   if (dump_enabled_p ())
 770                     dump_printf_loc (MSG_NOTE, vect_location,
 771                                      "Detected reduction.\n");
 772
 773                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 774                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 775                                                            vect_reduction_def;
 776                   /* Store the reduction cycles for possible vectorization in
 777                      loop-aware SLP.  */
 778                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 779                 }
 780             }
 781         }
 782       else
 783         if (dump_enabled_p ())
 784           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 785                            "Unknown def-use cycle pattern.\n");
 786     }
 787 }
 788
 789
 790 /* Function vect_analyze_scalar_cycles.
 791
 792    Examine the cross iteration def-use cycles of scalar variables, by
 793    analyzing the loop-header PHIs of scalar variables.  Classify each
 794    cycle as one of the following: invariant, induction, reduction, unknown.
 795    We do that for the loop represented by LOOP_VINFO, and also to its
 796    inner-loop, if exists.
 797    Examples for scalar cycles:
 798
 799    Example1: reduction:
 800
 801               loop1:
 802               for (i=0; i<N; i++)
 803                  sum += a[i];
 804
 805    Example2: induction:
 806
 807               loop2:
 808               for (i=0; i<N; i++)
 809                  a[i] = i;  */
 810
 811 static void
 812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 813 {
 814   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 815
 816   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 817
 818   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 819      Reductions in such inner-loop therefore have different properties than
 820      the reductions in the nest that gets vectorized:
 821      1. When vectorized, they are executed in the same order as in the original
 822         scalar loop, so we can't change the order of computation when
 823         vectorizing them.
 824      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 825         current checks are too strict.  */
 826
 827   if (loop->inner)
 828     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 829 }
 830
 831
 832 /* Function vect_get_loop_niters.
 833
 834    Determine how many iterations the loop is executed and place it
 835    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 836    in NUMBER_OF_ITERATIONSM1.
 837
 838    Return the loop exit condition.  */
 839
 840
 841 static gcond *
 842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 843                       tree *number_of_iterationsm1)
 844 {
 845   tree niters;
 846
 847   if (dump_enabled_p ())
 848     dump_printf_loc (MSG_NOTE, vect_location,
 849                      "=== get_loop_niters ===\n");
 850
 851   niters = number_of_latch_executions (loop);
 852   *number_of_iterationsm1 = niters;
 853
 854   /* We want the number of loop header executions which is the number
 855      of latch executions plus one.
 856      ???  For UINT_MAX latch executions this number overflows to zero
 857      for loops like do { n++; } while (n != 0);  */
 858   if (niters && !chrec_contains_undetermined (niters))
 859     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 860                           build_int_cst (TREE_TYPE (niters), 1));
 861   *number_of_iterations = niters;
 862
 863   return get_loop_exit_condition (loop);
 864 }
 865
 866
 867 /* Function bb_in_loop_p
 868
 869    Used as predicate for dfs order traversal of the loop bbs.  */
 870
 871 static bool
 872 bb_in_loop_p (const_basic_block bb, const void *data)
 873 {
 874   const struct loop *const loop = (const struct loop *)data;
 875   if (flow_bb_inside_loop_p (loop, bb))
 876     return true;
 877   return false;
 878 }
 879
 880
 881 /* Function new_loop_vec_info.
 882
 883    Create and initialize a new loop_vec_info struct for LOOP, as well as
 884    stmt_vec_info structs for all the stmts in LOOP.  */
 885
 886 static loop_vec_info
 887 new_loop_vec_info (struct loop *loop)
 888 {
 889   loop_vec_info res;
 890   basic_block *bbs;
 891   gimple_stmt_iterator si;
 892   unsigned int i, nbbs;
 893
 894   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 895   LOOP_VINFO_LOOP (res) = loop;
 896
 897   bbs = get_loop_body (loop);
 898
 899   /* Create/Update stmt_info for all stmts in the loop.  */
 900   for (i = 0; i < loop->num_nodes; i++)
 901     {
 902       basic_block bb = bbs[i];
 903
 904       /* BBs in a nested inner-loop will have been already processed (because
 905          we will have called vect_analyze_loop_form for any nested inner-loop).
 906          Therefore, for stmts in an inner-loop we just want to update the
 907          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 908          loop_info of the outer-loop we are currently considering to vectorize
 909          (instead of the loop_info of the inner-loop).
 910          For stmts in other BBs we need to create a stmt_info from scratch.  */
 911       if (bb->loop_father != loop)
 912         {
 913           /* Inner-loop bb.  */
 914           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 915           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 916             {
 917               gimple phi = gsi_stmt (si);
 918               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 919               loop_vec_info inner_loop_vinfo =
 920                 STMT_VINFO_LOOP_VINFO (stmt_info);
 921               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 922               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 923             }
 924           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 925            {
 926               gimple stmt = gsi_stmt (si);
 927               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 928               loop_vec_info inner_loop_vinfo =
 929                  STMT_VINFO_LOOP_VINFO (stmt_info);
 930               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 931               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 932            }
 933         }
 934       else
 935         {
 936           /* bb in current nest.  */
 937           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 938             {
 939               gimple phi = gsi_stmt (si);
 940               gimple_set_uid (phi, 0);
 941               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 942             }
 943
 944           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 945             {
 946               gimple stmt = gsi_stmt (si);
 947               gimple_set_uid (stmt, 0);
 948               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 949             }
 950         }
 951     }
 952
 953   /* CHECKME: We want to visit all BBs before their successors (except for
 954      latch blocks, for which this assertion wouldn't hold).  In the simple
 955      case of the loop forms we allow, a dfs order of the BBs would the same
 956      as reversed postorder traversal, so we are safe.  */
 957
 958    free (bbs);
 959    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 960    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 961                               bbs, loop->num_nodes, loop);
 962    gcc_assert (nbbs == loop->num_nodes);
 963
 964   LOOP_VINFO_BBS (res) = bbs;
 965   LOOP_VINFO_NITERSM1 (res) = NULL;
 966   LOOP_VINFO_NITERS (res) = NULL;
 967   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 968   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 969   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 970   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 971   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 972   LOOP_VINFO_VECT_FACTOR (res) = 0;
 973   LOOP_VINFO_LOOP_NEST (res).create (3);
 974   LOOP_VINFO_DATAREFS (res).create (10);
 975   LOOP_VINFO_DDRS (res).create (10 * 10);
 976   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 977   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 978              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 979   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 980              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 981   LOOP_VINFO_GROUPED_STORES (res).create (10);
 982   LOOP_VINFO_REDUCTIONS (res).create (10);
 983   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 984   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 985   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 986   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 987   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 988   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 989   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 990
 991   return res;
 992 }
 993
 994
 995 /* Function destroy_loop_vec_info.
 996
 997    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 998    stmts in the loop.  */
 999
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1002 {
1003   struct loop *loop;
1004   basic_block *bbs;
1005   int nbbs;
1006   gimple_stmt_iterator si;
1007   int j;
1008   vec<slp_instance> slp_instances;
1009   slp_instance instance;
1010   bool swapped;
1011
1012   if (!loop_vinfo)
1013     return;
1014
1015   loop = LOOP_VINFO_LOOP (loop_vinfo);
1016
1017   bbs = LOOP_VINFO_BBS (loop_vinfo);
1018   nbbs = clean_stmts ? loop->num_nodes : 0;
1019   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1020
1021   for (j = 0; j < nbbs; j++)
1022     {
1023       basic_block bb = bbs[j];
1024       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025         free_stmt_vec_info (gsi_stmt (si));
1026
1027       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1028         {
1029           gimple stmt = gsi_stmt (si);
1030
1031           /* We may have broken canonical form by moving a constant
1032              into RHS1 of a commutative op.  Fix such occurrences.  */
1033           if (swapped && is_gimple_assign (stmt))
1034             {
1035               enum tree_code code = gimple_assign_rhs_code (stmt);
1036
1037               if ((code == PLUS_EXPR
1038                    || code == POINTER_PLUS_EXPR
1039                    || code == MULT_EXPR)
1040                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041                 swap_ssa_operands (stmt,
1042                                    gimple_assign_rhs1_ptr (stmt),
1043                                    gimple_assign_rhs2_ptr (stmt));
1044             }
1045
1046           /* Free stmt_vec_info.  */
1047           free_stmt_vec_info (stmt);
1048           gsi_next (&si);
1049         }
1050     }
1051
1052   free (LOOP_VINFO_BBS (loop_vinfo));
1053   vect_destroy_datarefs (loop_vinfo, NULL);
1054   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060     vect_free_slp_instance (instance);
1061
1062   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1066
1067   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1069
1070   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1071
1072   free (loop_vinfo);
1073   loop->aux = NULL;
1074 }
1075
1076
1077 /* Function vect_analyze_loop_1.
1078
1079    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080    for it. The different analyses will record information in the
1081    loop_vec_info struct.  This is a subset of the analyses applied in
1082    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083    that is now considered for (outer-loop) vectorization.  */
1084
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1087 {
1088   loop_vec_info loop_vinfo;
1089
1090   if (dump_enabled_p ())
1091     dump_printf_loc (MSG_NOTE, vect_location,
1092                      "===== analyze_loop_nest_1 =====\n");
1093
1094   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1095
1096   loop_vinfo = vect_analyze_loop_form (loop);
1097   if (!loop_vinfo)
1098     {
1099       if (dump_enabled_p ())
1100         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101                          "bad inner-loop form.\n");
1102       return NULL;
1103     }
1104
1105   return loop_vinfo;
1106 }
1107
1108
1109 /* Function vect_analyze_loop_form.
1110
1111    Verify that certain CFG restrictions hold, including:
1112    - the loop has a pre-header
1113    - the loop has a single entry and exit
1114    - the loop exit condition is simple enough, and the number of iterations
1115      can be analyzed (a countable loop).  */
1116
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1119 {
1120   loop_vec_info loop_vinfo;
1121   gcond *loop_cond;
1122   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123   loop_vec_info inner_loop_vinfo = NULL;
1124
1125   if (dump_enabled_p ())
1126     dump_printf_loc (MSG_NOTE, vect_location,
1127                      "=== vect_analyze_loop_form ===\n");
1128
1129   /* Different restrictions apply when we are considering an inner-most loop,
1130      vs. an outer (nested) loop.
1131      (FORNOW. May want to relax some of these restrictions in the future).  */
1132
1133   if (!loop->inner)
1134     {
1135       /* Inner-most loop.  We currently require that the number of BBs is
1136          exactly 2 (the header and latch).  Vectorizable inner-most loops
1137          look like this:
1138
1139                         (pre-header)
1140                            |
1141                           header <--------+
1142                            | |            |
1143                            | +--> latch --+
1144                            |
1145                         (exit-bb)  */
1146
1147       if (loop->num_nodes != 2)
1148         {
1149           if (dump_enabled_p ())
1150             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151                              "not vectorized: control flow in loop.\n");
1152           return NULL;
1153         }
1154
1155       if (empty_block_p (loop->header))
1156         {
1157           if (dump_enabled_p ())
1158             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159                              "not vectorized: empty loop.\n");
1160           return NULL;
1161         }
1162     }
1163   else
1164     {
1165       struct loop *innerloop = loop->inner;
1166       edge entryedge;
1167
1168       /* Nested loop. We currently require that the loop is doubly-nested,
1169          contains a single inner loop, and the number of BBs is exactly 5.
1170          Vectorizable outer-loops look like this:
1171
1172                         (pre-header)
1173                            |
1174                           header <---+
1175                            |         |
1176                           inner-loop |
1177                            |         |
1178                           tail ------+
1179                            |
1180                         (exit-bb)
1181
1182          The inner-loop has the properties expected of inner-most loops
1183          as described above.  */
1184
1185       if ((loop->inner)->inner || (loop->inner)->next)
1186         {
1187           if (dump_enabled_p ())
1188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189                              "not vectorized: multiple nested loops.\n");
1190           return NULL;
1191         }
1192
1193       /* Analyze the inner-loop.  */
1194       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195       if (!inner_loop_vinfo)
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: Bad inner loop.\n");
1200           return NULL;
1201         }
1202
1203       if (!expr_invariant_in_loop_p (loop,
1204                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1205         {
1206           if (dump_enabled_p ())
1207             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208                              "not vectorized: inner-loop count not"
1209                              " invariant.\n");
1210           destroy_loop_vec_info (inner_loop_vinfo, true);
1211           return NULL;
1212         }
1213
1214       if (loop->num_nodes != 5)
1215         {
1216           if (dump_enabled_p ())
1217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218                              "not vectorized: control flow in loop.\n");
1219           destroy_loop_vec_info (inner_loop_vinfo, true);
1220           return NULL;
1221         }
1222
1223       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224       entryedge = EDGE_PRED (innerloop->header, 0);
1225       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226         entryedge = EDGE_PRED (innerloop->header, 1);
1227
1228       if (entryedge->src != loop->header
1229           || !single_exit (innerloop)
1230           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1231         {
1232           if (dump_enabled_p ())
1233             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234                              "not vectorized: unsupported outerloop form.\n");
1235           destroy_loop_vec_info (inner_loop_vinfo, true);
1236           return NULL;
1237         }
1238
1239       if (dump_enabled_p ())
1240         dump_printf_loc (MSG_NOTE, vect_location,
1241                          "Considering outer-loop vectorization.\n");
1242     }
1243
1244   if (!single_exit (loop)
1245       || EDGE_COUNT (loop->header->preds) != 2)
1246     {
1247       if (dump_enabled_p ())
1248         {
1249           if (!single_exit (loop))
1250             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251                              "not vectorized: multiple exits.\n");
1252           else if (EDGE_COUNT (loop->header->preds) != 2)
1253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254                              "not vectorized: too many incoming edges.\n");
1255         }
1256       if (inner_loop_vinfo)
1257         destroy_loop_vec_info (inner_loop_vinfo, true);
1258       return NULL;
1259     }
1260
1261   /* We assume that the loop exit condition is at the end of the loop. i.e,
1262      that the loop is represented as a do-while (with a proper if-guard
1263      before the loop if needed), where the loop header contains all the
1264      executable statements, and the latch is empty.  */
1265   if (!empty_block_p (loop->latch)
1266       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267     {
1268       if (dump_enabled_p ())
1269         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270                          "not vectorized: latch block not empty.\n");
1271       if (inner_loop_vinfo)
1272         destroy_loop_vec_info (inner_loop_vinfo, true);
1273       return NULL;
1274     }
1275
1276   /* Make sure there exists a single-predecessor exit bb:  */
1277   if (!single_pred_p (single_exit (loop)->dest))
1278     {
1279       edge e = single_exit (loop);
1280       if (!(e->flags & EDGE_ABNORMAL))
1281         {
1282           split_loop_exit_edge (e);
1283           if (dump_enabled_p ())
1284             dump_printf (MSG_NOTE, "split exit edge.\n");
1285         }
1286       else
1287         {
1288           if (dump_enabled_p ())
1289             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290                              "not vectorized: abnormal loop exit edge.\n");
1291           if (inner_loop_vinfo)
1292             destroy_loop_vec_info (inner_loop_vinfo, true);
1293           return NULL;
1294         }
1295     }
1296
1297   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298                                     &number_of_iterationsm1);
1299   if (!loop_cond)
1300     {
1301       if (dump_enabled_p ())
1302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303                          "not vectorized: complicated exit condition.\n");
1304       if (inner_loop_vinfo)
1305         destroy_loop_vec_info (inner_loop_vinfo, true);
1306       return NULL;
1307     }
1308
1309   if (!number_of_iterations
1310       || chrec_contains_undetermined (number_of_iterations))
1311     {
1312       if (dump_enabled_p ())
1313         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314                          "not vectorized: number of iterations cannot be "
1315                          "computed.\n");
1316       if (inner_loop_vinfo)
1317         destroy_loop_vec_info (inner_loop_vinfo, true);
1318       return NULL;
1319     }
1320
1321   if (integer_zerop (number_of_iterations))
1322     {
1323       if (dump_enabled_p ())
1324         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325                          "not vectorized: number of iterations = 0.\n");
1326       if (inner_loop_vinfo)
1327         destroy_loop_vec_info (inner_loop_vinfo, true);
1328       return NULL;
1329     }
1330
1331   loop_vinfo = new_loop_vec_info (loop);
1332   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1335
1336   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1337     {
1338       if (dump_enabled_p ())
1339         {
1340           dump_printf_loc (MSG_NOTE, vect_location,
1341                            "Symbolic number of iterations is ");
1342           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343           dump_printf (MSG_NOTE, "\n");
1344         }
1345     }
1346
1347   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1348
1349   /* CHECKME: May want to keep it around it in the future.  */
1350   if (inner_loop_vinfo)
1351     destroy_loop_vec_info (inner_loop_vinfo, false);
1352
1353   gcc_assert (!loop->aux);
1354   loop->aux = loop_vinfo;
1355   return loop_vinfo;
1356 }
1357
1358 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1359    statements update the vectorization factor.  */
1360
1361 static void
1362 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1363 {
1364   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1365   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1366   int nbbs = loop->num_nodes;
1367   unsigned int vectorization_factor;
1368   int i;
1369
1370   if (dump_enabled_p ())
1371     dump_printf_loc (MSG_NOTE, vect_location,
1372                      "=== vect_update_vf_for_slp ===\n");
1373
1374   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1375   gcc_assert (vectorization_factor != 0);
1376
1377   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1378      vectorization factor of the loop is the unrolling factor required by
1379      the SLP instances.  If that unrolling factor is 1, we say, that we
1380      perform pure SLP on loop - cross iteration parallelism is not
1381      exploited.  */
1382   bool only_slp_in_loop = true;
1383   for (i = 0; i < nbbs; i++)
1384     {
1385       basic_block bb = bbs[i];
1386       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1387            gsi_next (&si))
1388         {
1389           gimple stmt = gsi_stmt (si);
1390           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1392               && STMT_VINFO_RELATED_STMT (stmt_info))
1393             {
1394               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1395               stmt_info = vinfo_for_stmt (stmt);
1396             }
1397           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399               && !PURE_SLP_STMT (stmt_info))
1400             /* STMT needs both SLP and loop-based vectorization.  */
1401             only_slp_in_loop = false;
1402         }
1403     }
1404
1405   if (only_slp_in_loop)
1406     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1407   else
1408     vectorization_factor
1409       = least_common_multiple (vectorization_factor,
1410                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1411
1412   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1413   if (dump_enabled_p ())
1414     dump_printf_loc (MSG_NOTE, vect_location,
1415                      "Updating vectorization factor to %d\n",
1416                      vectorization_factor);
1417 }
1418
1419 /* Function vect_analyze_loop_operations.
1420
1421    Scan the loop stmts and make sure they are all vectorizable.  */
1422
1423 static bool
1424 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1425 {
1426   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1427   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1428   int nbbs = loop->num_nodes;
1429   unsigned int vectorization_factor;
1430   int i;
1431   stmt_vec_info stmt_info;
1432   bool need_to_vectorize = false;
1433   int min_profitable_iters;
1434   int min_scalar_loop_bound;
1435   unsigned int th;
1436   bool ok;
1437   HOST_WIDE_INT max_niter;
1438   HOST_WIDE_INT estimated_niter;
1439   int min_profitable_estimate;
1440
1441   if (dump_enabled_p ())
1442     dump_printf_loc (MSG_NOTE, vect_location,
1443                      "=== vect_analyze_loop_operations ===\n");
1444
1445   for (i = 0; i < nbbs; i++)
1446     {
1447       basic_block bb = bbs[i];
1448
1449       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1450            gsi_next (&si))
1451         {
1452           gphi *phi = si.phi ();
1453           ok = true;
1454
1455           stmt_info = vinfo_for_stmt (phi);
1456           if (dump_enabled_p ())
1457             {
1458               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1459               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1460               dump_printf (MSG_NOTE, "\n");
1461             }
1462
1463           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1464              (i.e., a phi in the tail of the outer-loop).  */
1465           if (! is_loop_header_bb_p (bb))
1466             {
1467               /* FORNOW: we currently don't support the case that these phis
1468                  are not used in the outerloop (unless it is double reduction,
1469                  i.e., this phi is vect_reduction_def), cause this case
1470                  requires to actually do something here.  */
1471               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1472                    || STMT_VINFO_LIVE_P (stmt_info))
1473                   && STMT_VINFO_DEF_TYPE (stmt_info)
1474                      != vect_double_reduction_def)
1475                 {
1476                   if (dump_enabled_p ())
1477                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1478                                      "Unsupported loop-closed phi in "
1479                                      "outer-loop.\n");
1480                   return false;
1481                 }
1482
1483               /* If PHI is used in the outer loop, we check that its operand
1484                  is defined in the inner loop.  */
1485               if (STMT_VINFO_RELEVANT_P (stmt_info))
1486                 {
1487                   tree phi_op;
1488                   gimple op_def_stmt;
1489
1490                   if (gimple_phi_num_args (phi) != 1)
1491                     return false;
1492
1493                   phi_op = PHI_ARG_DEF (phi, 0);
1494                   if (TREE_CODE (phi_op) != SSA_NAME)
1495                     return false;
1496
1497                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1498                   if (gimple_nop_p (op_def_stmt)
1499                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1500                       || !vinfo_for_stmt (op_def_stmt))
1501                     return false;
1502
1503                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1504                         != vect_used_in_outer
1505                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1506                            != vect_used_in_outer_by_reduction)
1507                     return false;
1508                 }
1509
1510               continue;
1511             }
1512
1513           gcc_assert (stmt_info);
1514
1515           if (STMT_VINFO_LIVE_P (stmt_info))
1516             {
1517               /* FORNOW: not yet supported.  */
1518               if (dump_enabled_p ())
1519                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1520                                  "not vectorized: value used after loop.\n");
1521               return false;
1522             }
1523
1524           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1525               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1526             {
1527               /* A scalar-dependence cycle that we don't support.  */
1528               if (dump_enabled_p ())
1529                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530                                  "not vectorized: scalar dependence cycle.\n");
1531               return false;
1532             }
1533
1534           if (STMT_VINFO_RELEVANT_P (stmt_info))
1535             {
1536               need_to_vectorize = true;
1537               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1538                 ok = vectorizable_induction (phi, NULL, NULL);
1539             }
1540
1541           if (!ok)
1542             {
1543               if (dump_enabled_p ())
1544                 {
1545                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546                                    "not vectorized: relevant phi not "
1547                                    "supported: ");
1548                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1549                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1550                 }
1551               return false;
1552             }
1553         }
1554
1555       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556            gsi_next (&si))
1557         {
1558           gimple stmt = gsi_stmt (si);
1559           if (STMT_SLP_TYPE (vinfo_for_stmt (stmt)))
1560             {
1561               need_to_vectorize = true;
1562               continue;
1563             }
1564           if (!gimple_clobber_p (stmt)
1565               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1566             return false;
1567         }
1568     } /* bbs */
1569
1570   /* All operations in the loop are either irrelevant (deal with loop
1571      control, or dead), or only used outside the loop and can be moved
1572      out of the loop (e.g. invariants, inductions).  The loop can be
1573      optimized away by scalar optimizations.  We're better off not
1574      touching this loop.  */
1575   if (!need_to_vectorize)
1576     {
1577       if (dump_enabled_p ())
1578         dump_printf_loc (MSG_NOTE, vect_location,
1579                          "All the computation can be taken out of the loop.\n");
1580       if (dump_enabled_p ())
1581         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1582                          "not vectorized: redundant loop. no profit to "
1583                          "vectorize.\n");
1584       return false;
1585     }
1586
1587   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1588   gcc_assert (vectorization_factor != 0);
1589
1590   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1591     dump_printf_loc (MSG_NOTE, vect_location,
1592                      "vectorization_factor = %d, niters = "
1593                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1594                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1595
1596   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1597        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1598       || ((max_niter = max_stmt_executions_int (loop)) != -1
1599           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1600     {
1601       if (dump_enabled_p ())
1602         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1603                          "not vectorized: iteration count too small.\n");
1604       if (dump_enabled_p ())
1605         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606                          "not vectorized: iteration count smaller than "
1607                          "vectorization factor.\n");
1608       return false;
1609     }
1610
1611   /* Analyze cost.  Decide if worth while to vectorize.  */
1612
1613   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1614                                       &min_profitable_estimate);
1615   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1616
1617   if (min_profitable_iters < 0)
1618     {
1619       if (dump_enabled_p ())
1620         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621                          "not vectorized: vectorization not profitable.\n");
1622       if (dump_enabled_p ())
1623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624                          "not vectorized: vector version will never be "
1625                          "profitable.\n");
1626       return false;
1627     }
1628
1629   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1630                             * vectorization_factor) - 1);
1631
1632
1633   /* Use the cost model only if it is more conservative than user specified
1634      threshold.  */
1635
1636   th = (unsigned) min_scalar_loop_bound;
1637   if (min_profitable_iters
1638       && (!min_scalar_loop_bound
1639           || min_profitable_iters > min_scalar_loop_bound))
1640     th = (unsigned) min_profitable_iters;
1641
1642   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1643
1644   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1645       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1646     {
1647       if (dump_enabled_p ())
1648         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649                          "not vectorized: vectorization not profitable.\n");
1650       if (dump_enabled_p ())
1651         dump_printf_loc (MSG_NOTE, vect_location,
1652                          "not vectorized: iteration count smaller than user "
1653                          "specified loop bound parameter or minimum profitable "
1654                          "iterations (whichever is more conservative).\n");
1655       return false;
1656     }
1657
1658   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1659       && ((unsigned HOST_WIDE_INT) estimated_niter
1660           <= MAX (th, (unsigned)min_profitable_estimate)))
1661     {
1662       if (dump_enabled_p ())
1663         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664                          "not vectorized: estimated iteration count too "
1665                          "small.\n");
1666       if (dump_enabled_p ())
1667         dump_printf_loc (MSG_NOTE, vect_location,
1668                          "not vectorized: estimated iteration count smaller "
1669                          "than specified loop bound parameter or minimum "
1670                          "profitable iterations (whichever is more "
1671                          "conservative).\n");
1672       return false;
1673     }
1674
1675   return true;
1676 }
1677
1678
1679 /* Function vect_analyze_loop_2.
1680
1681    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1682    for it.  The different analyses will record information in the
1683    loop_vec_info struct.  */
1684 static bool
1685 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1686 {
1687   bool ok;
1688   int max_vf = MAX_VECTORIZATION_FACTOR;
1689   int min_vf = 2;
1690   unsigned int th;
1691   unsigned int n_stmts = 0;
1692
1693   /* Find all data references in the loop (which correspond to vdefs/vuses)
1694      and analyze their evolution in the loop.  Also adjust the minimal
1695      vectorization factor according to the loads and stores.
1696
1697      FORNOW: Handle only simple, array references, which
1698      alignment can be forced, and aligned pointer-references.  */
1699
1700   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1701   if (!ok)
1702     {
1703       if (dump_enabled_p ())
1704         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1705                          "bad data references.\n");
1706       return false;
1707     }
1708
1709   /* Classify all cross-iteration scalar data-flow cycles.
1710      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1711
1712   vect_analyze_scalar_cycles (loop_vinfo);
1713
1714   vect_pattern_recog (loop_vinfo, NULL);
1715
1716   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1717      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1718
1719   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1720   if (!ok)
1721     {
1722       if (dump_enabled_p ())
1723         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724                          "bad data access.\n");
1725       return false;
1726     }
1727
1728   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1729
1730   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1731   if (!ok)
1732     {
1733       if (dump_enabled_p ())
1734         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735                          "unexpected pattern.\n");
1736       return false;
1737     }
1738
1739   /* Analyze data dependences between the data-refs in the loop
1740      and adjust the maximum vectorization factor according to
1741      the dependences.
1742      FORNOW: fail at the first data dependence that we encounter.  */
1743
1744   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1745   if (!ok
1746       || max_vf < min_vf)
1747     {
1748       if (dump_enabled_p ())
1749             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1750                              "bad data dependence.\n");
1751       return false;
1752     }
1753
1754   ok = vect_determine_vectorization_factor (loop_vinfo);
1755   if (!ok)
1756     {
1757       if (dump_enabled_p ())
1758         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1759                          "can't determine vectorization factor.\n");
1760       return false;
1761     }
1762   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1763     {
1764       if (dump_enabled_p ())
1765         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1766                          "bad data dependence.\n");
1767       return false;
1768     }
1769
1770   /* Analyze the alignment of the data-refs in the loop.
1771      Fail if a data reference is found that cannot be vectorized.  */
1772
1773   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1774   if (!ok)
1775     {
1776       if (dump_enabled_p ())
1777         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778                          "bad data alignment.\n");
1779       return false;
1780     }
1781
1782   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1783      It is important to call pruning after vect_analyze_data_ref_accesses,
1784      since we use grouping information gathered by interleaving analysis.  */
1785   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1786   if (!ok)
1787     {
1788       if (dump_enabled_p ())
1789         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790                          "number of versioning for alias "
1791                          "run-time tests exceeds %d "
1792                          "(--param vect-max-version-for-alias-checks)\n",
1793                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1794       return false;
1795     }
1796
1797   /* This pass will decide on using loop versioning and/or loop peeling in
1798      order to enhance the alignment of data references in the loop.  */
1799
1800   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1801   if (!ok)
1802     {
1803       if (dump_enabled_p ())
1804         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805                          "bad data alignment.\n");
1806       return false;
1807     }
1808
1809   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1810   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1811   if (ok)
1812     {
1813       /* If there are any SLP instances mark them as pure_slp.  */
1814       if (vect_make_slp_decision (loop_vinfo))
1815         {
1816           /* Find stmts that need to be both vectorized and SLPed.  */
1817           vect_detect_hybrid_slp (loop_vinfo);
1818
1819           /* Update the vectorization factor based on the SLP decision.  */
1820           vect_update_vf_for_slp (loop_vinfo);
1821
1822           /* Once VF is set, SLP costs should be updated since the number of
1823              created vector stmts depends on VF.  */
1824           vect_update_slp_costs_according_to_vf (loop_vinfo);
1825
1826           /* Analyze operations in the SLP instances.  Note this may
1827              remove unsupported SLP instances which makes the above
1828              SLP kind detection invalid.  */
1829           unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1830           vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1831           if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1832             return false;
1833         }
1834     }
1835   else
1836     return false;
1837
1838   /* Scan all the remaining operations in the loop that are not subject
1839      to SLP and make sure they are vectorizable.  */
1840   ok = vect_analyze_loop_operations (loop_vinfo);
1841   if (!ok)
1842     {
1843       if (dump_enabled_p ())
1844         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1845                          "bad operation or unsupported loop bound.\n");
1846       return false;
1847     }
1848
1849   /* Decide whether we need to create an epilogue loop to handle
1850      remaining scalar iterations.  */
1851   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1852         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1853        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1854
1855   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1856       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1857     {
1858       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1859                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1860           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1861         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1862     }
1863   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1864            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1865                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1866                /* In case of versioning, check if the maximum number of
1867                   iterations is greater than th.  If they are identical,
1868                   the epilogue is unnecessary.  */
1869                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1870                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1871                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1872                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1873     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1874
1875   /* If an epilogue loop is required make sure we can create one.  */
1876   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1877       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1878     {
1879       if (dump_enabled_p ())
1880         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1881       if (!vect_can_advance_ivs_p (loop_vinfo)
1882           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1883                                            single_exit (LOOP_VINFO_LOOP
1884                                                          (loop_vinfo))))
1885         {
1886           if (dump_enabled_p ())
1887             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888                              "not vectorized: can't create required "
1889                              "epilog loop\n");
1890           return false;
1891         }
1892     }
1893
1894   return true;
1895 }
1896
1897 /* Function vect_analyze_loop.
1898
1899    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1900    for it.  The different analyses will record information in the
1901    loop_vec_info struct.  */
1902 loop_vec_info
1903 vect_analyze_loop (struct loop *loop)
1904 {
1905   loop_vec_info loop_vinfo;
1906   unsigned int vector_sizes;
1907
1908   /* Autodetect first vector size we try.  */
1909   current_vector_size = 0;
1910   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1911
1912   if (dump_enabled_p ())
1913     dump_printf_loc (MSG_NOTE, vect_location,
1914                      "===== analyze_loop_nest =====\n");
1915
1916   if (loop_outer (loop)
1917       && loop_vec_info_for_loop (loop_outer (loop))
1918       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1919     {
1920       if (dump_enabled_p ())
1921         dump_printf_loc (MSG_NOTE, vect_location,
1922                          "outer-loop already vectorized.\n");
1923       return NULL;
1924     }
1925
1926   while (1)
1927     {
1928       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1929       loop_vinfo = vect_analyze_loop_form (loop);
1930       if (!loop_vinfo)
1931         {
1932           if (dump_enabled_p ())
1933             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934                              "bad loop form.\n");
1935           return NULL;
1936         }
1937
1938       if (vect_analyze_loop_2 (loop_vinfo))
1939         {
1940           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1941
1942           return loop_vinfo;
1943         }
1944
1945       destroy_loop_vec_info (loop_vinfo, true);
1946
1947       vector_sizes &= ~current_vector_size;
1948       if (vector_sizes == 0
1949           || current_vector_size == 0)
1950         return NULL;
1951
1952       /* Try the next biggest vector size.  */
1953       current_vector_size = 1 << floor_log2 (vector_sizes);
1954       if (dump_enabled_p ())
1955         dump_printf_loc (MSG_NOTE, vect_location,
1956                          "***** Re-trying analysis with "
1957                          "vector size %d\n", current_vector_size);
1958     }
1959 }
1960
1961
1962 /* Function reduction_code_for_scalar_code
1963
1964    Input:
1965    CODE - tree_code of a reduction operations.
1966
1967    Output:
1968    REDUC_CODE - the corresponding tree-code to be used to reduce the
1969       vector of partial results into a single scalar result, or ERROR_MARK
1970       if the operation is a supported reduction operation, but does not have
1971       such a tree-code.
1972
1973    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1974
1975 static bool
1976 reduction_code_for_scalar_code (enum tree_code code,
1977                                 enum tree_code *reduc_code)
1978 {
1979   switch (code)
1980     {
1981       case MAX_EXPR:
1982         *reduc_code = REDUC_MAX_EXPR;
1983         return true;
1984
1985       case MIN_EXPR:
1986         *reduc_code = REDUC_MIN_EXPR;
1987         return true;
1988
1989       case PLUS_EXPR:
1990         *reduc_code = REDUC_PLUS_EXPR;
1991         return true;
1992
1993       case MULT_EXPR:
1994       case MINUS_EXPR:
1995       case BIT_IOR_EXPR:
1996       case BIT_XOR_EXPR:
1997       case BIT_AND_EXPR:
1998         *reduc_code = ERROR_MARK;
1999         return true;
2000
2001       default:
2002        return false;
2003     }
2004 }
2005
2006
2007 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2008    STMT is printed with a message MSG. */
2009
2010 static void
2011 report_vect_op (int msg_type, gimple stmt, const char *msg)
2012 {
2013   dump_printf_loc (msg_type, vect_location, "%s", msg);
2014   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2015   dump_printf (msg_type, "\n");
2016 }
2017
2018
2019 /* Detect SLP reduction of the form:
2020
2021    #a1 = phi <a5, a0>
2022    a2 = operation (a1)
2023    a3 = operation (a2)
2024    a4 = operation (a3)
2025    a5 = operation (a4)
2026
2027    #a = phi <a5>
2028
2029    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2030    FIRST_STMT is the first reduction stmt in the chain
2031    (a2 = operation (a1)).
2032
2033    Return TRUE if a reduction chain was detected.  */
2034
2035 static bool
2036 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2037 {
2038   struct loop *loop = (gimple_bb (phi))->loop_father;
2039   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2040   enum tree_code code;
2041   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2042   stmt_vec_info use_stmt_info, current_stmt_info;
2043   tree lhs;
2044   imm_use_iterator imm_iter;
2045   use_operand_p use_p;
2046   int nloop_uses, size = 0, n_out_of_loop_uses;
2047   bool found = false;
2048
2049   if (loop != vect_loop)
2050     return false;
2051
2052   lhs = PHI_RESULT (phi);
2053   code = gimple_assign_rhs_code (first_stmt);
2054   while (1)
2055     {
2056       nloop_uses = 0;
2057       n_out_of_loop_uses = 0;
2058       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2059         {
2060           gimple use_stmt = USE_STMT (use_p);
2061           if (is_gimple_debug (use_stmt))
2062             continue;
2063
2064           /* Check if we got back to the reduction phi.  */
2065           if (use_stmt == phi)
2066             {
2067               loop_use_stmt = use_stmt;
2068               found = true;
2069               break;
2070             }
2071
2072           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2073             {
2074               loop_use_stmt = use_stmt;
2075               nloop_uses++;
2076             }
2077            else
2078              n_out_of_loop_uses++;
2079
2080            /* There are can be either a single use in the loop or two uses in
2081               phi nodes.  */
2082            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2083              return false;
2084         }
2085
2086       if (found)
2087         break;
2088
2089       /* We reached a statement with no loop uses.  */
2090       if (nloop_uses == 0)
2091         return false;
2092
2093       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2094       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2095         return false;
2096
2097       if (!is_gimple_assign (loop_use_stmt)
2098           || code != gimple_assign_rhs_code (loop_use_stmt)
2099           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2100         return false;
2101
2102       /* Insert USE_STMT into reduction chain.  */
2103       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2104       if (current_stmt)
2105         {
2106           current_stmt_info = vinfo_for_stmt (current_stmt);
2107           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2108           GROUP_FIRST_ELEMENT (use_stmt_info)
2109             = GROUP_FIRST_ELEMENT (current_stmt_info);
2110         }
2111       else
2112         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2113
2114       lhs = gimple_assign_lhs (loop_use_stmt);
2115       current_stmt = loop_use_stmt;
2116       size++;
2117    }
2118
2119   if (!found || loop_use_stmt != phi || size < 2)
2120     return false;
2121
2122   /* Swap the operands, if needed, to make the reduction operand be the second
2123      operand.  */
2124   lhs = PHI_RESULT (phi);
2125   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2126   while (next_stmt)
2127     {
2128       if (gimple_assign_rhs2 (next_stmt) == lhs)
2129         {
2130           tree op = gimple_assign_rhs1 (next_stmt);
2131           gimple def_stmt = NULL;
2132
2133           if (TREE_CODE (op) == SSA_NAME)
2134             def_stmt = SSA_NAME_DEF_STMT (op);
2135
2136           /* Check that the other def is either defined in the loop
2137              ("vect_internal_def"), or it's an induction (defined by a
2138              loop-header phi-node).  */
2139           if (def_stmt
2140               && gimple_bb (def_stmt)
2141               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2142               && (is_gimple_assign (def_stmt)
2143                   || is_gimple_call (def_stmt)
2144                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2145                            == vect_induction_def
2146                   || (gimple_code (def_stmt) == GIMPLE_PHI
2147                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2148                                   == vect_internal_def
2149                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2150             {
2151               lhs = gimple_assign_lhs (next_stmt);
2152               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2153               continue;
2154             }
2155
2156           return false;
2157         }
2158       else
2159         {
2160           tree op = gimple_assign_rhs2 (next_stmt);
2161           gimple def_stmt = NULL;
2162
2163           if (TREE_CODE (op) == SSA_NAME)
2164             def_stmt = SSA_NAME_DEF_STMT (op);
2165
2166           /* Check that the other def is either defined in the loop
2167             ("vect_internal_def"), or it's an induction (defined by a
2168             loop-header phi-node).  */
2169           if (def_stmt
2170               && gimple_bb (def_stmt)
2171               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2172               && (is_gimple_assign (def_stmt)
2173                   || is_gimple_call (def_stmt)
2174                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2175                               == vect_induction_def
2176                   || (gimple_code (def_stmt) == GIMPLE_PHI
2177                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2178                                   == vect_internal_def
2179                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2180             {
2181               if (dump_enabled_p ())
2182                 {
2183                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2184                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2185                   dump_printf (MSG_NOTE, "\n");
2186                 }
2187
2188               swap_ssa_operands (next_stmt,
2189                                  gimple_assign_rhs1_ptr (next_stmt),
2190                                  gimple_assign_rhs2_ptr (next_stmt));
2191               update_stmt (next_stmt);
2192
2193               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2194                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2195             }
2196           else
2197             return false;
2198         }
2199
2200       lhs = gimple_assign_lhs (next_stmt);
2201       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2202     }
2203
2204   /* Save the chain for further analysis in SLP detection.  */
2205   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2206   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2207   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2208
2209   return true;
2210 }
2211
2212
2213 /* Function vect_is_simple_reduction_1
2214
2215    (1) Detect a cross-iteration def-use cycle that represents a simple
2216    reduction computation.  We look for the following pattern:
2217
2218    loop_header:
2219      a1 = phi < a0, a2 >
2220      a3 = ...
2221      a2 = operation (a3, a1)
2222
2223    or
2224
2225    a3 = ...
2226    loop_header:
2227      a1 = phi < a0, a2 >
2228      a2 = operation (a3, a1)
2229
2230    such that:
2231    1. operation is commutative and associative and it is safe to
2232       change the order of the computation (if CHECK_REDUCTION is true)
2233    2. no uses for a2 in the loop (a2 is used out of the loop)
2234    3. no uses of a1 in the loop besides the reduction operation
2235    4. no uses of a1 outside the loop.
2236
2237    Conditions 1,4 are tested here.
2238    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2239
2240    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2241    nested cycles, if CHECK_REDUCTION is false.
2242
2243    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2244    reductions:
2245
2246      a1 = phi < a0, a2 >
2247      inner loop (def of a3)
2248      a2 = phi < a3 >
2249
2250    If MODIFY is true it tries also to rework the code in-place to enable
2251    detection of more reduction patterns.  For the time being we rewrite
2252    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2253 */
2254
2255 static gimple
2256 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2257                             bool check_reduction, bool *double_reduc,
2258                             bool modify)
2259 {
2260   struct loop *loop = (gimple_bb (phi))->loop_father;
2261   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2262   edge latch_e = loop_latch_edge (loop);
2263   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2264   gimple def_stmt, def1 = NULL, def2 = NULL;
2265   enum tree_code orig_code, code;
2266   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2267   tree type;
2268   int nloop_uses;
2269   tree name;
2270   imm_use_iterator imm_iter;
2271   use_operand_p use_p;
2272   bool phi_def;
2273
2274   *double_reduc = false;
2275
2276   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2277      otherwise, we assume outer loop vectorization.  */
2278   gcc_assert ((check_reduction && loop == vect_loop)
2279               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2280
2281   name = PHI_RESULT (phi);
2282   /* ???  If there are no uses of the PHI result the inner loop reduction
2283      won't be detected as possibly double-reduction by vectorizable_reduction
2284      because that tries to walk the PHI arg from the preheader edge which
2285      can be constant.  See PR60382.  */
2286   if (has_zero_uses (name))
2287     return NULL;
2288   nloop_uses = 0;
2289   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2290     {
2291       gimple use_stmt = USE_STMT (use_p);
2292       if (is_gimple_debug (use_stmt))
2293         continue;
2294
2295       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2296         {
2297           if (dump_enabled_p ())
2298             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299                              "intermediate value used outside loop.\n");
2300
2301           return NULL;
2302         }
2303
2304       nloop_uses++;
2305       if (nloop_uses > 1)
2306         {
2307           if (dump_enabled_p ())
2308             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309                              "reduction used in loop.\n");
2310           return NULL;
2311         }
2312     }
2313
2314   if (TREE_CODE (loop_arg) != SSA_NAME)
2315     {
2316       if (dump_enabled_p ())
2317         {
2318           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319                            "reduction: not ssa_name: ");
2320           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2321           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2322         }
2323       return NULL;
2324     }
2325
2326   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2327   if (!def_stmt)
2328     {
2329       if (dump_enabled_p ())
2330         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2331                          "reduction: no def_stmt.\n");
2332       return NULL;
2333     }
2334
2335   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2336     {
2337       if (dump_enabled_p ())
2338         {
2339           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2340           dump_printf (MSG_NOTE, "\n");
2341         }
2342       return NULL;
2343     }
2344
2345   if (is_gimple_assign (def_stmt))
2346     {
2347       name = gimple_assign_lhs (def_stmt);
2348       phi_def = false;
2349     }
2350   else
2351     {
2352       name = PHI_RESULT (def_stmt);
2353       phi_def = true;
2354     }
2355
2356   nloop_uses = 0;
2357   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2358     {
2359       gimple use_stmt = USE_STMT (use_p);
2360       if (is_gimple_debug (use_stmt))
2361         continue;
2362       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2363         nloop_uses++;
2364       if (nloop_uses > 1)
2365         {
2366           if (dump_enabled_p ())
2367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368                              "reduction used in loop.\n");
2369           return NULL;
2370         }
2371     }
2372
2373   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2374      defined in the inner loop.  */
2375   if (phi_def)
2376     {
2377       op1 = PHI_ARG_DEF (def_stmt, 0);
2378
2379       if (gimple_phi_num_args (def_stmt) != 1
2380           || TREE_CODE (op1) != SSA_NAME)
2381         {
2382           if (dump_enabled_p ())
2383             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2384                              "unsupported phi node definition.\n");
2385
2386           return NULL;
2387         }
2388
2389       def1 = SSA_NAME_DEF_STMT (op1);
2390       if (gimple_bb (def1)
2391           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2392           && loop->inner
2393           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2394           && is_gimple_assign (def1))
2395         {
2396           if (dump_enabled_p ())
2397             report_vect_op (MSG_NOTE, def_stmt,
2398                             "detected double reduction: ");
2399
2400           *double_reduc = true;
2401           return def_stmt;
2402         }
2403
2404       return NULL;
2405     }
2406
2407   code = orig_code = gimple_assign_rhs_code (def_stmt);
2408
2409   /* We can handle "res -= x[i]", which is non-associative by
2410      simply rewriting this into "res += -x[i]".  Avoid changing
2411      gimple instruction for the first simple tests and only do this
2412      if we're allowed to change code at all.  */
2413   if (code == MINUS_EXPR
2414       && modify
2415       && (op1 = gimple_assign_rhs1 (def_stmt))
2416       && TREE_CODE (op1) == SSA_NAME
2417       && SSA_NAME_DEF_STMT (op1) == phi)
2418     code = PLUS_EXPR;
2419
2420   if (check_reduction
2421       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2422     {
2423       if (dump_enabled_p ())
2424         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2425                         "reduction: not commutative/associative: ");
2426       return NULL;
2427     }
2428
2429   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2430     {
2431       if (code != COND_EXPR)
2432         {
2433           if (dump_enabled_p ())
2434             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2435                             "reduction: not binary operation: ");
2436
2437           return NULL;
2438         }
2439
2440       op3 = gimple_assign_rhs1 (def_stmt);
2441       if (COMPARISON_CLASS_P (op3))
2442         {
2443           op4 = TREE_OPERAND (op3, 1);
2444           op3 = TREE_OPERAND (op3, 0);
2445         }
2446
2447       op1 = gimple_assign_rhs2 (def_stmt);
2448       op2 = gimple_assign_rhs3 (def_stmt);
2449
2450       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2451         {
2452           if (dump_enabled_p ())
2453             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2454                             "reduction: uses not ssa_names: ");
2455
2456           return NULL;
2457         }
2458     }
2459   else
2460     {
2461       op1 = gimple_assign_rhs1 (def_stmt);
2462       op2 = gimple_assign_rhs2 (def_stmt);
2463
2464       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2465         {
2466           if (dump_enabled_p ())
2467             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2468                             "reduction: uses not ssa_names: ");
2469
2470           return NULL;
2471         }
2472    }
2473
2474   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2475   if ((TREE_CODE (op1) == SSA_NAME
2476        && !types_compatible_p (type,TREE_TYPE (op1)))
2477       || (TREE_CODE (op2) == SSA_NAME
2478           && !types_compatible_p (type, TREE_TYPE (op2)))
2479       || (op3 && TREE_CODE (op3) == SSA_NAME
2480           && !types_compatible_p (type, TREE_TYPE (op3)))
2481       || (op4 && TREE_CODE (op4) == SSA_NAME
2482           && !types_compatible_p (type, TREE_TYPE (op4))))
2483     {
2484       if (dump_enabled_p ())
2485         {
2486           dump_printf_loc (MSG_NOTE, vect_location,
2487                            "reduction: multiple types: operation type: ");
2488           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2489           dump_printf (MSG_NOTE, ", operands types: ");
2490           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2491                              TREE_TYPE (op1));
2492           dump_printf (MSG_NOTE, ",");
2493           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2494                              TREE_TYPE (op2));
2495           if (op3)
2496             {
2497               dump_printf (MSG_NOTE, ",");
2498               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2499                                  TREE_TYPE (op3));
2500             }
2501
2502           if (op4)
2503             {
2504               dump_printf (MSG_NOTE, ",");
2505               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2506                                  TREE_TYPE (op4));
2507             }
2508           dump_printf (MSG_NOTE, "\n");
2509         }
2510
2511       return NULL;
2512     }
2513
2514   /* Check that it's ok to change the order of the computation.
2515      Generally, when vectorizing a reduction we change the order of the
2516      computation.  This may change the behavior of the program in some
2517      cases, so we need to check that this is ok.  One exception is when
2518      vectorizing an outer-loop: the inner-loop is executed sequentially,
2519      and therefore vectorizing reductions in the inner-loop during
2520      outer-loop vectorization is safe.  */
2521
2522   /* CHECKME: check for !flag_finite_math_only too?  */
2523   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2524       && check_reduction)
2525     {
2526       /* Changing the order of operations changes the semantics.  */
2527       if (dump_enabled_p ())
2528         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2529                         "reduction: unsafe fp math optimization: ");
2530       return NULL;
2531     }
2532   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2533            && check_reduction)
2534     {
2535       /* Changing the order of operations changes the semantics.  */
2536       if (dump_enabled_p ())
2537         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2538                         "reduction: unsafe int math optimization: ");
2539       return NULL;
2540     }
2541   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2542     {
2543       /* Changing the order of operations changes the semantics.  */
2544       if (dump_enabled_p ())
2545         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2546                         "reduction: unsafe fixed-point math optimization: ");
2547       return NULL;
2548     }
2549
2550   /* If we detected "res -= x[i]" earlier, rewrite it into
2551      "res += -x[i]" now.  If this turns out to be useless reassoc
2552      will clean it up again.  */
2553   if (orig_code == MINUS_EXPR)
2554     {
2555       tree rhs = gimple_assign_rhs2 (def_stmt);
2556       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2557       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2558       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2559       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2560                                                           loop_info, NULL));
2561       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2562       gimple_assign_set_rhs2 (def_stmt, negrhs);
2563       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2564       update_stmt (def_stmt);
2565     }
2566
2567   /* Reduction is safe. We're dealing with one of the following:
2568      1) integer arithmetic and no trapv
2569      2) floating point arithmetic, and special flags permit this optimization
2570      3) nested cycle (i.e., outer loop vectorization).  */
2571   if (TREE_CODE (op1) == SSA_NAME)
2572     def1 = SSA_NAME_DEF_STMT (op1);
2573
2574   if (TREE_CODE (op2) == SSA_NAME)
2575     def2 = SSA_NAME_DEF_STMT (op2);
2576
2577   if (code != COND_EXPR
2578       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2579     {
2580       if (dump_enabled_p ())
2581         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2582       return NULL;
2583     }
2584
2585   /* Check that one def is the reduction def, defined by PHI,
2586      the other def is either defined in the loop ("vect_internal_def"),
2587      or it's an induction (defined by a loop-header phi-node).  */
2588
2589   if (def2 && def2 == phi
2590       && (code == COND_EXPR
2591           || !def1 || gimple_nop_p (def1)
2592           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2593           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2594               && (is_gimple_assign (def1)
2595                   || is_gimple_call (def1)
2596                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2597                       == vect_induction_def
2598                   || (gimple_code (def1) == GIMPLE_PHI
2599                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2600                           == vect_internal_def
2601                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2602     {
2603       if (dump_enabled_p ())
2604         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2605       return def_stmt;
2606     }
2607
2608   if (def1 && def1 == phi
2609       && (code == COND_EXPR
2610           || !def2 || gimple_nop_p (def2)
2611           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2612           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2613               && (is_gimple_assign (def2)
2614                   || is_gimple_call (def2)
2615                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2616                       == vect_induction_def
2617                   || (gimple_code (def2) == GIMPLE_PHI
2618                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2619                           == vect_internal_def
2620                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2621     {
2622       if (check_reduction)
2623         {
2624           /* Swap operands (just for simplicity - so that the rest of the code
2625              can assume that the reduction variable is always the last (second)
2626              argument).  */
2627           if (dump_enabled_p ())
2628             report_vect_op (MSG_NOTE, def_stmt,
2629                             "detected reduction: need to swap operands: ");
2630
2631           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2632                              gimple_assign_rhs2_ptr (def_stmt));
2633
2634           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2635             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2636         }
2637       else
2638         {
2639           if (dump_enabled_p ())
2640             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2641         }
2642
2643       return def_stmt;
2644     }
2645
2646   /* Try to find SLP reduction chain.  */
2647   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2648     {
2649       if (dump_enabled_p ())
2650         report_vect_op (MSG_NOTE, def_stmt,
2651                         "reduction: detected reduction chain: ");
2652
2653       return def_stmt;
2654     }
2655
2656   if (dump_enabled_p ())
2657     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2658                     "reduction: unknown pattern: ");
2659
2660   return NULL;
2661 }
2662
2663 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2664    in-place.  Arguments as there.  */
2665
2666 static gimple
2667 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2668                           bool check_reduction, bool *double_reduc)
2669 {
2670   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2671                                      double_reduc, false);
2672 }
2673
2674 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2675    in-place if it enables detection of more reductions.  Arguments
2676    as there.  */
2677
2678 gimple
2679 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2680                           bool check_reduction, bool *double_reduc)
2681 {
2682   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2683                                      double_reduc, true);
2684 }
2685
2686 /* Calculate the cost of one scalar iteration of the loop.  */
2687 int
2688 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2689                                        stmt_vector_for_cost *scalar_cost_vec)
2690 {
2691   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2692   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2693   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2694   int innerloop_iters, i;
2695
2696   /* Count statements in scalar loop.  Using this as scalar cost for a single
2697      iteration for now.
2698
2699      TODO: Add outer loop support.
2700
2701      TODO: Consider assigning different costs to different scalar
2702      statements.  */
2703
2704   /* FORNOW.  */
2705   innerloop_iters = 1;
2706   if (loop->inner)
2707     innerloop_iters = 50; /* FIXME */
2708
2709   for (i = 0; i < nbbs; i++)
2710     {
2711       gimple_stmt_iterator si;
2712       basic_block bb = bbs[i];
2713
2714       if (bb->loop_father == loop->inner)
2715         factor = innerloop_iters;
2716       else
2717         factor = 1;
2718
2719       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2720         {
2721           gimple stmt = gsi_stmt (si);
2722           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2723
2724           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2725             continue;
2726
2727           /* Skip stmts that are not vectorized inside the loop.  */
2728           if (stmt_info
2729               && !STMT_VINFO_RELEVANT_P (stmt_info)
2730               && (!STMT_VINFO_LIVE_P (stmt_info)
2731                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2732               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2733             continue;
2734
2735           vect_cost_for_stmt kind;
2736           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2737             {
2738               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2739                kind = scalar_load;
2740              else
2741                kind = scalar_store;
2742             }
2743           else
2744             kind = scalar_stmt;
2745
2746           scalar_single_iter_cost
2747             += record_stmt_cost (scalar_cost_vec, factor, kind,
2748                                  NULL, 0, vect_prologue);
2749         }
2750     }
2751   return scalar_single_iter_cost;
2752 }
2753
2754 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2755 int
2756 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2757                              int *peel_iters_epilogue,
2758                              stmt_vector_for_cost *scalar_cost_vec,
2759                              stmt_vector_for_cost *prologue_cost_vec,
2760                              stmt_vector_for_cost *epilogue_cost_vec)
2761 {
2762   int retval = 0;
2763   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2764
2765   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2766     {
2767       *peel_iters_epilogue = vf/2;
2768       if (dump_enabled_p ())
2769         dump_printf_loc (MSG_NOTE, vect_location,
2770                          "cost model: epilogue peel iters set to vf/2 "
2771                          "because loop iterations are unknown .\n");
2772
2773       /* If peeled iterations are known but number of scalar loop
2774          iterations are unknown, count a taken branch per peeled loop.  */
2775       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2776                                  NULL, 0, vect_prologue);
2777       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2778                                  NULL, 0, vect_epilogue);
2779     }
2780   else
2781     {
2782       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2783       peel_iters_prologue = niters < peel_iters_prologue ?
2784                             niters : peel_iters_prologue;
2785       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2786       /* If we need to peel for gaps, but no peeling is required, we have to
2787          peel VF iterations.  */
2788       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2789         *peel_iters_epilogue = vf;
2790     }
2791
2792   stmt_info_for_cost *si;
2793   int j;
2794   if (peel_iters_prologue)
2795     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2796       retval += record_stmt_cost (prologue_cost_vec,
2797                                   si->count * peel_iters_prologue,
2798                                   si->kind, NULL, si->misalign,
2799                                   vect_prologue);
2800   if (*peel_iters_epilogue)
2801     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2802       retval += record_stmt_cost (epilogue_cost_vec,
2803                                   si->count * *peel_iters_epilogue,
2804                                   si->kind, NULL, si->misalign,
2805                                   vect_epilogue);
2806
2807   return retval;
2808 }
2809
2810 /* Function vect_estimate_min_profitable_iters
2811
2812    Return the number of iterations required for the vector version of the
2813    loop to be profitable relative to the cost of the scalar version of the
2814    loop.  */
2815
2816 static void
2817 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2818                                     int *ret_min_profitable_niters,
2819                                     int *ret_min_profitable_estimate)
2820 {
2821   int min_profitable_iters;
2822   int min_profitable_estimate;
2823   int peel_iters_prologue;
2824   int peel_iters_epilogue;
2825   unsigned vec_inside_cost = 0;
2826   int vec_outside_cost = 0;
2827   unsigned vec_prologue_cost = 0;
2828   unsigned vec_epilogue_cost = 0;
2829   int scalar_single_iter_cost = 0;
2830   int scalar_outside_cost = 0;
2831   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2832   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2833   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2834
2835   /* Cost model disabled.  */
2836   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2837     {
2838       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2839       *ret_min_profitable_niters = 0;
2840       *ret_min_profitable_estimate = 0;
2841       return;
2842     }
2843
2844   /* Requires loop versioning tests to handle misalignment.  */
2845   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2846     {
2847       /*  FIXME: Make cost depend on complexity of individual check.  */
2848       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2849       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2850                             vect_prologue);
2851       dump_printf (MSG_NOTE,
2852                    "cost model: Adding cost of checks for loop "
2853                    "versioning to treat misalignment.\n");
2854     }
2855
2856   /* Requires loop versioning with alias checks.  */
2857   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2858     {
2859       /*  FIXME: Make cost depend on complexity of individual check.  */
2860       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2861       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2862                             vect_prologue);
2863       dump_printf (MSG_NOTE,
2864                    "cost model: Adding cost of checks for loop "
2865                    "versioning aliasing.\n");
2866     }
2867
2868   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2869       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2870     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2871                           vect_prologue);
2872
2873   /* Count statements in scalar loop.  Using this as scalar cost for a single
2874      iteration for now.
2875
2876      TODO: Add outer loop support.
2877
2878      TODO: Consider assigning different costs to different scalar
2879      statements.  */
2880
2881   auto_vec<stmt_info_for_cost> scalar_cost_vec;
2882   scalar_single_iter_cost
2883      = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2884
2885   /* Add additional cost for the peeled instructions in prologue and epilogue
2886      loop.
2887
2888      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2889      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2890
2891      TODO: Build an expression that represents peel_iters for prologue and
2892      epilogue to be used in a run-time test.  */
2893
2894   if (npeel  < 0)
2895     {
2896       peel_iters_prologue = vf/2;
2897       dump_printf (MSG_NOTE, "cost model: "
2898                    "prologue peel iters set to vf/2.\n");
2899
2900       /* If peeling for alignment is unknown, loop bound of main loop becomes
2901          unknown.  */
2902       peel_iters_epilogue = vf/2;
2903       dump_printf (MSG_NOTE, "cost model: "
2904                    "epilogue peel iters set to vf/2 because "
2905                    "peeling for alignment is unknown.\n");
2906
2907       /* If peeled iterations are unknown, count a taken branch and a not taken
2908          branch per peeled loop. Even if scalar loop iterations are known,
2909          vector iterations are not known since peeled prologue iterations are
2910          not known. Hence guards remain the same.  */
2911       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2912                             NULL, 0, vect_prologue);
2913       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2914                             NULL, 0, vect_prologue);
2915       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2916                             NULL, 0, vect_epilogue);
2917       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2918                             NULL, 0, vect_epilogue);
2919       stmt_info_for_cost *si;
2920       int j;
2921       FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2922         {
2923           struct _stmt_vec_info *stmt_info
2924             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2925           (void) add_stmt_cost (target_cost_data,
2926                                 si->count * peel_iters_prologue,
2927                                 si->kind, stmt_info, si->misalign,
2928                                 vect_prologue);
2929           (void) add_stmt_cost (target_cost_data,
2930                                 si->count * peel_iters_epilogue,
2931                                 si->kind, stmt_info, si->misalign,
2932                                 vect_epilogue);
2933         }
2934     }
2935   else
2936     {
2937       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2938       stmt_info_for_cost *si;
2939       int j;
2940       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2941
2942       prologue_cost_vec.create (2);
2943       epilogue_cost_vec.create (2);
2944       peel_iters_prologue = npeel;
2945
2946       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2947                                           &peel_iters_epilogue,
2948                                           &scalar_cost_vec,
2949                                           &prologue_cost_vec,
2950                                           &epilogue_cost_vec);
2951
2952       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2953         {
2954           struct _stmt_vec_info *stmt_info
2955             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2956           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2957                                 si->misalign, vect_prologue);
2958         }
2959
2960       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2961         {
2962           struct _stmt_vec_info *stmt_info
2963             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2964           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2965                                 si->misalign, vect_epilogue);
2966         }
2967
2968       prologue_cost_vec.release ();
2969       epilogue_cost_vec.release ();
2970     }
2971
2972   /* FORNOW: The scalar outside cost is incremented in one of the
2973      following ways:
2974
2975      1. The vectorizer checks for alignment and aliasing and generates
2976      a condition that allows dynamic vectorization.  A cost model
2977      check is ANDED with the versioning condition.  Hence scalar code
2978      path now has the added cost of the versioning check.
2979
2980        if (cost > th & versioning_check)
2981          jmp to vector code
2982
2983      Hence run-time scalar is incremented by not-taken branch cost.
2984
2985      2. The vectorizer then checks if a prologue is required.  If the
2986      cost model check was not done before during versioning, it has to
2987      be done before the prologue check.
2988
2989        if (cost <= th)
2990          prologue = scalar_iters
2991        if (prologue == 0)
2992          jmp to vector code
2993        else
2994          execute prologue
2995        if (prologue == num_iters)
2996          go to exit
2997
2998      Hence the run-time scalar cost is incremented by a taken branch,
2999      plus a not-taken branch, plus a taken branch cost.
3000
3001      3. The vectorizer then checks if an epilogue is required.  If the
3002      cost model check was not done before during prologue check, it
3003      has to be done with the epilogue check.
3004
3005        if (prologue == 0)
3006          jmp to vector code
3007        else
3008          execute prologue
3009        if (prologue == num_iters)
3010          go to exit
3011        vector code:
3012          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3013            jmp to epilogue
3014
3015      Hence the run-time scalar cost should be incremented by 2 taken
3016      branches.
3017
3018      TODO: The back end may reorder the BBS's differently and reverse
3019      conditions/branch directions.  Change the estimates below to
3020      something more reasonable.  */
3021
3022   /* If the number of iterations is known and we do not do versioning, we can
3023      decide whether to vectorize at compile time.  Hence the scalar version
3024      do not carry cost model guard costs.  */
3025   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3026       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3027       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3028     {
3029       /* Cost model check occurs at versioning.  */
3030       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3031           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3032         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3033       else
3034         {
3035           /* Cost model check occurs at prologue generation.  */
3036           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3037             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3038               + vect_get_stmt_cost (cond_branch_not_taken);
3039           /* Cost model check occurs at epilogue generation.  */
3040           else
3041             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3042         }
3043     }
3044
3045   /* Complete the target-specific cost calculations.  */
3046   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3047                &vec_inside_cost, &vec_epilogue_cost);
3048
3049   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3050
3051   if (dump_enabled_p ())
3052     {
3053       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3054       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3055                    vec_inside_cost);
3056       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3057                    vec_prologue_cost);
3058       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3059                    vec_epilogue_cost);
3060       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3061                    scalar_single_iter_cost);
3062       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3063                    scalar_outside_cost);
3064       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3065                    vec_outside_cost);
3066       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3067                    peel_iters_prologue);
3068       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3069                    peel_iters_epilogue);
3070     }
3071
3072   /* Calculate number of iterations required to make the vector version
3073      profitable, relative to the loop bodies only.  The following condition
3074      must hold true:
3075      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3076      where
3077      SIC = scalar iteration cost, VIC = vector iteration cost,
3078      VOC = vector outside cost, VF = vectorization factor,
3079      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3080      SOC = scalar outside cost for run time cost model check.  */
3081
3082   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3083     {
3084       if (vec_outside_cost <= 0)
3085         min_profitable_iters = 1;
3086       else
3087         {
3088           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3089                                   - vec_inside_cost * peel_iters_prologue
3090                                   - vec_inside_cost * peel_iters_epilogue)
3091                                  / ((scalar_single_iter_cost * vf)
3092                                     - vec_inside_cost);
3093
3094           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3095               <= (((int) vec_inside_cost * min_profitable_iters)
3096                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3097             min_profitable_iters++;
3098         }
3099     }
3100   /* vector version will never be profitable.  */
3101   else
3102     {
3103       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3104         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3105                     "did not happen for a simd loop");
3106
3107       if (dump_enabled_p ())
3108         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3109                          "cost model: the vector iteration cost = %d "
3110                          "divided by the scalar iteration cost = %d "
3111                          "is greater or equal to the vectorization factor = %d"
3112                          ".\n",
3113                          vec_inside_cost, scalar_single_iter_cost, vf);
3114       *ret_min_profitable_niters = -1;
3115       *ret_min_profitable_estimate = -1;
3116       return;
3117     }
3118
3119   dump_printf (MSG_NOTE,
3120                "  Calculated minimum iters for profitability: %d\n",
3121                min_profitable_iters);
3122
3123   min_profitable_iters =
3124         min_profitable_iters < vf ? vf : min_profitable_iters;
3125
3126   /* Because the condition we create is:
3127      if (niters <= min_profitable_iters)
3128        then skip the vectorized loop.  */
3129   min_profitable_iters--;
3130
3131   if (dump_enabled_p ())
3132     dump_printf_loc (MSG_NOTE, vect_location,
3133                      "  Runtime profitability threshold = %d\n",
3134                      min_profitable_iters);
3135
3136   *ret_min_profitable_niters = min_profitable_iters;
3137
3138   /* Calculate number of iterations required to make the vector version
3139      profitable, relative to the loop bodies only.
3140
3141      Non-vectorized variant is SIC * niters and it must win over vector
3142      variant on the expected loop trip count.  The following condition must hold true:
3143      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3144
3145   if (vec_outside_cost <= 0)
3146     min_profitable_estimate = 1;
3147   else
3148     {
3149       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3150                                  - vec_inside_cost * peel_iters_prologue
3151                                  - vec_inside_cost * peel_iters_epilogue)
3152                                  / ((scalar_single_iter_cost * vf)
3153                                    - vec_inside_cost);
3154     }
3155   min_profitable_estimate --;
3156   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3157   if (dump_enabled_p ())
3158     dump_printf_loc (MSG_NOTE, vect_location,
3159                      "  Static estimate profitability threshold = %d\n",
3160                       min_profitable_iters);
3161
3162   *ret_min_profitable_estimate = min_profitable_estimate;
3163 }
3164
3165 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3166    vector elements (not bits) for a vector of mode MODE.  */
3167 static void
3168 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3169                               unsigned char *sel)
3170 {
3171   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3172
3173   for (i = 0; i < nelt; i++)
3174     sel[i] = (i + offset) & (2*nelt - 1);
3175 }
3176
3177 /* Checks whether the target supports whole-vector shifts for vectors of mode
3178    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3179    it supports vec_perm_const with masks for all necessary shift amounts.  */
3180 static bool
3181 have_whole_vector_shift (enum machine_mode mode)
3182 {
3183   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3184     return true;
3185
3186   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3187     return false;
3188
3189   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3190   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3191
3192   for (i = nelt/2; i >= 1; i/=2)
3193     {
3194       calc_vec_perm_mask_for_shift (mode, i, sel);
3195       if (!can_vec_perm_p (mode, false, sel))
3196         return false;
3197     }
3198   return true;
3199 }
3200
3201 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3202
3203 static tree
3204 get_reduction_op (gimple stmt, int reduc_index)
3205 {
3206   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3207     {
3208     case GIMPLE_SINGLE_RHS:
3209       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3210                   == ternary_op);
3211       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3212     case GIMPLE_UNARY_RHS:
3213       return gimple_assign_rhs1 (stmt);
3214     case GIMPLE_BINARY_RHS:
3215       return (reduc_index
3216               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3217     case GIMPLE_TERNARY_RHS:
3218       return gimple_op (stmt, reduc_index + 1);
3219     default:
3220       gcc_unreachable ();
3221     }
3222 }
3223
3224 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3225    functions. Design better to avoid maintenance issues.  */
3226
3227 /* Function vect_model_reduction_cost.
3228
3229    Models cost for a reduction operation, including the vector ops
3230    generated within the strip-mine loop, the initial definition before
3231    the loop, and the epilogue code that must be generated.  */
3232
3233 static bool
3234 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3235                            int ncopies, int reduc_index)
3236 {
3237   int prologue_cost = 0, epilogue_cost = 0;
3238   enum tree_code code;
3239   optab optab;
3240   tree vectype;
3241   gimple stmt, orig_stmt;
3242   tree reduction_op;
3243   machine_mode mode;
3244   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3245   struct loop *loop = NULL;
3246   void *target_cost_data;
3247
3248   if (loop_vinfo)
3249     {
3250       loop = LOOP_VINFO_LOOP (loop_vinfo);
3251       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3252     }
3253   else
3254     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3255
3256   /* Cost of reduction op inside loop.  */
3257   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3258                                         stmt_info, 0, vect_body);
3259   stmt = STMT_VINFO_STMT (stmt_info);
3260
3261   reduction_op = get_reduction_op (stmt, reduc_index);
3262
3263   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3264   if (!vectype)
3265     {
3266       if (dump_enabled_p ())
3267         {
3268           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3269                            "unsupported data-type ");
3270           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3271                              TREE_TYPE (reduction_op));
3272           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3273         }
3274       return false;
3275    }
3276
3277   mode = TYPE_MODE (vectype);
3278   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3279
3280   if (!orig_stmt)
3281     orig_stmt = STMT_VINFO_STMT (stmt_info);
3282
3283   code = gimple_assign_rhs_code (orig_stmt);
3284
3285   /* Add in cost for initial definition.  */
3286   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3287                                   stmt_info, 0, vect_prologue);
3288
3289   /* Determine cost of epilogue code.
3290
3291      We have a reduction operator that will reduce the vector in one statement.
3292      Also requires scalar extract.  */
3293
3294   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3295     {
3296       if (reduc_code != ERROR_MARK)
3297         {
3298           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3299                                           stmt_info, 0, vect_epilogue);
3300           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3301                                           stmt_info, 0, vect_epilogue);
3302         }
3303       else
3304         {
3305           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3306           tree bitsize =
3307             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3308           int element_bitsize = tree_to_uhwi (bitsize);
3309           int nelements = vec_size_in_bits / element_bitsize;
3310
3311           optab = optab_for_tree_code (code, vectype, optab_default);
3312
3313           /* We have a whole vector shift available.  */
3314           if (VECTOR_MODE_P (mode)
3315               && optab_handler (optab, mode) != CODE_FOR_nothing
3316               && have_whole_vector_shift (mode))
3317             {
3318               /* Final reduction via vector shifts and the reduction operator.
3319                  Also requires scalar extract.  */
3320               epilogue_cost += add_stmt_cost (target_cost_data,
3321                                               exact_log2 (nelements) * 2,
3322                                               vector_stmt, stmt_info, 0,
3323                                               vect_epilogue);
3324               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3325                                               vec_to_scalar, stmt_info, 0,
3326                                               vect_epilogue);
3327             }
3328           else
3329             /* Use extracts and reduction op for final reduction.  For N
3330                elements, we have N extracts and N-1 reduction ops.  */
3331             epilogue_cost += add_stmt_cost (target_cost_data,
3332                                             nelements + nelements - 1,
3333                                             vector_stmt, stmt_info, 0,
3334                                             vect_epilogue);
3335         }
3336     }
3337
3338   if (dump_enabled_p ())
3339     dump_printf (MSG_NOTE,
3340                  "vect_model_reduction_cost: inside_cost = %d, "
3341                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3342                  prologue_cost, epilogue_cost);
3343
3344   return true;
3345 }
3346
3347
3348 /* Function vect_model_induction_cost.
3349
3350    Models cost for induction operations.  */
3351
3352 static void
3353 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3354 {
3355   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3356   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3357   unsigned inside_cost, prologue_cost;
3358
3359   /* loop cost for vec_loop.  */
3360   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3361                                stmt_info, 0, vect_body);
3362
3363   /* prologue cost for vec_init and vec_step.  */
3364   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3365                                  stmt_info, 0, vect_prologue);
3366
3367   if (dump_enabled_p ())
3368     dump_printf_loc (MSG_NOTE, vect_location,
3369                      "vect_model_induction_cost: inside_cost = %d, "
3370                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3371 }
3372
3373
/* Function get_initial_def_for_induction

   Input:
   IV_PHI - the phi node of the scalar induction variable.  Its argument on
     the preheader edge is the initial value of the induction and its
     argument on the latch edge is the updated value.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with initial value 'X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].

   Side effects:
   The function builds the whole vector induction cycle: vec_init and
   vec_step before the loop, a new vector phi in the header of the loop
   that contains IV_PHI, and the in-loop update vec_iv + vec_step.  When
   VF is larger than the number of vector elements it also emits the extra
   copies and links them through STMT_VINFO_RELATED_STMT.  The new phi is
   recorded as STMT_VINFO_VEC_STMT of IV_PHI.  */

static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree new_var;
  tree new_name;
  gimple init_stmt, new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  int ncopies;
  tree expr;
  /* Same stmt_vec_info as STMT_VINFO above; both come from IV_PHI.  */
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);
  tree stepvectype;
  tree resvectype;

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  /* LOOP_ARG is the scalar value IV_PHI receives over the latch edge; it
     is only used further down to find the exit phi in the nested case.  */
  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
  gcc_assert (step_expr != NULL_TREE);

  /* From here on PE is the preheader edge of IV_LOOP, which replaces the
     edge of LOOP computed in the declarations above.  */
  pe = loop_preheader_edge (iv_loop);
  init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
                                     loop_preheader_edge (iv_loop));

  /* VECTYPE (derived from the initial value) is the type the induction is
     computed in.  RESVECTYPE (derived from the phi result) is the type of
     the def handed back; a VIEW_CONVERT_EXPR is emitted when they differ.  */
  vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
  resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  /* NOTE(review): PHI_INFO has already been used above to read the
     evolution part, so this assert comes too late to guard that use.  */
  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Convert the step to the desired type.  */
  step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
                                                  step_expr),
                                    &stmts, true, NULL_TREE);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts.  We obtain it
         from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
      /* If the initial value is not of proper type, convert it.  */
      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
        {
          new_stmt
            = gimple_build_assign (vect_get_new_vect_var (vectype,
                                                          vect_simple_var,
                                                          "vec_iv_"),
                                   VIEW_CONVERT_EXPR,
                                   build1 (VIEW_CONVERT_EXPR, vectype,
                                           vec_init));
          vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_init);
          new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
                                                 new_stmt);
          gcc_assert (!new_bb);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
        }
    }
  else
    {
      vec<constructor_elt, va_gc> *v;

      /* iv_loop is the loop to be vectorized. Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
                                       vect_scalar_var, "var_");
      new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
                                                     init_expr),
                                       &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      vec_alloc (v, nunits);
      /* CONSTANT_P stays true only while every element folds to an
         invariant; it selects between a vector constant and a CONSTRUCTOR
         below.  */
      bool constant_p = is_gimple_min_invariant (new_name);
      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
          new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
                                  new_name, step_expr);
          if (!is_gimple_min_invariant (new_name))
            {
              /* The sum did not fold to an invariant, so materialize it as
                 a statement on the preheader edge.  */
              init_stmt = gimple_build_assign (new_var, new_name);
              new_name = make_ssa_name (new_var, init_stmt);
              gimple_assign_set_lhs (init_stmt, new_name);
              new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
              gcc_assert (!new_bb);
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "created new init_stmt: ");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
                  dump_printf (MSG_NOTE, "\n");
                }
              constant_p = false;
            }
          CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      if (constant_p)
        new_vec = build_vector_from_ctor (vectype, v);
      else
        new_vec = build_constructor (vectype, v);
      vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
    }


  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized. Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized. Generate:
          vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
        {
          /* For a floating-point step, build VF as an integer constant
             and convert it to the step type.  */
          expr = build_int_cst (integer_type_node, vf);
          expr = fold_convert (TREE_TYPE (step_expr), expr);
        }
      else
        expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
      if (TREE_CODE (step_expr) == SSA_NAME)
        new_name = vect_init_vector (iv_phi, new_name,
                                     TREE_TYPE (step_expr), NULL);
    }

  /* Broadcast the scalar step into every element of a vector.  */
  t = unshare_expr (new_name);
  gcc_assert (CONSTANT_CLASS_P (new_name)
              || TREE_CODE (new_name) == SSA_NAME);
  stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
  gcc_assert (stepvectype);
  new_vec = build_vector_from_val (stepvectype, t);
  vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);


  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
                                                   NULL));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
               UNKNOWN_LOCATION);


  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  Between
         consecutive copies the step is NUNITS*S rather than VF*S.  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
        {
          expr = build_int_cst (integer_type_node, nunits);
          expr = fold_convert (TREE_TYPE (step_expr), expr);
        }
      else
        expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
      if (TREE_CODE (step_expr) == SSA_NAME)
        new_name = vect_init_vector (iv_phi, new_name,
                                     TREE_TYPE (step_expr), NULL);
      t = unshare_expr (new_name);
      gcc_assert (CONSTANT_CLASS_P (new_name)
                  || TREE_CODE (new_name) == SSA_NAME);
      new_vec = build_vector_from_val (stepvectype, t);
      vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);

      /* Start the chain of copies at the phi result; each copy is linked
         to its predecessor through STMT_VINFO_RELATED_STMT.  */
      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
                                          vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          /* When the result type differs, the statement recorded for this
             copy is the conversion rather than the addition.  */
          if (!useless_type_conversion_p (resvectype, vectype))
            {
              new_stmt
                = gimple_build_assign
                        (vect_get_new_vect_var (resvectype, vect_simple_var,
                                                "vec_iv_"),
                         VIEW_CONVERT_EXPR,
                         build1 (VIEW_CONVERT_EXPR, resvectype,
                                 gimple_assign_lhs (new_stmt)));
              gimple_assign_set_lhs (new_stmt,
                                     make_ssa_name
                                       (gimple_assign_lhs (new_stmt), new_stmt));
              gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
            }
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          gimple use_stmt = USE_STMT (use_p);
          if (is_gimple_debug (use_stmt))
            continue;

          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
            {
              exit_phi = use_stmt;
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop induction
             is not used in the outer-loop (i.e. only outside the outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          /* NEW_STMT is the in-loop update vec_iv + vec_step here: the
             ncopies > 1 path above asserts that we are not nested.  */
          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "vector of inductions after inner-loop:");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
              dump_printf (MSG_NOTE, "\n");
            }
        }
    }


  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "transform induction: created def-use cycle: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
      dump_printf (MSG_NOTE, "\n");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                        SSA_NAME_DEF_STMT (vec_def), 0);
      dump_printf (MSG_NOTE, "\n");
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  /* If the phi result needs a different vector type, return a converted
     copy of the phi def instead.  The conversion is placed at the start of
     BB and inherits the related-stmt link of the induction phi.  */
  if (!useless_type_conversion_p (resvectype, vectype))
    {
      new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
                                                             vect_simple_var,
                                                             "vec_iv_"),
                                      VIEW_CONVERT_EXPR,
                                      build1 (VIEW_CONVERT_EXPR, resvectype,
                                              induc_def));
      induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
      gimple_assign_set_lhs (new_stmt, induc_def);
      si = gsi_after_labels (bb);
      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
      set_vinfo_for_stmt (new_stmt,
                          new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
      STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
        = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
    }

  return induc_def;
}
3737
3738
3739 /* Function get_initial_def_for_reduction
3740
3741    Input:
3742    STMT - a stmt that performs a reduction operation in the loop.
3743    INIT_VAL - the initial value of the reduction variable
3744
3745    Output:
3746    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3747         of the reduction (used for adjusting the epilog - see below).
3748    Return a vector variable, initialized according to the operation that STMT
3749         performs. This vector will be used as the initial value of the
3750         vector of partial results.
3751
3752    Option1 (adjust in epilog): Initialize the vector as follows:
3753      add/bit or/xor:    [0,0,...,0,0]
3754      mult/bit and:      [1,1,...,1,1]
3755      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3756    and when necessary (e.g. add/mult case) let the caller know
3757    that it needs to adjust the result by init_val.
3758
3759    Option2: Initialize the vector as follows:
3760      add/bit or/xor:    [init_val,0,0,...,0]
3761      mult/bit and:      [init_val,1,1,...,1]
3762      min/max/cond_expr: [init_val,init_val,...,init_val]
3763    and no adjustments are needed.
3764
3765    For example, for the following code:
3766
3767    s = init_val;
3768    for (i=0;i<n;i++)
3769      s = s + a[i];
3770
3771    STMT is 's = s + a[i]', and the reduction variable is 's'.
3772    For a vector of 4 units, we want to return either [0,0,0,init_val],
3773    or [0,0,0,0] and let the caller know that it needs to adjust
3774    the result at the end by 'init_val'.
3775
3776    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3777    initialization vector is simpler (same element in all entries), if
3778    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3779
3780    A cost model should help decide between these two schemes.  */
3781
3782 tree
3783 get_initial_def_for_reduction (gimple stmt, tree init_val,
3784                                tree *adjustment_def)
3785 {
3786   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3787   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3788   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3789   tree scalar_type = TREE_TYPE (init_val);
3790   tree vectype = get_vectype_for_scalar_type (scalar_type);
3791   int nunits;
3792   enum tree_code code = gimple_assign_rhs_code (stmt);
3793   tree def_for_init;
3794   tree init_def;
3795   tree *elts;
3796   int i;
3797   bool nested_in_vect_loop = false;
3798   tree init_value;
3799   REAL_VALUE_TYPE real_init_val = dconst0;
3800   int int_init_val = 0;
3801   gimple def_stmt = NULL;
3802
3803   gcc_assert (vectype);
3804   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3805
3806   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3807               || SCALAR_FLOAT_TYPE_P (scalar_type));
3808
3809   if (nested_in_vect_loop_p (loop, stmt))
3810     nested_in_vect_loop = true;
3811   else
3812     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3813
3814   /* In case of double reduction we only create a vector variable to be put
3815      in the reduction phi node.  The actual statement creation is done in
3816      vect_create_epilog_for_reduction.  */
3817   if (adjustment_def && nested_in_vect_loop
3818       && TREE_CODE (init_val) == SSA_NAME
3819       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3820       && gimple_code (def_stmt) == GIMPLE_PHI
3821       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3822       && vinfo_for_stmt (def_stmt)
3823       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3824           == vect_double_reduction_def)
3825     {
3826       *adjustment_def = NULL;
3827       return vect_create_destination_var (init_val, vectype);
3828     }
3829
3830   if (TREE_CONSTANT (init_val))
3831     {
3832       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3833         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3834       else
3835         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3836     }
3837   else
3838     init_value = init_val;
3839
3840   switch (code)
3841     {
3842       case WIDEN_SUM_EXPR:
3843       case DOT_PROD_EXPR:
3844       case SAD_EXPR:
3845       case PLUS_EXPR:
3846       case MINUS_EXPR:
3847       case BIT_IOR_EXPR:
3848       case BIT_XOR_EXPR:
3849       case MULT_EXPR:
3850       case BIT_AND_EXPR:
3851         /* ADJUSMENT_DEF is NULL when called from
3852            vect_create_epilog_for_reduction to vectorize double reduction.  */
3853         if (adjustment_def)
3854           {
3855             if (nested_in_vect_loop)
3856               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3857                                                               NULL);
3858             else
3859               *adjustment_def = init_val;
3860           }
3861
3862         if (code == MULT_EXPR)
3863           {
3864             real_init_val = dconst1;
3865             int_init_val = 1;
3866           }
3867
3868         if (code == BIT_AND_EXPR)
3869           int_init_val = -1;
3870
3871         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3872           def_for_init = build_real (scalar_type, real_init_val);
3873         else
3874           def_for_init = build_int_cst (scalar_type, int_init_val);
3875
3876         /* Create a vector of '0' or '1' except the first element.  */
3877         elts = XALLOCAVEC (tree, nunits);
3878         for (i = nunits - 2; i >= 0; --i)
3879           elts[i + 1] = def_for_init;
3880
3881         /* Option1: the first element is '0' or '1' as well.  */
3882         if (adjustment_def)
3883           {
3884             elts[0] = def_for_init;
3885             init_def = build_vector (vectype, elts);
3886             break;
3887           }
3888
3889         /* Option2: the first element is INIT_VAL.  */
3890         elts[0] = init_val;
3891         if (TREE_CONSTANT (init_val))
3892           init_def = build_vector (vectype, elts);
3893         else
3894           {
3895             vec<constructor_elt, va_gc> *v;
3896             vec_alloc (v, nunits);
3897             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3898             for (i = 1; i < nunits; ++i)
3899               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3900             init_def = build_constructor (vectype, v);
3901           }
3902
3903         break;
3904
3905       case MIN_EXPR:
3906       case MAX_EXPR:
3907       case COND_EXPR:
3908         if (adjustment_def)
3909           {
3910             *adjustment_def = NULL_TREE;
3911             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3912             break;
3913           }
3914
3915         init_def = build_vector_from_val (vectype, init_value);
3916         break;
3917
3918       default:
3919         gcc_unreachable ();
3920     }
3921
3922   return init_def;
3923 }
3924
3925 /* Function vect_create_epilog_for_reduction
3926
3927    Create code at the loop-epilog to finalize the result of a reduction
3928    computation.
3929
3930    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3931      reduction statements.
3932    STMT is the scalar reduction stmt that is being vectorized.
3933    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3934      number of elements that we can fit in a vectype (nunits).  In this case
3935      we have to generate more than one vector stmt - i.e., we need to "unroll"
3936      the vector stmt by a factor VF/nunits.  For more details see documentation
3937      in vectorizable_operation.
3938    REDUC_CODE is the tree-code for the epilog reduction.
3939    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3940      computation.
3941    REDUC_INDEX is the index of the operand in the right hand side of the
3942      statement that is defined by REDUCTION_PHI.
3943    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3944    SLP_NODE is an SLP node containing a group of reduction statements. The
3945      first one in this group is STMT.
3946
3947    This function:
3948    1. Creates the reduction def-use cycles: sets the arguments for
3949       REDUCTION_PHIS:
3950       The loop-entry argument is the vectorized initial-value of the reduction.
3951       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3952       sums.
3953    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3954       by applying the operation specified by REDUC_CODE if available, or by
3955       other means (whole-vector shifts or a scalar loop).
3956       The function also creates a new phi node at the loop exit to preserve
3957       loop-closed form, as illustrated below.
3958
3959      The flow at the entry to this function:
3960
3961         loop:
3962           vec_def = phi <null, null>            # REDUCTION_PHI
3963           VECT_DEF = vector_stmt                # vectorized form of STMT
3964           s_loop = scalar_stmt                  # (scalar) STMT
3965         loop_exit:
3966           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3967           use <s_out0>
3968           use <s_out0>
3969
3970      The above is transformed by this function into:
3971
3972         loop:
3973           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3974           VECT_DEF = vector_stmt                # vectorized form of STMT
3975           s_loop = scalar_stmt                  # (scalar) STMT
3976         loop_exit:
3977           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3978           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3979           v_out2 = reduce <v_out1>
3980           s_out3 = extract_field <v_out2, 0>
3981           s_out4 = adjust_result <s_out3>
3982           use <s_out4>
3983           use <s_out4>
3984 */
3985
3986 static void
3987 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3988                                   int ncopies, enum tree_code reduc_code,
3989                                   vec<gimple> reduction_phis,
3990                                   int reduc_index, bool double_reduc,
3991                                   slp_tree slp_node)
3992 {
3993   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3994   stmt_vec_info prev_phi_info;
3995   tree vectype;
3996   machine_mode mode;
3997   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3998   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3999   basic_block exit_bb;
4000   tree scalar_dest;
4001   tree scalar_type;
4002   gimple new_phi = NULL, phi;
4003   gimple_stmt_iterator exit_gsi;
4004   tree vec_dest;
4005   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4006   gimple epilog_stmt = NULL;
4007   enum tree_code code = gimple_assign_rhs_code (stmt);
4008   gimple exit_phi;
4009   tree bitsize;
4010   tree adjustment_def = NULL;
4011   tree vec_initial_def = NULL;
4012   tree reduction_op, expr, def;
4013   tree orig_name, scalar_result;
4014   imm_use_iterator imm_iter, phi_imm_iter;
4015   use_operand_p use_p, phi_use_p;
4016   gimple use_stmt, orig_stmt, reduction_phi = NULL;
4017   bool nested_in_vect_loop = false;
4018   auto_vec<gimple> new_phis;
4019   auto_vec<gimple> inner_phis;
4020   enum vect_def_type dt = vect_unknown_def_type;
4021   int j, i;
4022   auto_vec<tree> scalar_results;
4023   unsigned int group_size = 1, k, ratio;
4024   auto_vec<tree> vec_initial_defs;
4025   auto_vec<gimple> phis;
4026   bool slp_reduc = false;
4027   tree new_phi_result;
4028   gimple inner_phi = NULL;
4029
4030   if (slp_node)
4031     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4032
4033   if (nested_in_vect_loop_p (loop, stmt))
4034     {
4035       outer_loop = loop;
4036       loop = loop->inner;
4037       nested_in_vect_loop = true;
4038       gcc_assert (!slp_node);
4039     }
4040
4041   reduction_op = get_reduction_op (stmt, reduc_index);
4042
4043   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4044   gcc_assert (vectype);
4045   mode = TYPE_MODE (vectype);
4046
4047   /* 1. Create the reduction def-use cycle:
4048      Set the arguments of REDUCTION_PHIS, i.e., transform
4049
4050         loop:
4051           vec_def = phi <null, null>            # REDUCTION_PHI
4052           VECT_DEF = vector_stmt                # vectorized form of STMT
4053           ...
4054
4055      into:
4056
4057         loop:
4058           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4059           VECT_DEF = vector_stmt                # vectorized form of STMT
4060           ...
4061
4062      (in case of SLP, do it for all the phis). */
4063
4064   /* Get the loop-entry arguments.  */
4065   if (slp_node)
4066     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4067                        NULL, slp_node, reduc_index);
4068   else
4069     {
4070       vec_initial_defs.create (1);
4071      /* For the case of reduction, vect_get_vec_def_for_operand returns
4072         the scalar def before the loop, that defines the initial value
4073         of the reduction variable.  */
4074       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4075                                                       &adjustment_def);
4076       vec_initial_defs.quick_push (vec_initial_def);
4077     }
4078
4079   /* Set phi nodes arguments.  */
4080   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4081     {
4082       tree vec_init_def, def;
4083       gimple_seq stmts;
4084       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4085                                            true, NULL_TREE);
4086       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4087       def = vect_defs[i];
4088       for (j = 0; j < ncopies; j++)
4089         {
4090           /* Set the loop-entry arg of the reduction-phi.  */
4091           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4092                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4093
4094           /* Set the loop-latch arg for the reduction-phi.  */
4095           if (j > 0)
4096             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4097
4098           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4099                        UNKNOWN_LOCATION);
4100
4101           if (dump_enabled_p ())
4102             {
4103               dump_printf_loc (MSG_NOTE, vect_location,
4104                                "transform reduction: created def-use cycle: ");
4105               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4106               dump_printf (MSG_NOTE, "\n");
4107               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4108               dump_printf (MSG_NOTE, "\n");
4109             }
4110
4111           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4112         }
4113     }
4114
4115   /* 2. Create epilog code.
4116         The reduction epilog code operates across the elements of the vector
4117         of partial results computed by the vectorized loop.
4118         The reduction epilog code consists of:
4119
4120         step 1: compute the scalar result in a vector (v_out2)
4121         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4122         step 3: adjust the scalar result (s_out3) if needed.
4123
4124         Step 1 can be accomplished using one of the following three schemes:
4125           (scheme 1) using reduc_code, if available.
4126           (scheme 2) using whole-vector shifts, if available.
4127           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4128                      combined.
4129
4130           The overall epilog code looks like this:
4131
4132           s_out0 = phi <s_loop>         # original EXIT_PHI
4133           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4134           v_out2 = reduce <v_out1>              # step 1
4135           s_out3 = extract_field <v_out2, 0>    # step 2
4136           s_out4 = adjust_result <s_out3>       # step 3
4137
4138           (step 3 is optional, and steps 1 and 2 may be combined).
4139           Lastly, the uses of s_out0 are replaced by s_out4.  */
4140
4141
4142   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4143          v_out1 = phi <VECT_DEF>
4144          Store them in NEW_PHIS.  */
4145
4146   exit_bb = single_exit (loop)->dest;
4147   prev_phi_info = NULL;
4148   new_phis.create (vect_defs.length ());
4149   FOR_EACH_VEC_ELT (vect_defs, i, def)
4150     {
4151       for (j = 0; j < ncopies; j++)
4152         {
4153           tree new_def = copy_ssa_name (def);
4154           phi = create_phi_node (new_def, exit_bb);
4155           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4156           if (j == 0)
4157             new_phis.quick_push (phi);
4158           else
4159             {
4160               def = vect_get_vec_def_for_stmt_copy (dt, def);
4161               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4162             }
4163
4164           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4165           prev_phi_info = vinfo_for_stmt (phi);
4166         }
4167     }
4168
4169   /* The epilogue is created for the outer-loop, i.e., for the loop being
4170      vectorized.  Create exit phis for the outer loop.  */
4171   if (double_reduc)
4172     {
4173       loop = outer_loop;
4174       exit_bb = single_exit (loop)->dest;
4175       inner_phis.create (vect_defs.length ());
4176       FOR_EACH_VEC_ELT (new_phis, i, phi)
4177         {
4178           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4179           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4180           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4181                            PHI_RESULT (phi));
4182           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4183                                                             loop_vinfo, NULL));
4184           inner_phis.quick_push (phi);
4185           new_phis[i] = outer_phi;
4186           prev_phi_info = vinfo_for_stmt (outer_phi);
4187           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4188             {
4189               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4190               new_result = copy_ssa_name (PHI_RESULT (phi));
4191               outer_phi = create_phi_node (new_result, exit_bb);
4192               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4193                                PHI_RESULT (phi));
4194               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4195                                                         loop_vinfo, NULL));
4196               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4197               prev_phi_info = vinfo_for_stmt (outer_phi);
4198             }
4199         }
4200     }
4201
4202   exit_gsi = gsi_after_labels (exit_bb);
4203
4204   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4205          (i.e. when reduc_code is not available) and in the final adjustment
4206          code (if needed).  Also get the original scalar reduction variable as
4207          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4208          represents a reduction pattern), the tree-code and scalar-def are
4209          taken from the original stmt that the pattern-stmt (STMT) replaces.
4210          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4211          are taken from STMT.  */
4212
4213   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4214   if (!orig_stmt)
4215     {
4216       /* Regular reduction  */
4217       orig_stmt = stmt;
4218     }
4219   else
4220     {
4221       /* Reduction pattern  */
4222       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4223       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4224       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4225     }
4226
4227   code = gimple_assign_rhs_code (orig_stmt);
4228   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4229      partial results are added and not subtracted.  */
4230   if (code == MINUS_EXPR)
4231     code = PLUS_EXPR;
4232
4233   scalar_dest = gimple_assign_lhs (orig_stmt);
4234   scalar_type = TREE_TYPE (scalar_dest);
4235   scalar_results.create (group_size);
4236   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4237   bitsize = TYPE_SIZE (scalar_type);
4238
4239   /* In case this is a reduction in an inner-loop while vectorizing an outer
4240      loop - we don't need to extract a single scalar result at the end of the
4241      inner-loop (unless it is double reduction, i.e., the use of reduction is
4242      outside the outer-loop).  The final vector of partial results will be used
4243      in the vectorized outer-loop, or reduced to a scalar result at the end of
4244      the outer-loop.  */
4245   if (nested_in_vect_loop && !double_reduc)
4246     goto vect_finalize_reduction;
4247
4248   /* SLP reduction without reduction chain, e.g.,
4249      # a1 = phi <a2, a0>
4250      # b1 = phi <b2, b0>
4251      a2 = operation (a1)
4252      b2 = operation (b1)  */
4253   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4254
4255   /* In case of reduction chain, e.g.,
4256      # a1 = phi <a3, a0>
4257      a2 = operation (a1)
4258      a3 = operation (a2),
4259
4260      we may end up with more than one vector result.  Here we reduce them to
4261      one vector.  */
4262   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4263     {
4264       tree first_vect = PHI_RESULT (new_phis[0]);
4265       tree tmp;
4266       gassign *new_vec_stmt = NULL;
4267
4268       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4269       for (k = 1; k < new_phis.length (); k++)
4270         {
4271           gimple next_phi = new_phis[k];
4272           tree second_vect = PHI_RESULT (next_phi);
4273
4274           tmp = build2 (code, vectype,  first_vect, second_vect);
4275           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4276           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4277           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4278           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4279         }
4280
4281       new_phi_result = first_vect;
4282       if (new_vec_stmt)
4283         {
4284           new_phis.truncate (0);
4285           new_phis.safe_push (new_vec_stmt);
4286         }
4287     }
4288   else
4289     new_phi_result = PHI_RESULT (new_phis[0]);
4290
4291   /* 2.3 Create the reduction code, using one of the three schemes described
4292          above. In SLP we simply need to extract all the elements from the
4293          vector (without reducing them), so we use scalar shifts.  */
4294   if (reduc_code != ERROR_MARK && !slp_reduc)
4295     {
4296       tree tmp;
4297       tree vec_elem_type;
4298
4299       /*** Case 1:  Create:
4300            v_out2 = reduc_expr <v_out1>  */
4301
4302       if (dump_enabled_p ())
4303         dump_printf_loc (MSG_NOTE, vect_location,
4304                          "Reduce using direct vector reduction.\n");
4305
4306       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4307       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4308         {
4309           tree tmp_dest =
4310               vect_create_destination_var (scalar_dest, vec_elem_type);
4311           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4312           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4313           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4314           gimple_assign_set_lhs (epilog_stmt, new_temp);
4315           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4316
4317           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4318         }
4319       else
4320         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4321       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4322       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4323       gimple_assign_set_lhs (epilog_stmt, new_temp);
4324       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4325       scalar_results.safe_push (new_temp);
4326     }
4327   else
4328     {
4329       bool reduce_with_shift = have_whole_vector_shift (mode);
4330       int element_bitsize = tree_to_uhwi (bitsize);
4331       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4332       tree vec_temp;
4333
4334       /* Regardless of whether we have a whole vector shift, if we're
4335          emulating the operation via tree-vect-generic, we don't want
4336          to use it.  Only the first round of the reduction is likely
4337          to still be profitable via emulation.  */
4338       /* ??? It might be better to emit a reduction tree code here, so that
4339          tree-vect-generic can expand the first round via bit tricks.  */
4340       if (!VECTOR_MODE_P (mode))
4341         reduce_with_shift = false;
4342       else
4343         {
4344           optab optab = optab_for_tree_code (code, vectype, optab_default);
4345           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4346             reduce_with_shift = false;
4347         }
4348
4349       if (reduce_with_shift && !slp_reduc)
4350         {
4351           int nelements = vec_size_in_bits / element_bitsize;
4352           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4353
4354           int elt_offset;
4355
4356           tree zero_vec = build_zero_cst (vectype);
4357           /*** Case 2: Create:
4358              for (offset = nelements/2; offset >= 1; offset/=2)
4359                 {
4360                   Create:  va' = vec_shift <va, offset>
4361                   Create:  va = vop <va, va'>
4362                 }  */
4363
4364           tree rhs;
4365
4366           if (dump_enabled_p ())
4367             dump_printf_loc (MSG_NOTE, vect_location,
4368                              "Reduce using vector shifts\n");
4369
4370           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4371           new_temp = new_phi_result;
4372           for (elt_offset = nelements / 2;
4373                elt_offset >= 1;
4374                elt_offset /= 2)
4375             {
4376               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4377               tree mask = vect_gen_perm_mask_any (vectype, sel);
4378               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4379                                                  new_temp, zero_vec, mask);
4380               new_name = make_ssa_name (vec_dest, epilog_stmt);
4381               gimple_assign_set_lhs (epilog_stmt, new_name);
4382               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4383
4384               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4385                                                  new_temp);
4386               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4387               gimple_assign_set_lhs (epilog_stmt, new_temp);
4388               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4389             }
4390
4391           /* 2.4  Extract the final scalar result.  Create:
4392              s_out3 = extract_field <v_out2, bitpos>  */
4393
4394           if (dump_enabled_p ())
4395             dump_printf_loc (MSG_NOTE, vect_location,
4396                              "extract scalar result\n");
4397
4398           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4399                         bitsize, bitsize_zero_node);
4400           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4401           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4402           gimple_assign_set_lhs (epilog_stmt, new_temp);
4403           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4404           scalar_results.safe_push (new_temp);
4405         }
4406       else
4407         {
4408           /*** Case 3: Create:
4409              s = extract_field <v_out2, 0>
4410              for (offset = element_size;
4411                   offset < vector_size;
4412                   offset += element_size;)
4413                {
4414                  Create:  s' = extract_field <v_out2, offset>
4415                  Create:  s = op <s, s'>  // For non SLP cases
4416                }  */
4417
4418           if (dump_enabled_p ())
4419             dump_printf_loc (MSG_NOTE, vect_location,
4420                              "Reduce using scalar code.\n");
4421
4422           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4423           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4424             {
4425               int bit_offset;
4426               if (gimple_code (new_phi) == GIMPLE_PHI)
4427                 vec_temp = PHI_RESULT (new_phi);
4428               else
4429                 vec_temp = gimple_assign_lhs (new_phi);
4430               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4431                             bitsize_zero_node);
4432               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4433               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4434               gimple_assign_set_lhs (epilog_stmt, new_temp);
4435               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4436
4437               /* In SLP we don't need to apply reduction operation, so we just
4438                  collect s' values in SCALAR_RESULTS.  */
4439               if (slp_reduc)
4440                 scalar_results.safe_push (new_temp);
4441
4442               for (bit_offset = element_bitsize;
4443                    bit_offset < vec_size_in_bits;
4444                    bit_offset += element_bitsize)
4445                 {
4446                   tree bitpos = bitsize_int (bit_offset);
4447                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4448                                      bitsize, bitpos);
4449
4450                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4451                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4452                   gimple_assign_set_lhs (epilog_stmt, new_name);
4453                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4454
4455                   if (slp_reduc)
4456                     {
4457                       /* In SLP we don't need to apply reduction operation, so
4458                          we just collect s' values in SCALAR_RESULTS.  */
4459                       new_temp = new_name;
4460                       scalar_results.safe_push (new_name);
4461                     }
4462                   else
4463                     {
4464                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4465                                                          new_name, new_temp);
4466                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4467                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4468                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4469                     }
4470                 }
4471             }
4472
4473           /* The only case where we need to reduce scalar results in SLP, is
4474              unrolling.  If the size of SCALAR_RESULTS is greater than
4475              GROUP_SIZE, we reduce them combining elements modulo
4476              GROUP_SIZE.  */
4477           if (slp_reduc)
4478             {
4479               tree res, first_res, new_res;
4480               gimple new_stmt;
4481
4482               /* Reduce multiple scalar results in case of SLP unrolling.  */
4483               for (j = group_size; scalar_results.iterate (j, &res);
4484                    j++)
4485                 {
4486                   first_res = scalar_results[j % group_size];
4487                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4488                                                   first_res, res);
4489                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4490                   gimple_assign_set_lhs (new_stmt, new_res);
4491                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4492                   scalar_results[j % group_size] = new_res;
4493                 }
4494             }
4495           else
4496             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4497             scalar_results.safe_push (new_temp);
4498         }
4499     }
4500
4501 vect_finalize_reduction:
4502
4503   if (double_reduc)
4504     loop = loop->inner;
4505
4506   /* 2.5 Adjust the final result by the initial value of the reduction
4507          variable. (When such adjustment is not needed, then
4508          'adjustment_def' is zero).  For example, if code is PLUS we create:
4509          new_temp = loop_exit_def + adjustment_def  */
4510
4511   if (adjustment_def)
4512     {
4513       gcc_assert (!slp_reduc);
4514       if (nested_in_vect_loop)
4515         {
4516           new_phi = new_phis[0];
4517           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4518           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4519           new_dest = vect_create_destination_var (scalar_dest, vectype);
4520         }
4521       else
4522         {
4523           new_temp = scalar_results[0];
4524           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4525           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4526           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4527         }
4528
4529       epilog_stmt = gimple_build_assign (new_dest, expr);
4530       new_temp = make_ssa_name (new_dest, epilog_stmt);
4531       gimple_assign_set_lhs (epilog_stmt, new_temp);
4532       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4533       if (nested_in_vect_loop)
4534         {
4535           set_vinfo_for_stmt (epilog_stmt,
4536                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4537                                                  NULL));
4538           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4539                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4540
4541           if (!double_reduc)
4542             scalar_results.quick_push (new_temp);
4543           else
4544             scalar_results[0] = new_temp;
4545         }
4546       else
4547         scalar_results[0] = new_temp;
4548
4549       new_phis[0] = epilog_stmt;
4550     }
4551
4552   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4553           phis with new adjusted scalar results, i.e., replace use <s_out0>
4554           with use <s_out4>.
4555
4556      Transform:
4557         loop_exit:
4558           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4559           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4560           v_out2 = reduce <v_out1>
4561           s_out3 = extract_field <v_out2, 0>
4562           s_out4 = adjust_result <s_out3>
4563           use <s_out0>
4564           use <s_out0>
4565
4566      into:
4567
4568         loop_exit:
4569           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4570           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4571           v_out2 = reduce <v_out1>
4572           s_out3 = extract_field <v_out2, 0>
4573           s_out4 = adjust_result <s_out3>
4574           use <s_out4>
4575           use <s_out4> */
4576
4577
4578   /* In SLP reduction chain we reduce vector results into one vector if
4579      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4580      the last stmt in the reduction chain, since we are looking for the loop
4581      exit phi node.  */
4582   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4583     {
4584       scalar_dest = gimple_assign_lhs (
4585                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4586       group_size = 1;
4587     }
4588
4589   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4590      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4591      need to match SCALAR_RESULTS with corresponding statements.  The first
4592      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4593      the first vector stmt, etc.
4594      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4595   if (group_size > new_phis.length ())
4596     {
4597       ratio = group_size / new_phis.length ();
4598       gcc_assert (!(group_size % new_phis.length ()));
4599     }
4600   else
4601     ratio = 1;
4602
4603   for (k = 0; k < group_size; k++)
4604     {
4605       if (k % ratio == 0)
4606         {
4607           epilog_stmt = new_phis[k / ratio];
4608           reduction_phi = reduction_phis[k / ratio];
4609           if (double_reduc)
4610             inner_phi = inner_phis[k / ratio];
4611         }
4612
4613       if (slp_reduc)
4614         {
4615           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4616
4617           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4618           /* SLP statements can't participate in patterns.  */
4619           gcc_assert (!orig_stmt);
4620           scalar_dest = gimple_assign_lhs (current_stmt);
4621         }
4622
4623       phis.create (3);
4624       /* Find the loop-closed-use at the loop exit of the original scalar
4625          result.  (The reduction result is expected to have two immediate uses -
4626          one at the latch block, and one at the loop exit).  */
4627       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4628         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4629             && !is_gimple_debug (USE_STMT (use_p)))
4630           phis.safe_push (USE_STMT (use_p));
4631
4632       /* While we expect to have found an exit_phi because of loop-closed-ssa
4633          form we can end up without one if the scalar cycle is dead.  */
4634
4635       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4636         {
4637           if (outer_loop)
4638             {
4639               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4640               gphi *vect_phi;
4641
4642               /* FORNOW. Currently not supporting the case that an inner-loop
4643                  reduction is not used in the outer-loop (but only outside the
4644                  outer-loop), unless it is double reduction.  */
4645               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4646                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4647                           || double_reduc);
4648
4649               if (double_reduc)
4650                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4651               else
4652                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4653               if (!double_reduc
4654                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4655                       != vect_double_reduction_def)
4656                 continue;
4657
4658               /* Handle double reduction:
4659
4660                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4661                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4662                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4663                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4664
4665                  At that point the regular reduction (stmt2 and stmt3) is
4666                  already vectorized, as well as the exit phi node, stmt4.
4667                  Here we vectorize the phi node of double reduction, stmt1, and
4668                  update all relevant statements.  */
4669
4670               /* Go through all the uses of s2 to find double reduction phi
4671                  node, i.e., stmt1 above.  */
4672               orig_name = PHI_RESULT (exit_phi);
4673               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4674                 {
4675                   stmt_vec_info use_stmt_vinfo;
4676                   stmt_vec_info new_phi_vinfo;
4677                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4678                   basic_block bb = gimple_bb (use_stmt);
4679                   gimple use;
4680
4681                   /* Check that USE_STMT is really double reduction phi
4682                      node.  */
4683                   if (gimple_code (use_stmt) != GIMPLE_PHI
4684                       || gimple_phi_num_args (use_stmt) != 2
4685                       || bb->loop_father != outer_loop)
4686                     continue;
4687                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4688                   if (!use_stmt_vinfo
4689                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4690                           != vect_double_reduction_def)
4691                     continue;
4692
4693                   /* Create vector phi node for double reduction:
4694                      vs1 = phi <vs0, vs2>
4695                      vs1 was created previously in this function by a call to
4696                        vect_get_vec_def_for_operand and is stored in
4697                        vec_initial_def;
4698                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4699                      vs0 is created here.  */
4700
4701                   /* Create vector phi node.  */
4702                   vect_phi = create_phi_node (vec_initial_def, bb);
4703                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4704                                     loop_vec_info_for_loop (outer_loop), NULL);
4705                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4706
4707                   /* Create vs0 - initial def of the double reduction phi.  */
4708                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4709                                              loop_preheader_edge (outer_loop));
4710                   init_def = get_initial_def_for_reduction (stmt,
4711                                                           preheader_arg, NULL);
4712                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4713                                                     vectype, NULL);
4714
4715                   /* Update phi node arguments with vs0 and vs2.  */
4716                   add_phi_arg (vect_phi, vect_phi_init,
4717                                loop_preheader_edge (outer_loop),
4718                                UNKNOWN_LOCATION);
4719                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4720                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4721                   if (dump_enabled_p ())
4722                     {
4723                       dump_printf_loc (MSG_NOTE, vect_location,
4724                                        "created double reduction phi node: ");
4725                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4726                       dump_printf (MSG_NOTE, "\n");
4727                     }
4728
4729                   vect_phi_res = PHI_RESULT (vect_phi);
4730
4731                   /* Replace the use, i.e., set the correct vs1 in the regular
4732                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4733                      loop is redundant.  */
4734                   use = reduction_phi;
4735                   for (j = 0; j < ncopies; j++)
4736                     {
4737                       edge pr_edge = loop_preheader_edge (loop);
4738                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4739                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4740                     }
4741                 }
4742             }
4743         }
4744
4745       phis.release ();
4746       if (nested_in_vect_loop)
4747         {
4748           if (double_reduc)
4749             loop = outer_loop;
4750           else
4751             continue;
4752         }
4753
4754       phis.create (3);
4755       /* Find the loop-closed-use at the loop exit of the original scalar
4756          result.  (The reduction result is expected to have two immediate uses,
4757          one at the latch block, and one at the loop exit).  For double
4758          reductions we are looking for exit phis of the outer loop.  */
4759       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4760         {
4761           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4762             {
4763               if (!is_gimple_debug (USE_STMT (use_p)))
4764                 phis.safe_push (USE_STMT (use_p));
4765             }
4766           else
4767             {
4768               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4769                 {
4770                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4771
4772                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4773                     {
4774                       if (!flow_bb_inside_loop_p (loop,
4775                                              gimple_bb (USE_STMT (phi_use_p)))
4776                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4777                         phis.safe_push (USE_STMT (phi_use_p));
4778                     }
4779                 }
4780             }
4781         }
4782
4783       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4784         {
4785           /* Replace the uses:  */
4786           orig_name = PHI_RESULT (exit_phi);
4787           scalar_result = scalar_results[k];
4788           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4789             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4790               SET_USE (use_p, scalar_result);
4791         }
4792
4793       phis.release ();
4794     }
4795 }
4796
4797
4798 /* Function vectorizable_reduction.
4799
4800    Check if STMT performs a reduction operation that can be vectorized.
4801    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4802    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4803    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4804
4805    This function also handles reduction idioms (patterns) that have been
4806    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4807    of this form:
4808      X = pattern_expr (arg0, arg1, ..., X)
4809    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4810    sequence that had been detected and replaced by the pattern-stmt (STMT).
4811
4812    In some cases of reduction patterns, the type of the reduction variable X is
4813    different than the type of the other arguments of STMT.
4814    In such cases, the vectype that is used when transforming STMT into a vector
4815    stmt is different than the vectype that is used to determine the
4816    vectorization factor, because it consists of a different number of elements
4817    than the actual number of elements that are being operated upon in parallel.
4818
4819    For example, consider an accumulation of shorts into an int accumulator.
4820    On some targets it's possible to vectorize this pattern operating on 8
4821    shorts at a time (hence, the vectype for purposes of determining the
4822    vectorization factor should be V8HI); on the other hand, the vectype that
4823    is used to create the vector form is actually V4SI (the type of the result).
4824
4825    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4826    indicates what is the actual level of parallelism (V8HI in the example), so
4827    that the right vectorization factor would be derived.  This vectype
4828    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4829    be used to create the vectorized stmt.  The right vectype for the vectorized
4830    stmt is obtained from the type of the result X:
4831         get_vectype_for_scalar_type (TREE_TYPE (X))
4832
4833    This means that, contrary to "regular" reductions (or "regular" stmts in
4834    general), the following equation:
4835       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4836    does *NOT* necessarily hold for reduction patterns.  */
4837
4838 bool
4839 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4840                         gimple *vec_stmt, slp_tree slp_node)
4841 {
4842   tree vec_dest;
4843   tree scalar_dest;
4844   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4845   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4846   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4847   tree vectype_in = NULL_TREE;
4848   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4849   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4850   enum tree_code code, orig_code, epilog_reduc_code;
4851   machine_mode vec_mode;
4852   int op_type;
4853   optab optab, reduc_optab;
4854   tree new_temp = NULL_TREE;
4855   tree def;
4856   gimple def_stmt;
4857   enum vect_def_type dt;
4858   gphi *new_phi = NULL;
4859   tree scalar_type;
4860   bool is_simple_use;
4861   gimple orig_stmt;
4862   stmt_vec_info orig_stmt_info;
4863   tree expr = NULL_TREE;
4864   int i;
4865   int ncopies;
4866   int epilog_copies;
4867   stmt_vec_info prev_stmt_info, prev_phi_info;
4868   bool single_defuse_cycle = false;
4869   tree reduc_def = NULL_TREE;
4870   gimple new_stmt = NULL;
4871   int j;
4872   tree ops[3];
4873   bool nested_cycle = false, found_nested_cycle_def = false;
4874   gimple reduc_def_stmt = NULL;
4875   bool double_reduc = false, dummy;
4876   basic_block def_bb;
4877   struct loop * def_stmt_loop, *outer_loop = NULL;
4878   tree def_arg;
4879   gimple def_arg_stmt;
4880   auto_vec<tree> vec_oprnds0;
4881   auto_vec<tree> vec_oprnds1;
4882   auto_vec<tree> vect_defs;
4883   auto_vec<gimple> phis;
4884   int vec_num;
4885   tree def0, def1, tem, op0, op1 = NULL_TREE;
4886
4887   /* In case of reduction chain we switch to the first stmt in the chain, but
4888      we don't update STMT_INFO, since only the last stmt is marked as reduction
4889      and has reduction properties.  */
4890   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4891     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4892
4893   if (nested_in_vect_loop_p (loop, stmt))
4894     {
4895       outer_loop = loop;
4896       loop = loop->inner;
4897       nested_cycle = true;
4898     }
4899
4900   /* 1. Is vectorizable reduction?  */
4901   /* Not supportable if the reduction variable is used in the loop, unless
4902      it's a reduction chain.  */
4903   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4904       && !GROUP_FIRST_ELEMENT (stmt_info))
4905     return false;
4906
4907   /* Reductions that are not used even in an enclosing outer-loop,
4908      are expected to be "live" (used out of the loop).  */
4909   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4910       && !STMT_VINFO_LIVE_P (stmt_info))
4911     return false;
4912
4913   /* Make sure it was already recognized as a reduction computation.  */
4914   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4915       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4916     return false;
4917
4918   /* 2. Has this been recognized as a reduction pattern?
4919
4920      Check if STMT represents a pattern that has been recognized
4921      in earlier analysis stages.  For stmts that represent a pattern,
4922      the STMT_VINFO_RELATED_STMT field records the last stmt in
4923      the original sequence that constitutes the pattern.  */
4924
4925   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4926   if (orig_stmt)
4927     {
4928       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4929       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4930       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4931     }
4932
4933   /* 3. Check the operands of the operation.  The first operands are defined
4934         inside the loop body. The last operand is the reduction variable,
4935         which is defined by the loop-header-phi.  */
4936
4937   gcc_assert (is_gimple_assign (stmt));
4938
4939   /* Flatten RHS.  */
4940   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4941     {
4942     case GIMPLE_SINGLE_RHS:
4943       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4944       if (op_type == ternary_op)
4945         {
4946           tree rhs = gimple_assign_rhs1 (stmt);
4947           ops[0] = TREE_OPERAND (rhs, 0);
4948           ops[1] = TREE_OPERAND (rhs, 1);
4949           ops[2] = TREE_OPERAND (rhs, 2);
4950           code = TREE_CODE (rhs);
4951         }
4952       else
4953         return false;
4954       break;
4955
4956     case GIMPLE_BINARY_RHS:
4957       code = gimple_assign_rhs_code (stmt);
4958       op_type = TREE_CODE_LENGTH (code);
4959       gcc_assert (op_type == binary_op);
4960       ops[0] = gimple_assign_rhs1 (stmt);
4961       ops[1] = gimple_assign_rhs2 (stmt);
4962       break;
4963
4964     case GIMPLE_TERNARY_RHS:
4965       code = gimple_assign_rhs_code (stmt);
4966       op_type = TREE_CODE_LENGTH (code);
4967       gcc_assert (op_type == ternary_op);
4968       ops[0] = gimple_assign_rhs1 (stmt);
4969       ops[1] = gimple_assign_rhs2 (stmt);
4970       ops[2] = gimple_assign_rhs3 (stmt);
4971       break;
4972
4973     case GIMPLE_UNARY_RHS:
4974       return false;
4975
4976     default:
4977       gcc_unreachable ();
4978     }
4979   /* The default is that the reduction variable is the last in statement.  */
4980   int reduc_index = op_type - 1;
4981
4982   if (code == COND_EXPR && slp_node)
4983     return false;
4984
4985   scalar_dest = gimple_assign_lhs (stmt);
4986   scalar_type = TREE_TYPE (scalar_dest);
4987   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4988       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4989     return false;
4990
4991   /* Do not try to vectorize bit-precision reductions.  */
4992   if ((TYPE_PRECISION (scalar_type)
4993        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4994     return false;
4995
4996   /* All uses but the last are expected to be defined in the loop.
4997      The last use is the reduction variable.  In case of nested cycle this
4998      assumption is not true: we use reduc_index to record the index of the
4999      reduction variable.  */
5000   for (i = 0; i < op_type - 1; i++)
5001     {
5002       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5003       if (i == 0 && code == COND_EXPR)
5004         continue;
5005
5006       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5007                                             &def_stmt, &def, &dt, &tem);
5008       if (!vectype_in)
5009         vectype_in = tem;
5010       gcc_assert (is_simple_use);
5011
5012       if (dt != vect_internal_def
5013           && dt != vect_external_def
5014           && dt != vect_constant_def
5015           && dt != vect_induction_def
5016           && !(dt == vect_nested_cycle && nested_cycle))
5017         return false;
5018
5019       if (dt == vect_nested_cycle)
5020         {
5021           found_nested_cycle_def = true;
5022           reduc_def_stmt = def_stmt;
5023           reduc_index = i;
5024         }
5025     }
5026
5027   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5028                                         &def_stmt, &def, &dt, &tem);
5029   if (!vectype_in)
5030     vectype_in = tem;
5031   gcc_assert (is_simple_use);
5032   if (!found_nested_cycle_def)
5033     reduc_def_stmt = def_stmt;
5034
5035   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5036     return false;
5037
5038   if (!(dt == vect_reduction_def
5039         || dt == vect_nested_cycle
5040         || ((dt == vect_internal_def || dt == vect_external_def
5041              || dt == vect_constant_def || dt == vect_induction_def)
5042             && nested_cycle && found_nested_cycle_def)))
5043     {
5044       /* For pattern recognized stmts, orig_stmt might be a reduction,
5045          but some helper statements for the pattern might not, or
5046          might be COND_EXPRs with reduction uses in the condition.  */
5047       gcc_assert (orig_stmt);
5048       return false;
5049     }
5050
5051   if (orig_stmt)
5052     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5053                                                        reduc_def_stmt,
5054                                                        !nested_cycle,
5055                                                        &dummy));
5056   else
5057     {
5058       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5059                                              !nested_cycle, &dummy);
5060       /* We changed STMT to be the first stmt in reduction chain, hence we
5061          check that in this case the first element in the chain is STMT.  */
5062       gcc_assert (stmt == tmp
5063                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5064     }
5065
5066   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5067     return false;
5068
5069   if (slp_node || PURE_SLP_STMT (stmt_info))
5070     ncopies = 1;
5071   else
5072     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5073                / TYPE_VECTOR_SUBPARTS (vectype_in));
5074
5075   gcc_assert (ncopies >= 1);
5076
5077   vec_mode = TYPE_MODE (vectype_in);
5078
5079   if (code == COND_EXPR)
5080     {
5081       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5082         {
5083           if (dump_enabled_p ())
5084             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5085                              "unsupported condition in reduction\n");
5086
5087           return false;
5088         }
5089     }
5090   else
5091     {
5092       /* 4. Supportable by target?  */
5093
5094       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5095           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5096         {
5097           /* Shifts and rotates are only supported by vectorizable_shifts,
5098              not vectorizable_reduction.  */
5099           if (dump_enabled_p ())
5100             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5101                              "unsupported shift or rotation.\n");
5102           return false;
5103         }
5104
5105       /* 4.1. check support for the operation in the loop  */
5106       optab = optab_for_tree_code (code, vectype_in, optab_default);
5107       if (!optab)
5108         {
5109           if (dump_enabled_p ())
5110             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5111                              "no optab.\n");
5112
5113           return false;
5114         }
5115
5116       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5117         {
5118           if (dump_enabled_p ())
5119             dump_printf (MSG_NOTE, "op not supported by target.\n");
5120
5121           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5122               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5123                   < vect_min_worthwhile_factor (code))
5124             return false;
5125
5126           if (dump_enabled_p ())
5127             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5128         }
5129
5130       /* Worthwhile without SIMD support?  */
5131       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5132           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5133              < vect_min_worthwhile_factor (code))
5134         {
5135           if (dump_enabled_p ())
5136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137                              "not worthwhile without SIMD support.\n");
5138
5139           return false;
5140         }
5141     }
5142
5143   /* 4.2. Check support for the epilog operation.
5144
5145           If STMT represents a reduction pattern, then the type of the
5146           reduction variable may be different than the type of the rest
5147           of the arguments.  For example, consider the case of accumulation
5148           of shorts into an int accumulator; The original code:
5149                         S1: int_a = (int) short_a;
5150           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5151
5152           was replaced with:
5153                         STMT: int_acc = widen_sum <short_a, int_acc>
5154
5155           This means that:
5156           1. The tree-code that is used to create the vector operation in the
5157              epilog code (that reduces the partial results) is not the
5158              tree-code of STMT, but is rather the tree-code of the original
5159              stmt from the pattern that STMT is replacing.  I.e, in the example
5160              above we want to use 'widen_sum' in the loop, but 'plus' in the
5161              epilog.
5162           2. The type (mode) we use to check available target support
5163              for the vector operation to be created in the *epilog*, is
5164              determined by the type of the reduction variable (in the example
5165              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5166              However the type (mode) we use to check available target support
5167              for the vector operation to be created *inside the loop*, is
5168              determined by the type of the other arguments to STMT (in the
5169              example we'd check this: optab_handler (widen_sum_optab,
5170              vect_short_mode)).
5171
5172           This is contrary to "regular" reductions, in which the types of all
5173           the arguments are the same as the type of the reduction variable.
5174           For "regular" reductions we can therefore use the same vector type
5175           (and also the same tree-code) when generating the epilog code and
5176           when generating the code inside the loop.  */
5177
5178   if (orig_stmt)
5179     {
5180       /* This is a reduction pattern: get the vectype from the type of the
5181          reduction variable, and get the tree-code from orig_stmt.  */
5182       orig_code = gimple_assign_rhs_code (orig_stmt);
5183       gcc_assert (vectype_out);
5184       vec_mode = TYPE_MODE (vectype_out);
5185     }
5186   else
5187     {
5188       /* Regular reduction: use the same vectype and tree-code as used for
5189          the vector code inside the loop can be used for the epilog code. */
5190       orig_code = code;
5191     }
5192
5193   if (nested_cycle)
5194     {
5195       def_bb = gimple_bb (reduc_def_stmt);
5196       def_stmt_loop = def_bb->loop_father;
5197       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5198                                        loop_preheader_edge (def_stmt_loop));
5199       if (TREE_CODE (def_arg) == SSA_NAME
5200           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5201           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5202           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5203           && vinfo_for_stmt (def_arg_stmt)
5204           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5205               == vect_double_reduction_def)
5206         double_reduc = true;
5207     }
5208
5209   epilog_reduc_code = ERROR_MARK;
5210   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5211     {
5212       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5213                                          optab_default);
5214       if (!reduc_optab)
5215         {
5216           if (dump_enabled_p ())
5217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5218                              "no optab for reduction.\n");
5219
5220           epilog_reduc_code = ERROR_MARK;
5221         }
5222       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5223         {
5224           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5225           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5226             {
5227               if (dump_enabled_p ())
5228                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5229                                  "reduc op not supported by target.\n");
5230
5231               epilog_reduc_code = ERROR_MARK;
5232             }
5233         }
5234     }
5235   else
5236     {
5237       if (!nested_cycle || double_reduc)
5238         {
5239           if (dump_enabled_p ())
5240             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5241                              "no reduc code for scalar code.\n");
5242
5243           return false;
5244         }
5245     }
5246
5247   if (double_reduc && ncopies > 1)
5248     {
5249       if (dump_enabled_p ())
5250         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5251                          "multiple types in double reduction\n");
5252
5253       return false;
5254     }
5255
5256   /* In case of widenning multiplication by a constant, we update the type
5257      of the constant to be the type of the other operand.  We check that the
5258      constant fits the type in the pattern recognition pass.  */
5259   if (code == DOT_PROD_EXPR
5260       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5261     {
5262       if (TREE_CODE (ops[0]) == INTEGER_CST)
5263         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5264       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5265         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5266       else
5267         {
5268           if (dump_enabled_p ())
5269             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5270                              "invalid types in dot-prod\n");
5271
5272           return false;
5273         }
5274     }
5275
5276   if (!vec_stmt) /* transformation not required.  */
5277     {
5278       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5279                                       reduc_index))
5280         return false;
5281       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5282       return true;
5283     }
5284
5285   /** Transform.  **/
5286
5287   if (dump_enabled_p ())
5288     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5289
5290   /* FORNOW: Multiple types are not supported for condition.  */
5291   if (code == COND_EXPR)
5292     gcc_assert (ncopies == 1);
5293
5294   /* Create the destination vector  */
5295   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5296
5297   /* In case the vectorization factor (VF) is bigger than the number
5298      of elements that we can fit in a vectype (nunits), we have to generate
5299      more than one vector stmt - i.e - we need to "unroll" the
5300      vector stmt by a factor VF/nunits.  For more details see documentation
5301      in vectorizable_operation.  */
5302
5303   /* If the reduction is used in an outer loop we need to generate
5304      VF intermediate results, like so (e.g. for ncopies=2):
5305         r0 = phi (init, r0)
5306         r1 = phi (init, r1)
5307         r0 = x0 + r0;
5308         r1 = x1 + r1;
5309     (i.e. we generate VF results in 2 registers).
5310     In this case we have a separate def-use cycle for each copy, and therefore
5311     for each copy we get the vector def for the reduction variable from the
5312     respective phi node created for this copy.
5313
5314     Otherwise (the reduction is unused in the loop nest), we can combine
5315     together intermediate results, like so (e.g. for ncopies=2):
5316         r = phi (init, r)
5317         r = x0 + r;
5318         r = x1 + r;
5319    (i.e. we generate VF/2 results in a single register).
5320    In this case for each copy we get the vector def for the reduction variable
5321    from the vectorized reduction operation generated in the previous iteration.
5322   */
5323
5324   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5325     {
5326       single_defuse_cycle = true;
5327       epilog_copies = 1;
5328     }
5329   else
5330     epilog_copies = ncopies;
5331
5332   prev_stmt_info = NULL;
5333   prev_phi_info = NULL;
5334   if (slp_node)
5335     {
5336       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5337       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5338                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5339     }
5340   else
5341     {
5342       vec_num = 1;
5343       vec_oprnds0.create (1);
5344       if (op_type == ternary_op)
5345         vec_oprnds1.create (1);
5346     }
5347
5348   phis.create (vec_num);
5349   vect_defs.create (vec_num);
5350   if (!slp_node)
5351     vect_defs.quick_push (NULL_TREE);
5352
5353   for (j = 0; j < ncopies; j++)
5354     {
5355       if (j == 0 || !single_defuse_cycle)
5356         {
5357           for (i = 0; i < vec_num; i++)
5358             {
5359               /* Create the reduction-phi that defines the reduction
5360                  operand.  */
5361               new_phi = create_phi_node (vec_dest, loop->header);
5362               set_vinfo_for_stmt (new_phi,
5363                                   new_stmt_vec_info (new_phi, loop_vinfo,
5364                                                      NULL));
5365                if (j == 0 || slp_node)
5366                  phis.quick_push (new_phi);
5367             }
5368         }
5369
5370       if (code == COND_EXPR)
5371         {
5372           gcc_assert (!slp_node);
5373           vectorizable_condition (stmt, gsi, vec_stmt,
5374                                   PHI_RESULT (phis[0]),
5375                                   reduc_index, NULL);
5376           /* Multiple types are not supported for condition.  */
5377           break;
5378         }
5379
5380       /* Handle uses.  */
5381       if (j == 0)
5382         {
5383           op0 = ops[!reduc_index];
5384           if (op_type == ternary_op)
5385             {
5386               if (reduc_index == 0)
5387                 op1 = ops[2];
5388               else
5389                 op1 = ops[1];
5390             }
5391
5392           if (slp_node)
5393             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5394                                slp_node, -1);
5395           else
5396             {
5397               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5398                                                             stmt, NULL);
5399               vec_oprnds0.quick_push (loop_vec_def0);
5400               if (op_type == ternary_op)
5401                {
5402                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5403                                                                NULL);
5404                  vec_oprnds1.quick_push (loop_vec_def1);
5405                }
5406             }
5407         }
5408       else
5409         {
5410           if (!slp_node)
5411             {
5412               enum vect_def_type dt;
5413               gimple dummy_stmt;
5414               tree dummy;
5415
5416               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5417                                   &dummy_stmt, &dummy, &dt);
5418               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5419                                                               loop_vec_def0);
5420               vec_oprnds0[0] = loop_vec_def0;
5421               if (op_type == ternary_op)
5422                 {
5423                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5424                                       &dummy, &dt);
5425                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5426                                                                 loop_vec_def1);
5427                   vec_oprnds1[0] = loop_vec_def1;
5428                 }
5429             }
5430
5431           if (single_defuse_cycle)
5432             reduc_def = gimple_assign_lhs (new_stmt);
5433
5434           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5435         }
5436
5437       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5438         {
5439           if (slp_node)
5440             reduc_def = PHI_RESULT (phis[i]);
5441           else
5442             {
5443               if (!single_defuse_cycle || j == 0)
5444                 reduc_def = PHI_RESULT (new_phi);
5445             }
5446
5447           def1 = ((op_type == ternary_op)
5448                   ? vec_oprnds1[i] : NULL);
5449           if (op_type == binary_op)
5450             {
5451               if (reduc_index == 0)
5452                 expr = build2 (code, vectype_out, reduc_def, def0);
5453               else
5454                 expr = build2 (code, vectype_out, def0, reduc_def);
5455             }
5456           else
5457             {
5458               if (reduc_index == 0)
5459                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5460               else
5461                 {
5462                   if (reduc_index == 1)
5463                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5464                   else
5465                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5466                 }
5467             }
5468
5469           new_stmt = gimple_build_assign (vec_dest, expr);
5470           new_temp = make_ssa_name (vec_dest, new_stmt);
5471           gimple_assign_set_lhs (new_stmt, new_temp);
5472           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5473
5474           if (slp_node)
5475             {
5476               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5477               vect_defs.quick_push (new_temp);
5478             }
5479           else
5480             vect_defs[0] = new_temp;
5481         }
5482
5483       if (slp_node)
5484         continue;
5485
5486       if (j == 0)
5487         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5488       else
5489         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5490
5491       prev_stmt_info = vinfo_for_stmt (new_stmt);
5492       prev_phi_info = vinfo_for_stmt (new_phi);
5493     }
5494
5495   /* Finalize the reduction-phi (set its arguments) and create the
5496      epilog reduction code.  */
5497   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5498     {
5499       new_temp = gimple_assign_lhs (*vec_stmt);
5500       vect_defs[0] = new_temp;
5501     }
5502
5503   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5504                                     epilog_reduc_code, phis, reduc_index,
5505                                     double_reduc, slp_node);
5506
5507   return true;
5508 }
5509
5510 /* Function vect_min_worthwhile_factor.
5511
5512    For a loop where we could vectorize the operation indicated by CODE,
5513    return the minimum vectorization factor that makes it worthwhile
5514    to use generic vectors.  */
5515 int
5516 vect_min_worthwhile_factor (enum tree_code code)
5517 {
5518   switch (code)
5519     {
5520     case PLUS_EXPR:
5521     case MINUS_EXPR:
5522     case NEGATE_EXPR:
5523       return 4;
5524
5525     case BIT_AND_EXPR:
5526     case BIT_IOR_EXPR:
5527     case BIT_XOR_EXPR:
5528     case BIT_NOT_EXPR:
5529       return 2;
5530
5531     default:
5532       return INT_MAX;
5533     }
5534 }
5535
5536
5537 /* Function vectorizable_induction
5538
5539    Check if PHI performs an induction computation that can be vectorized.
5540    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5541    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5542    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5543
5544 bool
5545 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5546                         gimple *vec_stmt)
5547 {
5548   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5549   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5550   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5551   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5552   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5553   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5554   tree vec_def;
5555
5556   gcc_assert (ncopies >= 1);
5557   /* FORNOW. These restrictions should be relaxed.  */
5558   if (nested_in_vect_loop_p (loop, phi))
5559     {
5560       imm_use_iterator imm_iter;
5561       use_operand_p use_p;
5562       gimple exit_phi;
5563       edge latch_e;
5564       tree loop_arg;
5565
5566       if (ncopies > 1)
5567         {
5568           if (dump_enabled_p ())
5569             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5570                              "multiple types in nested loop.\n");
5571           return false;
5572         }
5573
5574       exit_phi = NULL;
5575       latch_e = loop_latch_edge (loop->inner);
5576       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5577       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5578         {
5579           gimple use_stmt = USE_STMT (use_p);
5580           if (is_gimple_debug (use_stmt))
5581             continue;
5582
5583           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5584             {
5585               exit_phi = use_stmt;
5586               break;
5587             }
5588         }
5589       if (exit_phi)
5590         {
5591           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5592           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5593                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5594             {
5595               if (dump_enabled_p ())
5596                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5597                                  "inner-loop induction only used outside "
5598                                  "of the outer vectorized loop.\n");
5599               return false;
5600             }
5601         }
5602     }
5603
5604   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5605     return false;
5606
5607   /* FORNOW: SLP not supported.  */
5608   if (STMT_SLP_TYPE (stmt_info))
5609     return false;
5610
5611   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5612
5613   if (gimple_code (phi) != GIMPLE_PHI)
5614     return false;
5615
5616   if (!vec_stmt) /* transformation not required.  */
5617     {
5618       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5619       if (dump_enabled_p ())
5620         dump_printf_loc (MSG_NOTE, vect_location,
5621                          "=== vectorizable_induction ===\n");
5622       vect_model_induction_cost (stmt_info, ncopies);
5623       return true;
5624     }
5625
5626   /** Transform.  **/
5627
5628   if (dump_enabled_p ())
5629     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5630
5631   vec_def = get_initial_def_for_induction (phi);
5632   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5633   return true;
5634 }
5635
5636 /* Function vectorizable_live_operation.
5637
5638    STMT computes a value that is used outside the loop.  Check if
5639    it can be supported.  */
5640
5641 bool
5642 vectorizable_live_operation (gimple stmt,
5643                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5644                              gimple *vec_stmt)
5645 {
5646   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5647   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5648   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5649   int i;
5650   int op_type;
5651   tree op;
5652   tree def;
5653   gimple def_stmt;
5654   enum vect_def_type dt;
5655   enum tree_code code;
5656   enum gimple_rhs_class rhs_class;
5657
5658   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5659
5660   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5661     return false;
5662
5663   if (!is_gimple_assign (stmt))
5664     {
5665       if (gimple_call_internal_p (stmt)
5666           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5667           && gimple_call_lhs (stmt)
5668           && loop->simduid
5669           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5670           && loop->simduid
5671              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5672         {
5673           edge e = single_exit (loop);
5674           basic_block merge_bb = e->dest;
5675           imm_use_iterator imm_iter;
5676           use_operand_p use_p;
5677           tree lhs = gimple_call_lhs (stmt);
5678
5679           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5680             {
5681               gimple use_stmt = USE_STMT (use_p);
5682               if (gimple_code (use_stmt) == GIMPLE_PHI
5683                   && gimple_bb (use_stmt) == merge_bb)
5684                 {
5685                   if (vec_stmt)
5686                     {
5687                       tree vfm1
5688                         = build_int_cst (unsigned_type_node,
5689                                          loop_vinfo->vectorization_factor - 1);
5690                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5691                     }
5692                   return true;
5693                 }
5694             }
5695         }
5696
5697       return false;
5698     }
5699
5700   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5701     return false;
5702
5703   /* FORNOW. CHECKME. */
5704   if (nested_in_vect_loop_p (loop, stmt))
5705     return false;
5706
5707   code = gimple_assign_rhs_code (stmt);
5708   op_type = TREE_CODE_LENGTH (code);
5709   rhs_class = get_gimple_rhs_class (code);
5710   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5711   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5712
5713   /* FORNOW: support only if all uses are invariant.  This means
5714      that the scalar operations can remain in place, unvectorized.
5715      The original last scalar value that they compute will be used.  */
5716
5717   for (i = 0; i < op_type; i++)
5718     {
5719       if (rhs_class == GIMPLE_SINGLE_RHS)
5720         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5721       else
5722         op = gimple_op (stmt, i + 1);
5723       if (op
5724           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5725                                   &dt))
5726         {
5727           if (dump_enabled_p ())
5728             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5729                              "use not simple.\n");
5730           return false;
5731         }
5732
5733       if (dt != vect_external_def && dt != vect_constant_def)
5734         return false;
5735     }
5736
5737   /* No transformation is required for the cases we currently support.  */
5738   return true;
5739 }
5740
5741 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5742
5743 static void
5744 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5745 {
5746   ssa_op_iter op_iter;
5747   imm_use_iterator imm_iter;
5748   def_operand_p def_p;
5749   gimple ustmt;
5750
5751   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5752     {
5753       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5754         {
5755           basic_block bb;
5756
5757           if (!is_gimple_debug (ustmt))
5758             continue;
5759
5760           bb = gimple_bb (ustmt);
5761
5762           if (!flow_bb_inside_loop_p (loop, bb))
5763             {
5764               if (gimple_debug_bind_p (ustmt))
5765                 {
5766                   if (dump_enabled_p ())
5767                     dump_printf_loc (MSG_NOTE, vect_location,
5768                                      "killing debug use\n");
5769
5770                   gimple_debug_bind_reset_value (ustmt);
5771                   update_stmt (ustmt);
5772                 }
5773               else
5774                 gcc_unreachable ();
5775             }
5776         }
5777     }
5778 }
5779
5780
5781 /* This function builds ni_name = number of iterations.  Statements
5782    are emitted on the loop preheader edge.  */
5783
5784 static tree
5785 vect_build_loop_niters (loop_vec_info loop_vinfo)
5786 {
5787   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5788   if (TREE_CODE (ni) == INTEGER_CST)
5789     return ni;
5790   else
5791     {
5792       tree ni_name, var;
5793       gimple_seq stmts = NULL;
5794       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5795
5796       var = create_tmp_var (TREE_TYPE (ni), "niters");
5797       ni_name = force_gimple_operand (ni, &stmts, false, var);
5798       if (stmts)
5799         gsi_insert_seq_on_edge_immediate (pe, stmts);
5800
5801       return ni_name;
5802     }
5803 }
5804
5805
5806 /* This function generates the following statements:
5807
5808    ni_name = number of iterations loop executes
5809    ratio = ni_name / vf
5810    ratio_mult_vf_name = ratio * vf
5811
5812    and places them on the loop preheader edge.  */
5813
5814 static void
5815 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5816                                  tree ni_name,
5817                                  tree *ratio_mult_vf_name_ptr,
5818                                  tree *ratio_name_ptr)
5819 {
5820   tree ni_minus_gap_name;
5821   tree var;
5822   tree ratio_name;
5823   tree ratio_mult_vf_name;
5824   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5825   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5826   tree log_vf;
5827
5828   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5829
5830   /* If epilogue loop is required because of data accesses with gaps, we
5831      subtract one iteration from the total number of iterations here for
5832      correct calculation of RATIO.  */
5833   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5834     {
5835       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5836                                        ni_name,
5837                                        build_one_cst (TREE_TYPE (ni_name)));
5838       if (!is_gimple_val (ni_minus_gap_name))
5839         {
5840           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5841           gimple stmts = NULL;
5842           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5843                                                     true, var);
5844           gsi_insert_seq_on_edge_immediate (pe, stmts);
5845         }
5846     }
5847   else
5848     ni_minus_gap_name = ni_name;
5849
5850   /* Create: ratio = ni >> log2(vf) */
5851   /* ???  As we have ni == number of latch executions + 1, ni could
5852      have overflown to zero.  So avoid computing ratio based on ni
5853      but compute it using the fact that we know ratio will be at least
5854      one, thus via (ni - vf) >> log2(vf) + 1.  */
5855   ratio_name
5856     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5857                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5858                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5859                                              ni_minus_gap_name,
5860                                              build_int_cst
5861                                                (TREE_TYPE (ni_name), vf)),
5862                                 log_vf),
5863                    build_int_cst (TREE_TYPE (ni_name), 1));
5864   if (!is_gimple_val (ratio_name))
5865     {
5866       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5867       gimple stmts = NULL;
5868       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5869       gsi_insert_seq_on_edge_immediate (pe, stmts);
5870     }
5871   *ratio_name_ptr = ratio_name;
5872
5873   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5874
5875   if (ratio_mult_vf_name_ptr)
5876     {
5877       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5878                                         ratio_name, log_vf);
5879       if (!is_gimple_val (ratio_mult_vf_name))
5880         {
5881           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5882           gimple stmts = NULL;
5883           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5884                                                      true, var);
5885           gsi_insert_seq_on_edge_immediate (pe, stmts);
5886         }
5887       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5888     }
5889
5890   return;
5891 }
5892
5893
5894 /* Function vect_transform_loop.
5895
5896    The analysis phase has determined that the loop is vectorizable.
5897    Vectorize the loop - created vectorized stmts to replace the scalar
5898    stmts in the loop, and update the loop exit condition.  */
5899
5900 void
5901 vect_transform_loop (loop_vec_info loop_vinfo)
5902 {
5903   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5904   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5905   int nbbs = loop->num_nodes;
5906   int i;
5907   tree ratio = NULL;
5908   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5909   bool grouped_store;
5910   bool slp_scheduled = false;
5911   gimple stmt, pattern_stmt;
5912   gimple_seq pattern_def_seq = NULL;
5913   gimple_stmt_iterator pattern_def_si = gsi_none ();
5914   bool transform_pattern_stmt = false;
5915   bool check_profitability = false;
5916   int th;
5917   /* Record number of iterations before we started tampering with the profile. */
5918   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5919
5920   if (dump_enabled_p ())
5921     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5922
5923   /* If profile is inprecise, we have chance to fix it up.  */
5924   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5925     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5926
5927   /* Use the more conservative vectorization threshold.  If the number
5928      of iterations is constant assume the cost check has been performed
5929      by our caller.  If the threshold makes all loops profitable that
5930      run at least the vectorization factor number of times checking
5931      is pointless, too.  */
5932   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5933   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5934       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5935     {
5936       if (dump_enabled_p ())
5937         dump_printf_loc (MSG_NOTE, vect_location,
5938                          "Profitability threshold is %d loop iterations.\n",
5939                          th);
5940       check_profitability = true;
5941     }
5942
5943   /* Version the loop first, if required, so the profitability check
5944      comes first.  */
5945
5946   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5947       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5948     {
5949       vect_loop_versioning (loop_vinfo, th, check_profitability);
5950       check_profitability = false;
5951     }
5952
5953   tree ni_name = vect_build_loop_niters (loop_vinfo);
5954   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5955
5956   /* Peel the loop if there are data refs with unknown alignment.
5957      Only one data ref with unknown store is allowed.  */
5958
5959   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5960     {
5961       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5962                                      th, check_profitability);
5963       check_profitability = false;
5964       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5965          be re-computed.  */
5966       ni_name = NULL_TREE;
5967     }
5968
5969   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5970      compile time constant), or it is a constant that doesn't divide by the
5971      vectorization factor, then an epilog loop needs to be created.
5972      We therefore duplicate the loop: the original loop will be vectorized,
5973      and will compute the first (n/VF) iterations.  The second copy of the loop
5974      will remain scalar and will compute the remaining (n%VF) iterations.
5975      (VF is the vectorization factor).  */
5976
5977   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5978       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5979     {
5980       tree ratio_mult_vf;
5981       if (!ni_name)
5982         ni_name = vect_build_loop_niters (loop_vinfo);
5983       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5984                                        &ratio);
5985       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5986                                       th, check_profitability);
5987     }
5988   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5989     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5990                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5991   else
5992     {
5993       if (!ni_name)
5994         ni_name = vect_build_loop_niters (loop_vinfo);
5995       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5996     }
5997
5998   /* 1) Make sure the loop header has exactly two entries
5999      2) Make sure we have a preheader basic block.  */
6000
6001   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6002
6003   split_edge (loop_preheader_edge (loop));
6004
6005   /* FORNOW: the vectorizer supports only loops which body consist
6006      of one basic block (header + empty latch). When the vectorizer will
6007      support more involved loop forms, the order by which the BBs are
6008      traversed need to be reconsidered.  */
6009
6010   for (i = 0; i < nbbs; i++)
6011     {
6012       basic_block bb = bbs[i];
6013       stmt_vec_info stmt_info;
6014
6015       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6016            gsi_next (&si))
6017         {
6018           gphi *phi = si.phi ();
6019           if (dump_enabled_p ())
6020             {
6021               dump_printf_loc (MSG_NOTE, vect_location,
6022                                "------>vectorizing phi: ");
6023               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6024               dump_printf (MSG_NOTE, "\n");
6025             }
6026           stmt_info = vinfo_for_stmt (phi);
6027           if (!stmt_info)
6028             continue;
6029
6030           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6031             vect_loop_kill_debug_uses (loop, phi);
6032
6033           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6034               && !STMT_VINFO_LIVE_P (stmt_info))
6035             continue;
6036
6037           if (STMT_VINFO_VECTYPE (stmt_info)
6038               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6039                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6040               && dump_enabled_p ())
6041             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6042
6043           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6044             {
6045               if (dump_enabled_p ())
6046                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6047               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6048             }
6049         }
6050
6051       pattern_stmt = NULL;
6052       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6053            !gsi_end_p (si) || transform_pattern_stmt;)
6054         {
6055           bool is_store;
6056
6057           if (transform_pattern_stmt)
6058             stmt = pattern_stmt;
6059           else
6060             {
6061               stmt = gsi_stmt (si);
6062               /* During vectorization remove existing clobber stmts.  */
6063               if (gimple_clobber_p (stmt))
6064                 {
6065                   unlink_stmt_vdef (stmt);
6066                   gsi_remove (&si, true);
6067                   release_defs (stmt);
6068                   continue;
6069                 }
6070             }
6071
6072           if (dump_enabled_p ())
6073             {
6074               dump_printf_loc (MSG_NOTE, vect_location,
6075                                "------>vectorizing statement: ");
6076               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6077               dump_printf (MSG_NOTE, "\n");
6078             }
6079
6080           stmt_info = vinfo_for_stmt (stmt);
6081
6082           /* vector stmts created in the outer-loop during vectorization of
6083              stmts in an inner-loop may not have a stmt_info, and do not
6084              need to be vectorized.  */
6085           if (!stmt_info)
6086             {
6087               gsi_next (&si);
6088               continue;
6089             }
6090
6091           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6092             vect_loop_kill_debug_uses (loop, stmt);
6093
6094           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6095               && !STMT_VINFO_LIVE_P (stmt_info))
6096             {
6097               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6098                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6099                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6100                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6101                 {
6102                   stmt = pattern_stmt;
6103                   stmt_info = vinfo_for_stmt (stmt);
6104                 }
6105               else
6106                 {
6107                   gsi_next (&si);
6108                   continue;
6109                 }
6110             }
6111           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6112                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6113                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6114                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6115             transform_pattern_stmt = true;
6116
6117           /* If pattern statement has def stmts, vectorize them too.  */
6118           if (is_pattern_stmt_p (stmt_info))
6119             {
6120               if (pattern_def_seq == NULL)
6121                 {
6122                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6123                   pattern_def_si = gsi_start (pattern_def_seq);
6124                 }
6125               else if (!gsi_end_p (pattern_def_si))
6126                 gsi_next (&pattern_def_si);
6127               if (pattern_def_seq != NULL)
6128                 {
6129                   gimple pattern_def_stmt = NULL;
6130                   stmt_vec_info pattern_def_stmt_info = NULL;
6131
6132                   while (!gsi_end_p (pattern_def_si))
6133                     {
6134                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6135                       pattern_def_stmt_info
6136                         = vinfo_for_stmt (pattern_def_stmt);
6137                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6138                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6139                         break;
6140                       gsi_next (&pattern_def_si);
6141                     }
6142
6143                   if (!gsi_end_p (pattern_def_si))
6144                     {
6145                       if (dump_enabled_p ())
6146                         {
6147                           dump_printf_loc (MSG_NOTE, vect_location,
6148                                            "==> vectorizing pattern def "
6149                                            "stmt: ");
6150                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6151                                             pattern_def_stmt, 0);
6152                           dump_printf (MSG_NOTE, "\n");
6153                         }
6154
6155                       stmt = pattern_def_stmt;
6156                       stmt_info = pattern_def_stmt_info;
6157                     }
6158                   else
6159                     {
6160                       pattern_def_si = gsi_none ();
6161                       transform_pattern_stmt = false;
6162                     }
6163                 }
6164               else
6165                 transform_pattern_stmt = false;
6166             }
6167
6168           if (STMT_VINFO_VECTYPE (stmt_info))
6169             {
6170               unsigned int nunits
6171                 = (unsigned int)
6172                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6173               if (!STMT_SLP_TYPE (stmt_info)
6174                   && nunits != (unsigned int) vectorization_factor
6175                   && dump_enabled_p ())
6176                   /* For SLP VF is set according to unrolling factor, and not
6177                      to vector size, hence for SLP this print is not valid.  */
6178                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6179             }
6180
6181           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6182              reached.  */
6183           if (STMT_SLP_TYPE (stmt_info))
6184             {
6185               if (!slp_scheduled)
6186                 {
6187                   slp_scheduled = true;
6188
6189                   if (dump_enabled_p ())
6190                     dump_printf_loc (MSG_NOTE, vect_location,
6191                                      "=== scheduling SLP instances ===\n");
6192
6193                   vect_schedule_slp (loop_vinfo, NULL);
6194                 }
6195
6196               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6197               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6198                 {
6199                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6200                     {
6201                       pattern_def_seq = NULL;
6202                       gsi_next (&si);
6203                     }
6204                   continue;
6205                 }
6206             }
6207
6208           /* -------- vectorize statement ------------ */
6209           if (dump_enabled_p ())
6210             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6211
6212           grouped_store = false;
6213           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6214           if (is_store)
6215             {
6216               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6217                 {
6218                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6219                      interleaving chain was completed - free all the stores in
6220                      the chain.  */
6221                   gsi_next (&si);
6222                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6223                 }
6224               else
6225                 {
6226                   /* Free the attached stmt_vec_info and remove the stmt.  */
6227                   gimple store = gsi_stmt (si);
6228                   free_stmt_vec_info (store);
6229                   unlink_stmt_vdef (store);
6230                   gsi_remove (&si, true);
6231                   release_defs (store);
6232                 }
6233
6234               /* Stores can only appear at the end of pattern statements.  */
6235               gcc_assert (!transform_pattern_stmt);
6236               pattern_def_seq = NULL;
6237             }
6238           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6239             {
6240               pattern_def_seq = NULL;
6241               gsi_next (&si);
6242             }
6243         }                       /* stmts in BB */
6244     }                           /* BBs in loop */
6245
6246   slpeel_make_loop_iterate_ntimes (loop, ratio);
6247
6248   /* Reduce loop iterations by the vectorization factor.  */
6249   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6250                       expected_iterations / vectorization_factor);
6251   loop->nb_iterations_upper_bound
6252     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6253   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6254       && loop->nb_iterations_upper_bound != 0)
6255     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6256   if (loop->any_estimate)
6257     {
6258       loop->nb_iterations_estimate
6259         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6260        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6261            && loop->nb_iterations_estimate != 0)
6262          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6263     }
6264
6265   if (dump_enabled_p ())
6266     {
6267       dump_printf_loc (MSG_NOTE, vect_location,
6268                        "LOOP VECTORIZED\n");
6269       if (loop->inner)
6270         dump_printf_loc (MSG_NOTE, vect_location,
6271                          "OUTER LOOP VECTORIZED\n");
6272       dump_printf (MSG_NOTE, "\n");
6273     }
6274 }