gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
90 /* Loop Vectorization Pass.
92 This pass tries to vectorize loops.
94 For example, the vectorizer transforms the following simple loop:
96 short a[N]; short b[N]; short c[N]; int i;
98 for (i=0; i<N; i++){
99 a[i] = b[i] + c[i];
102 as if it was manually vectorized by rewriting the source code into:
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107 v8hi va, vb, vc;
109 for (i=0; i<N/8; i++){
110 vb = pb[i];
111 vc = pc[i];
112 va = vb + vc;
113 pa[i] = va;
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
128 Analysis phase:
129 ===============
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
150 For example, say stmt S1 was vectorized into stmt VS1:
152 VS1: vb = px[i];
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154 S2: a = b;
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
161 VS1: vb = px[i];
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163 VS2: va = vb;
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
166 Operands that are not SSA_NAMEs are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
169 Target modeling:
170 =================
171 Currently the only target specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different sizes of vectors will, for now, need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
177 Since we only vectorize operations whose vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt.
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
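/* Illustrative sketch only (not part of this file): the kind of optab query
   the "Target modeling" paragraph above describes, assuming the usual optab
   API (optab_handler, CODE_FOR_nothing) and the V8HImode addition from the
   example:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   -- no target support, the stmt cannot be vectorized.  */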
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
190 /* Function vect_determine_vectorization_factor
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4-byte elements,
195 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
196 elements can fit in a single vector register.
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
201 in the loop.
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
204 original loop:
205 for (i=0; i<N; i++){
206 a[i] = b[i] + c[i];
209 vectorized loop:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
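/* Worked example of the arithmetic above (illustrative values): with a
   16-byte vector size and 4-byte elements, VF = 16 / 4 = 4, so the loop is
   strip-mined by 4.  With 16-byte vectors and 2-byte shorts, VF = 16 / 2 = 8,
   which matches the v8hi example in the file comment.  */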
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220 int nbbs = loop->num_nodes;
221 unsigned int vectorization_factor = 0;
222 tree scalar_type;
223 gphi *phi;
224 tree vectype;
225 unsigned int nunits;
226 stmt_vec_info stmt_info;
227 int i;
228 HOST_WIDE_INT dummy;
229 gimple stmt, pattern_stmt = NULL;
230 gimple_seq pattern_def_seq = NULL;
231 gimple_stmt_iterator pattern_def_si = gsi_none ();
232 bool analyze_pattern_stmt = false;
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "=== vect_determine_vectorization_factor ===\n");
238 for (i = 0; i < nbbs; i++)
240 basic_block bb = bbs[i];
242 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 gsi_next (&si))
245 phi = si.phi ();
246 stmt_info = vinfo_for_stmt (phi);
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 dump_printf (MSG_NOTE, "\n");
254 gcc_assert (stmt_info);
256 if (STMT_VINFO_RELEVANT_P (stmt_info))
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259 scalar_type = TREE_TYPE (PHI_RESULT (phi));
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266 dump_printf (MSG_NOTE, "\n");
269 vectype = get_vectype_for_scalar_type (scalar_type);
270 if (!vectype)
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "not vectorized: unsupported "
276 "data-type ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 scalar_type);
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
281 return false;
283 STMT_VINFO_VECTYPE (stmt_info) = vectype;
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289 dump_printf (MSG_NOTE, "\n");
292 nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295 nunits);
297 if (!vectorization_factor
298 || (nunits > vectorization_factor))
299 vectorization_factor = nunits;
303 for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 !gsi_end_p (si) || analyze_pattern_stmt;)
306 tree vf_vectype;
308 if (analyze_pattern_stmt)
309 stmt = pattern_stmt;
310 else
311 stmt = gsi_stmt (si);
313 stmt_info = vinfo_for_stmt (stmt);
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320 dump_printf (MSG_NOTE, "\n");
323 gcc_assert (stmt_info);
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 && !STMT_VINFO_LIVE_P (stmt_info))
328 || gimple_clobber_p (stmt))
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
335 stmt = pattern_stmt;
336 stmt_info = vinfo_for_stmt (pattern_stmt);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342 dump_printf (MSG_NOTE, "\n");
345 else
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349 gsi_next (&si);
350 continue;
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357 analyze_pattern_stmt = true;
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info))
362 if (pattern_def_seq == NULL)
364 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 pattern_def_si = gsi_start (pattern_def_seq);
367 else if (!gsi_end_p (pattern_def_si))
368 gsi_next (&pattern_def_si);
369 if (pattern_def_seq != NULL)
371 gimple pattern_def_stmt = NULL;
372 stmt_vec_info pattern_def_stmt_info = NULL;
374 while (!gsi_end_p (pattern_def_si))
376 pattern_def_stmt = gsi_stmt (pattern_def_si);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 break;
382 gsi_next (&pattern_def_si);
385 if (!gsi_end_p (pattern_def_si))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392 pattern_def_stmt, 0);
393 dump_printf (MSG_NOTE, "\n");
396 stmt = pattern_def_stmt;
397 stmt_info = pattern_def_stmt_info;
399 else
401 pattern_def_si = gsi_none ();
402 analyze_pattern_stmt = false;
405 else
406 analyze_pattern_stmt = false;
409 if (gimple_get_lhs (stmt) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt)
412 || !gimple_call_internal_p (stmt)
413 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
415 if (is_gimple_call (stmt))
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and the vectorization factor
419 they really need can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
423 pattern_def_seq = NULL;
424 gsi_next (&si);
426 continue;
428 if (dump_enabled_p ())
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 return false;
451 if (STMT_VINFO_VECTYPE (stmt_info))
453 /* The only case in which a vectype has already been set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
456 idiom). */
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 || is_pattern_stmt_p (stmt_info)
459 || !gsi_end_p (pattern_def_si));
460 vectype = STMT_VINFO_VECTYPE (stmt_info);
462 else
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 if (is_gimple_call (stmt)
466 && gimple_call_internal_p (stmt)
467 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 else
470 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476 dump_printf (MSG_NOTE, "\n");
478 vectype = get_vectype_for_scalar_type (scalar_type);
479 if (!vectype)
481 if (dump_enabled_p ())
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 "not vectorized: unsupported "
485 "data-type ");
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487 scalar_type);
488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
490 return false;
493 STMT_VINFO_VECTYPE (stmt_info) = vectype;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499 dump_printf (MSG_NOTE, "\n");
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 return true;
586 /* Function vect_is_simple_iv_evolution.
588 FORNOW: A simple evolution of an induction variable in the loop is
589 considered a polynomial evolution. */
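/* Illustrative examples (assuming the usual chrec notation of the scalar
   evolution analyzer): for

     for (i = init; i < n; i += step)

   the evolution of i in the loop is {init, +, step}_loop, whose evolution
   part is STEP; that is "simple".  An evolution part that is itself a chrec,
   e.g. {0, +, {1, +, 1}_1}_1 (a polynomial of degree 2), is rejected by the
   function below.  */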
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593 tree * step)
595 tree init_expr;
596 tree step_expr;
597 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598 basic_block bb;
600 /* When there is no evolution in this loop, the evolution function
601 is not "simple". */
602 if (evolution_part == NULL_TREE)
603 return false;
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part))
608 return false;
610 step_expr = evolution_part;
611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617 dump_printf (MSG_NOTE, ", init: ");
618 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619 dump_printf (MSG_NOTE, "\n");
622 *init = init_expr;
623 *step = step_expr;
625 if (TREE_CODE (step_expr) != INTEGER_CST
626 && (TREE_CODE (step_expr) != SSA_NAME
627 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 || !flag_associative_math)))
632 && (TREE_CODE (step_expr) != REAL_CST
633 || !flag_associative_math))
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637 "step unknown.\n");
638 return false;
641 return true;
644 /* Function vect_analyze_scalar_cycles_1.
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
649 enclosing LOOP). */
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
654 basic_block bb = loop->header;
655 tree init, step;
656 auto_vec<gimple, 64> worklist;
657 gphi_iterator gsi;
658 bool double_reduc;
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "=== vect_analyze_scalar_cycles ===\n");
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified, therefore, this order must not be
666 changed. */
667 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
669 gphi *phi = gsi.phi ();
670 tree access_fn = NULL;
671 tree def = PHI_RESULT (phi);
672 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678 dump_printf (MSG_NOTE, "\n");
681 /* Skip virtual phi's. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def))
684 continue;
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
688 /* Analyze the evolution function. */
689 access_fn = analyze_scalar_evolution (loop, def);
690 if (access_fn)
692 STRIP_NOPS (access_fn);
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698 dump_printf (MSG_NOTE, "\n");
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 = evolution_part_in_loop_num (access_fn, loop->num);
704 if (!access_fn
705 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 && TREE_CODE (step) != INTEGER_CST))
709 worklist.safe_push (phi);
710 continue;
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist.length () > 0)
724 gimple phi = worklist.pop ();
725 tree def = PHI_RESULT (phi);
726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727 gimple reduc_stmt;
728 bool nested_cycle;
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734 dump_printf (MSG_NOTE, "\n");
737 gcc_assert (!virtual_operand_p (def)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
740 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 &double_reduc);
743 if (reduc_stmt)
745 if (double_reduc)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected double reduction.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_double_reduction_def;
755 else
757 if (nested_cycle)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE, vect_location,
761 "Detected vectorizable nested cycle.\n");
763 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765 vect_nested_cycle;
767 else
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "Detected reduction.\n");
773 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775 vect_reduction_def;
776 /* Store the reduction cycles for possible vectorization in
777 loop-aware SLP. */
778 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
782 else
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 "Unknown def-use cycle pattern.\n");
790 /* Function vect_analyze_scalar_cycles.
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also for its
796 inner-loop, if it exists.
797 Examples for scalar cycles:
799 Example1: reduction:
801 loop1:
802 for (i=0; i<N; i++)
803 sum += a[i];
805 Example2: induction:
807 loop2:
808 for (i=0; i<N; i++)
809 a[i] = i; */
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
823 vectorizing them.
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
827 if (loop->inner)
828 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
832 /* Function vect_get_loop_niters.
834 Determine how many iterations the loop is executed and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
838 Return the loop exit condition. */
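/* Illustrative numbers (assumed): for a do-while form whose body executes 4
   times, the latch edge is taken 3 times, so NUMBER_OF_ITERATIONSM1 is 3 and
   NUMBER_OF_ITERATIONS is 3 + 1 = 4 header executions, computed by the
   PLUS_EXPR folding below.  */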
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 tree *number_of_iterationsm1)
845 tree niters;
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE, vect_location,
849 "=== get_loop_niters ===\n");
851 niters = number_of_latch_executions (loop);
852 *number_of_iterationsm1 = niters;
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters && !chrec_contains_undetermined (niters))
859 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 build_int_cst (TREE_TYPE (niters), 1));
861 *number_of_iterations = niters;
863 return get_loop_exit_condition (loop);
867 /* Function bb_in_loop_p
869 Used as predicate for dfs order traversal of the loop bbs. */
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
874 const struct loop *const loop = (const struct loop *)data;
875 if (flow_bb_inside_loop_p (loop, bb))
876 return true;
877 return false;
881 /* Function new_loop_vec_info.
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
889 loop_vec_info res;
890 basic_block *bbs;
891 gimple_stmt_iterator si;
892 unsigned int i, nbbs;
894 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895 LOOP_VINFO_LOOP (res) = loop;
897 bbs = get_loop_body (loop);
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i = 0; i < loop->num_nodes; i++)
902 basic_block bb = bbs[i];
904 /* BBs in a nested inner-loop will have been already processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb->loop_father != loop)
913 /* Inner-loop bb. */
914 gcc_assert (loop->inner && bb->loop_father == loop->inner);
915 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
917 gimple phi = gsi_stmt (si);
918 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919 loop_vec_info inner_loop_vinfo =
920 STMT_VINFO_LOOP_VINFO (stmt_info);
921 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
924 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
926 gimple stmt = gsi_stmt (si);
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 loop_vec_info inner_loop_vinfo =
929 STMT_VINFO_LOOP_VINFO (stmt_info);
930 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
934 else
936 /* bb in current nest. */
937 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
939 gimple phi = gsi_stmt (si);
940 gimple_set_uid (phi, 0);
941 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
944 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
946 gimple stmt = gsi_stmt (si);
947 gimple_set_uid (stmt, 0);
948 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would be the same
956 as reversed postorder traversal, so we are safe. */
958 free (bbs);
959 bbs = XCNEWVEC (basic_block, loop->num_nodes);
960 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961 bbs, loop->num_nodes, loop);
962 gcc_assert (nbbs == loop->num_nodes);
964 LOOP_VINFO_BBS (res) = bbs;
965 LOOP_VINFO_NITERSM1 (res) = NULL;
966 LOOP_VINFO_NITERS (res) = NULL;
967 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972 LOOP_VINFO_VECT_FACTOR (res) = 0;
973 LOOP_VINFO_LOOP_NEST (res).create (3);
974 LOOP_VINFO_DATAREFS (res).create (10);
975 LOOP_VINFO_DDRS (res).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981 LOOP_VINFO_GROUPED_STORES (res).create (10);
982 LOOP_VINFO_REDUCTIONS (res).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
991 return res;
995 /* Function destroy_loop_vec_info.
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1003 struct loop *loop;
1004 basic_block *bbs;
1005 int nbbs;
1006 gimple_stmt_iterator si;
1007 int j;
1008 vec<slp_instance> slp_instances;
1009 slp_instance instance;
1010 bool swapped;
1012 if (!loop_vinfo)
1013 return;
1015 loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 bbs = LOOP_VINFO_BBS (loop_vinfo);
1018 nbbs = clean_stmts ? loop->num_nodes : 0;
1019 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1021 for (j = 0; j < nbbs; j++)
1023 basic_block bb = bbs[j];
1024 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025 free_stmt_vec_info (gsi_stmt (si));
1027 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1029 gimple stmt = gsi_stmt (si);
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
1033 if (swapped && is_gimple_assign (stmt))
1035 enum tree_code code = gimple_assign_rhs_code (stmt);
1037 if ((code == PLUS_EXPR
1038 || code == POINTER_PLUS_EXPR
1039 || code == MULT_EXPR)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 swap_ssa_operands (stmt,
1042 gimple_assign_rhs1_ptr (stmt),
1043 gimple_assign_rhs2_ptr (stmt));
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt);
1048 gsi_next (&si);
1052 free (LOOP_VINFO_BBS (loop_vinfo));
1053 vect_destroy_datarefs (loop_vinfo, NULL);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060 vect_free_slp_instance (instance);
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1072 free (loop_vinfo);
1073 loop->aux = NULL;
1077 /* Function vect_analyze_loop_1.
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "===== analyze_loop_nest_1 =====\n");
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1096 loop_vinfo = vect_analyze_loop_form (loop);
1097 if (!loop_vinfo)
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "bad inner-loop form.\n");
1102 return NULL;
1105 return loop_vinfo;
1109 /* Function vect_analyze_loop_form.
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
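/* Illustrative source-level examples (assumed, not taken from this file):
   an inner-most loop of the accepted form has exactly two basic blocks
   (header and latch), e.g.

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   whereas an if that is still present in the body adds basic blocks and
   trips the "control flow in loop" check below.  */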
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1120 loop_vec_info loop_vinfo;
1121 gcond *loop_cond;
1122 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123 loop_vec_info inner_loop_vinfo = NULL;
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "=== vect_analyze_loop_form ===\n");
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1133 if (!loop->inner)
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
1137 look like this:
1139 (pre-header)
1141 header <--------+
1142 | | |
1143 | +--> latch --+
1145 (exit-bb) */
1147 if (loop->num_nodes != 2)
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "not vectorized: control flow in loop.\n");
1152 return NULL;
1155 if (empty_block_p (loop->header))
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "not vectorized: empty loop.\n");
1160 return NULL;
1163 else
1165 struct loop *innerloop = loop->inner;
1166 edge entryedge;
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1172 (pre-header)
1174 header <---+
1176 inner-loop |
1178 tail ------+
1180 (exit-bb)
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1185 if ((loop->inner)->inner || (loop->inner)->next)
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: multiple nested loops.\n");
1190 return NULL;
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195 if (!inner_loop_vinfo)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: Bad inner loop.\n");
1200 return NULL;
1203 if (!expr_invariant_in_loop_p (loop,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: inner-loop count not"
1209 " invariant.\n");
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (loop->num_nodes != 5)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1223 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224 entryedge = EDGE_PRED (innerloop->header, 0);
1225 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 entryedge = EDGE_PRED (innerloop->header, 1);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo, true);
1236 return NULL;
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Considering outer-loop vectorization.\n");
1244 if (!single_exit (loop)
1245 || EDGE_COUNT (loop->header->preds) != 2)
1247 if (dump_enabled_p ())
1249 if (!single_exit (loop))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop->header->preds) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "not vectorized: too many incoming edges.\n");
1256 if (inner_loop_vinfo)
1257 destroy_loop_vec_info (inner_loop_vinfo, true);
1258 return NULL;
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo)
1272 destroy_loop_vec_info (inner_loop_vinfo, true);
1273 return NULL;
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop)->dest))
1279 edge e = single_exit (loop);
1280 if (!(e->flags & EDGE_ABNORMAL))
1282 split_loop_exit_edge (e);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE, "split exit edge.\n");
1286 else
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo)
1292 destroy_loop_vec_info (inner_loop_vinfo, true);
1293 return NULL;
1297 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 &number_of_iterationsm1);
1299 if (!loop_cond)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo)
1305 destroy_loop_vec_info (inner_loop_vinfo, true);
1306 return NULL;
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations))
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "not vectorized: number of iterations cannot be "
1315 "computed.\n");
1316 if (inner_loop_vinfo)
1317 destroy_loop_vec_info (inner_loop_vinfo, true);
1318 return NULL;
1321 if (integer_zerop (number_of_iterations))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo)
1327 destroy_loop_vec_info (inner_loop_vinfo, true);
1328 return NULL;
1331 loop_vinfo = new_loop_vec_info (loop);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1338 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_NOTE, vect_location,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343 dump_printf (MSG_NOTE, "\n");
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1349 /* CHECKME: May want to keep it around it in the future. */
1350 if (inner_loop_vinfo)
1351 destroy_loop_vec_info (inner_loop_vinfo, false);
1353 gcc_assert (!loop->aux);
1354 loop->aux = loop_vinfo;
1355 return loop_vinfo;
1358 /* Scan the loop stmts and, depending on whether there are any non-SLP
1359 statements, update the vectorization factor. */
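/* Worked example of the update (illustrative values): if the loop VF
   computed so far is 4 and the SLP instances need an unrolling factor of 2,
   a hybrid loop gets VF = least_common_multiple (4, 2) = 4; with an SLP
   unrolling factor of 3 it would become 12.  A pure-SLP loop simply takes
   the SLP unrolling factor as its VF.  */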
1361 static void
1362 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1365 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1366 int nbbs = loop->num_nodes;
1367 unsigned int vectorization_factor;
1368 int i;
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_NOTE, vect_location,
1372 "=== vect_update_vf_for_slp ===\n");
1374 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1375 gcc_assert (vectorization_factor != 0);
1377 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1378 vectorization factor of the loop is the unrolling factor required by
1379 the SLP instances. If that unrolling factor is 1, we say that we
1380 perform pure SLP on the loop; cross-iteration parallelism is not
1381 exploited. */
1382 bool only_slp_in_loop = true;
1383 for (i = 0; i < nbbs; i++)
1385 basic_block bb = bbs[i];
1386 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1387 gsi_next (&si))
1389 gimple stmt = gsi_stmt (si);
1390 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1392 && STMT_VINFO_RELATED_STMT (stmt_info))
1394 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1395 stmt_info = vinfo_for_stmt (stmt);
1397 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 && !PURE_SLP_STMT (stmt_info))
1400 /* STMT needs both SLP and loop-based vectorization. */
1401 only_slp_in_loop = false;
1405 if (only_slp_in_loop)
1406 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1407 else
1408 vectorization_factor
1409 = least_common_multiple (vectorization_factor,
1410 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1412 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Updating vectorization factor to %d\n",
1416 vectorization_factor);
1419 /* Function vect_analyze_loop_operations.
1421 Scan the loop stmts and make sure they are all vectorizable. */
1423 static bool
1424 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1426 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1427 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1428 int nbbs = loop->num_nodes;
1429 unsigned int vectorization_factor;
1430 int i;
1431 stmt_vec_info stmt_info;
1432 bool need_to_vectorize = false;
1433 int min_profitable_iters;
1434 int min_scalar_loop_bound;
1435 unsigned int th;
1436 bool ok;
1437 HOST_WIDE_INT max_niter;
1438 HOST_WIDE_INT estimated_niter;
1439 int min_profitable_estimate;
1441 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_NOTE, vect_location,
1443 "=== vect_analyze_loop_operations ===\n");
1445 for (i = 0; i < nbbs; i++)
1447 basic_block bb = bbs[i];
1449 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1450 gsi_next (&si))
1452 gphi *phi = si.phi ();
1453 ok = true;
1455 stmt_info = vinfo_for_stmt (phi);
1456 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1459 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1460 dump_printf (MSG_NOTE, "\n");
1463 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1464 (i.e., a phi in the tail of the outer-loop). */
1465 if (! is_loop_header_bb_p (bb))
1467 /* FORNOW: we currently don't support the case that these phis
1468 are not used in the outer loop (unless it is a double reduction,
1469 i.e., this phi is vect_reduction_def), because this case
1470 requires actually doing something here. */
1471 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1472 || STMT_VINFO_LIVE_P (stmt_info))
1473 && STMT_VINFO_DEF_TYPE (stmt_info)
1474 != vect_double_reduction_def)
1476 if (dump_enabled_p ())
1477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1478 "Unsupported loop-closed phi in "
1479 "outer-loop.\n");
1480 return false;
1483 /* If PHI is used in the outer loop, we check that its operand
1484 is defined in the inner loop. */
1485 if (STMT_VINFO_RELEVANT_P (stmt_info))
1487 tree phi_op;
1488 gimple op_def_stmt;
1490 if (gimple_phi_num_args (phi) != 1)
1491 return false;
1493 phi_op = PHI_ARG_DEF (phi, 0);
1494 if (TREE_CODE (phi_op) != SSA_NAME)
1495 return false;
1497 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1498 if (gimple_nop_p (op_def_stmt)
1499 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1500 || !vinfo_for_stmt (op_def_stmt))
1501 return false;
1503 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1504 != vect_used_in_outer
1505 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1506 != vect_used_in_outer_by_reduction)
1507 return false;
1510 continue;
1513 gcc_assert (stmt_info);
1515 if (STMT_VINFO_LIVE_P (stmt_info))
1517 /* FORNOW: not yet supported. */
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1520 "not vectorized: value used after loop.\n");
1521 return false;
1524 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1525 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1527 /* A scalar-dependence cycle that we don't support. */
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: scalar dependence cycle.\n");
1531 return false;
1534 if (STMT_VINFO_RELEVANT_P (stmt_info))
1536 need_to_vectorize = true;
1537 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1538 ok = vectorizable_induction (phi, NULL, NULL);
1541 if (!ok)
1543 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "not vectorized: relevant phi not "
1547 "supported: ");
1548 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1549 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1551 return false;
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1558 gimple stmt = gsi_stmt (si);
1559 if (!gimple_clobber_p (stmt)
1560 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1561 return false;
1563 } /* bbs */
1565 /* All operations in the loop are either irrelevant (they deal with loop
1566 control, or are dead), or only used outside the loop and can be moved
1567 out of the loop (e.g. invariants, inductions). The loop can be
1568 optimized away by scalar optimizations. We're better off not
1569 touching this loop. */
1570 if (!need_to_vectorize)
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "All the computation can be taken out of the loop.\n");
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: redundant loop. no profit to "
1578 "vectorize.\n");
1579 return false;
1582 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1583 gcc_assert (vectorization_factor != 0);
1585 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1586 dump_printf_loc (MSG_NOTE, vect_location,
1587 "vectorization_factor = %d, niters = "
1588 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1589 LOOP_VINFO_INT_NITERS (loop_vinfo));
1591 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1592 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1593 || ((max_niter = max_stmt_executions_int (loop)) != -1
1594 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1596 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598 "not vectorized: iteration count too small.\n");
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: iteration count smaller than "
1602 "vectorization factor.\n");
1603 return false;
1606 /* Analyze cost. Decide if worth while to vectorize. */
1608 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1609 &min_profitable_estimate);
1610 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1612 if (min_profitable_iters < 0)
1614 if (dump_enabled_p ())
1615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1616 "not vectorized: vectorization not profitable.\n");
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1619 "not vectorized: vector version will never be "
1620 "profitable.\n");
1621 return false;
1624 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1625 * vectorization_factor) - 1);
1628 /* Use the cost model only if it is more conservative than the user-specified
1629 threshold. */
1631 th = (unsigned) min_scalar_loop_bound;
1632 if (min_profitable_iters
1633 && (!min_scalar_loop_bound
1634 || min_profitable_iters > min_scalar_loop_bound))
1635 th = (unsigned) min_profitable_iters;
1637 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1640 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: vectorization not profitable.\n");
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_NOTE, vect_location,
1647 "not vectorized: iteration count smaller than user "
1648 "specified loop bound parameter or minimum profitable "
1649 "iterations (whichever is more conservative).\n");
1650 return false;
1653 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1654 && ((unsigned HOST_WIDE_INT) estimated_niter
1655 <= MAX (th, (unsigned)min_profitable_estimate)))
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1659 "not vectorized: estimated iteration count too "
1660 "small.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_NOTE, vect_location,
1663 "not vectorized: estimated iteration count smaller "
1664 "than specified loop bound parameter or minimum "
1665 "profitable iterations (whichever is more "
1666 "conservative).\n");
1667 return false;
1670 return true;
1674 /* Function vect_analyze_loop_2.
1676 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1677 for it. The different analyses will record information in the
1678 loop_vec_info struct. */
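/* Rough order of the analyses below (summary of the calls that follow):
   data references -> scalar cycles -> pattern recognition -> access
   patterns -> relevance marking -> dependences -> vectorization factor ->
   alignment -> runtime alias pruning -> alignment enhancement -> SLP ->
   remaining loop operations -> epilogue decision.  */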
1679 static bool
1680 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1682 bool ok;
1683 int max_vf = MAX_VECTORIZATION_FACTOR;
1684 int min_vf = 2;
1685 unsigned int th;
1686 unsigned int n_stmts = 0;
1688 /* Find all data references in the loop (which correspond to vdefs/vuses)
1689 and analyze their evolution in the loop. Also adjust the minimal
1690 vectorization factor according to the loads and stores.
1692 FORNOW: Handle only simple array references whose
1693 alignment can be forced, and aligned pointer references. */
1695 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1696 if (!ok)
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700 "bad data references.\n");
1701 return false;
1704 /* Classify all cross-iteration scalar data-flow cycles.
1705 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1707 vect_analyze_scalar_cycles (loop_vinfo);
1709 vect_pattern_recog (loop_vinfo, NULL);
1711 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1712 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1714 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1715 if (!ok)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "bad data access.\n");
1720 return false;
1723 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1725 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1726 if (!ok)
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "unexpected pattern.\n");
1731 return false;
1734 /* Analyze data dependences between the data-refs in the loop
1735 and adjust the maximum vectorization factor according to
1736 the dependences.
1737 FORNOW: fail at the first data dependence that we encounter. */
1739 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1740 if (!ok
1741 || max_vf < min_vf)
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745 "bad data dependence.\n");
1746 return false;
1749 ok = vect_determine_vectorization_factor (loop_vinfo);
1750 if (!ok)
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754 "can't determine vectorization factor.\n");
1755 return false;
1757 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "bad data dependence.\n");
1762 return false;
1765 /* Analyze the alignment of the data-refs in the loop.
1766 Fail if a data reference is found that cannot be vectorized. */
1768 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1769 if (!ok)
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "bad data alignment.\n");
1774 return false;
1777 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1778 It is important to call pruning after vect_analyze_data_ref_accesses,
1779 since we use grouping information gathered by interleaving analysis. */
1780 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1781 if (!ok)
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "number of versioning for alias "
1786 "run-time tests exceeds %d "
1787 "(--param vect-max-version-for-alias-checks)\n",
1788 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1789 return false;
1792 /* This pass will decide on using loop versioning and/or loop peeling in
1793 order to enhance the alignment of data references in the loop. */
1795 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1796 if (!ok)
1798 if (dump_enabled_p ())
1799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1800 "bad data alignment.\n");
1801 return false;
1804 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1805 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1806 if (ok)
1808 /* If there are any SLP instances mark them as pure_slp. */
1809 if (vect_make_slp_decision (loop_vinfo))
1811 /* Find stmts that need to be both vectorized and SLPed. */
1812 vect_detect_hybrid_slp (loop_vinfo);
1814 /* Update the vectorization factor based on the SLP decision. */
1815 vect_update_vf_for_slp (loop_vinfo);
1817 /* Once VF is set, SLP costs should be updated since the number of
1818 created vector stmts depends on VF. */
1819 vect_update_slp_costs_according_to_vf (loop_vinfo);
1821 /* Analyze operations in the SLP instances. Note this may
1822 remove unsupported SLP instances which makes the above
1823 SLP kind detection invalid. */
1824 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1825 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1826 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1827 return false;
1830 else
1831 return false;
1833 /* Scan all the remaining operations in the loop that are not subject
1834 to SLP and make sure they are vectorizable. */
1835 ok = vect_analyze_loop_operations (loop_vinfo);
1836 if (!ok)
1838 if (dump_enabled_p ())
1839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840 "bad operation or unsupported loop bound.\n");
1841 return false;
1844 /* Decide whether we need to create an epilogue loop to handle
1845 remaining scalar iterations. */
1846 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1847 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1848 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1850 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1851 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1853 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1854 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1855 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1856 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1858 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1859 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1860 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1861 /* In case of versioning, check if the maximum number of
1862 iterations is greater than th. If they are identical,
1863 the epilogue is unnecessary. */
1864 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1865 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1866 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1867 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1868 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
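/* For example, with a known iteration count of 100, 3 iterations peeled
   for alignment and a vectorization factor of 8, 97 iterations remain;
   ctz (97) = 0 is smaller than log2 (8) = 3, so the main vector loop
   covers only 96 of them and an epilogue loop handles the last one.  */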
1870 /* If an epilogue loop is required make sure we can create one. */
1871 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1872 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1876 if (!vect_can_advance_ivs_p (loop_vinfo)
1877 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1878 single_exit (LOOP_VINFO_LOOP
1879 (loop_vinfo))))
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883 "not vectorized: can't create required "
1884 "epilog loop\n");
1885 return false;
1889 return true;
1892 /* Function vect_analyze_loop.
1894 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1895 for it. The different analyses will record information in the
1896 loop_vec_info struct. */
1897 loop_vec_info
1898 vect_analyze_loop (struct loop *loop)
1900 loop_vec_info loop_vinfo;
1901 unsigned int vector_sizes;
1903 /* Autodetect first vector size we try. */
1904 current_vector_size = 0;
1905 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_NOTE, vect_location,
1909 "===== analyze_loop_nest =====\n");
1911 if (loop_outer (loop)
1912 && loop_vec_info_for_loop (loop_outer (loop))
1913 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_NOTE, vect_location,
1917 "outer-loop already vectorized.\n");
1918 return NULL;
1921 while (1)
1923 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1924 loop_vinfo = vect_analyze_loop_form (loop);
1925 if (!loop_vinfo)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad loop form.\n");
1930 return NULL;
1933 if (vect_analyze_loop_2 (loop_vinfo))
1935 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1937 return loop_vinfo;
1940 destroy_loop_vec_info (loop_vinfo, true);
1942 vector_sizes &= ~current_vector_size;
1943 if (vector_sizes == 0
1944 || current_vector_size == 0)
1945 return NULL;
1947 /* Try the next biggest vector size. */
1948 current_vector_size = 1 << floor_log2 (vector_sizes);
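/* For example, on a hypothetical target that advertises 32-byte and
   16-byte vectors the mask starts as 0x30; after a failed analysis with
   the autodetected 32-byte size the mask becomes 0x10 and the analysis
   is re-run with current_vector_size set to 16.  */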
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "***** Re-trying analysis with "
1952 "vector size %d\n", current_vector_size);
1957 /* Function reduction_code_for_scalar_code
1959 Input:
1960 CODE - tree_code of a reduction operations.
1962 Output:
1963 REDUC_CODE - the corresponding tree-code to be used to reduce the
1964 vector of partial results into a single scalar result, or ERROR_MARK
1965 if the operation is a supported reduction operation, but does not have
1966 such a tree-code.
1968 Return FALSE if CODE currently cannot be vectorized as reduction. */
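/* For example, a MAX_EXPR reduction maps to REDUC_MAX_EXPR, which
   collapses a vector of partial maxima such as {1, 7, 3, 5} into the
   scalar 7.  MULT_EXPR, MINUS_EXPR and the bitwise operations are
   accepted as reductions but have no such tree-code, so ERROR_MARK is
   returned and the epilogue falls back to shifts or extracts.  */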
1970 static bool
1971 reduction_code_for_scalar_code (enum tree_code code,
1972 enum tree_code *reduc_code)
1974 switch (code)
1976 case MAX_EXPR:
1977 *reduc_code = REDUC_MAX_EXPR;
1978 return true;
1980 case MIN_EXPR:
1981 *reduc_code = REDUC_MIN_EXPR;
1982 return true;
1984 case PLUS_EXPR:
1985 *reduc_code = REDUC_PLUS_EXPR;
1986 return true;
1988 case MULT_EXPR:
1989 case MINUS_EXPR:
1990 case BIT_IOR_EXPR:
1991 case BIT_XOR_EXPR:
1992 case BIT_AND_EXPR:
1993 *reduc_code = ERROR_MARK;
1994 return true;
1996 default:
1997 return false;
2002 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2003 STMT is printed with a message MSG. */
2005 static void
2006 report_vect_op (int msg_type, gimple stmt, const char *msg)
2008 dump_printf_loc (msg_type, vect_location, "%s", msg);
2009 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2010 dump_printf (msg_type, "\n");
2014 /* Detect SLP reduction of the form:
2016 #a1 = phi <a5, a0>
2017 a2 = operation (a1)
2018 a3 = operation (a2)
2019 a4 = operation (a3)
2020 a5 = operation (a4)
2022 #a = phi <a5>
2024 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2025 FIRST_STMT is the first reduction stmt in the chain
2026 (a2 = operation (a1)).
2028 Return TRUE if a reduction chain was detected. */
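/* In source form such a chain typically comes from a loop that updates
   the same reduction variable several times per iteration, e.g.

   for (i = 0; i < n; i++)
     {
       sum += a[2*i];
       sum += a[2*i+1];
     }

   where the two additions correspond to a2 = operation (a1) and
   a3 = operation (a2) above.  */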
2030 static bool
2031 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2033 struct loop *loop = (gimple_bb (phi))->loop_father;
2034 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2035 enum tree_code code;
2036 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2037 stmt_vec_info use_stmt_info, current_stmt_info;
2038 tree lhs;
2039 imm_use_iterator imm_iter;
2040 use_operand_p use_p;
2041 int nloop_uses, size = 0, n_out_of_loop_uses;
2042 bool found = false;
2044 if (loop != vect_loop)
2045 return false;
2047 lhs = PHI_RESULT (phi);
2048 code = gimple_assign_rhs_code (first_stmt);
2049 while (1)
2051 nloop_uses = 0;
2052 n_out_of_loop_uses = 0;
2053 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2055 gimple use_stmt = USE_STMT (use_p);
2056 if (is_gimple_debug (use_stmt))
2057 continue;
2059 /* Check if we got back to the reduction phi. */
2060 if (use_stmt == phi)
2062 loop_use_stmt = use_stmt;
2063 found = true;
2064 break;
2067 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2069 loop_use_stmt = use_stmt;
2070 nloop_uses++;
2072 else
2073 n_out_of_loop_uses++;
2075 /* There can be either a single use in the loop or two uses in
2076 phi nodes. */
2077 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2078 return false;
2081 if (found)
2082 break;
2084 /* We reached a statement with no loop uses. */
2085 if (nloop_uses == 0)
2086 return false;
2088 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2089 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2090 return false;
2092 if (!is_gimple_assign (loop_use_stmt)
2093 || code != gimple_assign_rhs_code (loop_use_stmt)
2094 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2095 return false;
2097 /* Insert USE_STMT into reduction chain. */
2098 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2099 if (current_stmt)
2101 current_stmt_info = vinfo_for_stmt (current_stmt);
2102 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2103 GROUP_FIRST_ELEMENT (use_stmt_info)
2104 = GROUP_FIRST_ELEMENT (current_stmt_info);
2106 else
2107 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2109 lhs = gimple_assign_lhs (loop_use_stmt);
2110 current_stmt = loop_use_stmt;
2111 size++;
2114 if (!found || loop_use_stmt != phi || size < 2)
2115 return false;
2117 /* Swap the operands, if needed, to make the reduction operand be the second
2118 operand. */
2119 lhs = PHI_RESULT (phi);
2120 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2121 while (next_stmt)
2123 if (gimple_assign_rhs2 (next_stmt) == lhs)
2125 tree op = gimple_assign_rhs1 (next_stmt);
2126 gimple def_stmt = NULL;
2128 if (TREE_CODE (op) == SSA_NAME)
2129 def_stmt = SSA_NAME_DEF_STMT (op);
2131 /* Check that the other def is either defined in the loop
2132 ("vect_internal_def"), or it's an induction (defined by a
2133 loop-header phi-node). */
2134 if (def_stmt
2135 && gimple_bb (def_stmt)
2136 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2137 && (is_gimple_assign (def_stmt)
2138 || is_gimple_call (def_stmt)
2139 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2140 == vect_induction_def
2141 || (gimple_code (def_stmt) == GIMPLE_PHI
2142 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2143 == vect_internal_def
2144 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2146 lhs = gimple_assign_lhs (next_stmt);
2147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2148 continue;
2151 return false;
2153 else
2155 tree op = gimple_assign_rhs2 (next_stmt);
2156 gimple def_stmt = NULL;
2158 if (TREE_CODE (op) == SSA_NAME)
2159 def_stmt = SSA_NAME_DEF_STMT (op);
2161 /* Check that the other def is either defined in the loop
2162 ("vect_internal_def"), or it's an induction (defined by a
2163 loop-header phi-node). */
2164 if (def_stmt
2165 && gimple_bb (def_stmt)
2166 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2167 && (is_gimple_assign (def_stmt)
2168 || is_gimple_call (def_stmt)
2169 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2170 == vect_induction_def
2171 || (gimple_code (def_stmt) == GIMPLE_PHI
2172 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2173 == vect_internal_def
2174 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2176 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2179 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2180 dump_printf (MSG_NOTE, "\n");
2183 swap_ssa_operands (next_stmt,
2184 gimple_assign_rhs1_ptr (next_stmt),
2185 gimple_assign_rhs2_ptr (next_stmt));
2186 update_stmt (next_stmt);
2188 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2189 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2191 else
2192 return false;
2195 lhs = gimple_assign_lhs (next_stmt);
2196 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2199 /* Save the chain for further analysis in SLP detection. */
2200 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2201 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2202 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2204 return true;
2208 /* Function vect_is_simple_reduction_1
2210 (1) Detect a cross-iteration def-use cycle that represents a simple
2211 reduction computation. We look for the following pattern:
2213 loop_header:
2214 a1 = phi < a0, a2 >
2215 a3 = ...
2216 a2 = operation (a3, a1)
2220 a3 = ...
2221 loop_header:
2222 a1 = phi < a0, a2 >
2223 a2 = operation (a3, a1)
2225 such that:
2226 1. operation is commutative and associative and it is safe to
2227 change the order of the computation (if CHECK_REDUCTION is true)
2228 2. no uses for a2 in the loop (a2 is used out of the loop)
2229 3. no uses of a1 in the loop besides the reduction operation
2230 4. no uses of a1 outside the loop.
2232 Conditions 1,4 are tested here.
2233 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2235 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2236 nested cycles, if CHECK_REDUCTION is false.
2238 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2239 reductions:
2241 a1 = phi < a0, a2 >
2242 inner loop (def of a3)
2243 a2 = phi < a3 >
2245 If MODIFY is true it tries also to rework the code in-place to enable
2246 detection of more reduction patterns. For the time being we rewrite
2247 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2250 static gimple
2251 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2252 bool check_reduction, bool *double_reduc,
2253 bool modify)
2255 struct loop *loop = (gimple_bb (phi))->loop_father;
2256 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2257 edge latch_e = loop_latch_edge (loop);
2258 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2259 gimple def_stmt, def1 = NULL, def2 = NULL;
2260 enum tree_code orig_code, code;
2261 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2262 tree type;
2263 int nloop_uses;
2264 tree name;
2265 imm_use_iterator imm_iter;
2266 use_operand_p use_p;
2267 bool phi_def;
2269 *double_reduc = false;
2271 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2272 otherwise, we assume outer loop vectorization. */
2273 gcc_assert ((check_reduction && loop == vect_loop)
2274 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2276 name = PHI_RESULT (phi);
2277 /* ??? If there are no uses of the PHI result the inner loop reduction
2278 won't be detected as possibly double-reduction by vectorizable_reduction
2279 because that tries to walk the PHI arg from the preheader edge which
2280 can be constant. See PR60382. */
2281 if (has_zero_uses (name))
2282 return NULL;
2283 nloop_uses = 0;
2284 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2286 gimple use_stmt = USE_STMT (use_p);
2287 if (is_gimple_debug (use_stmt))
2288 continue;
2290 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "intermediate value used outside loop.\n");
2296 return NULL;
2299 nloop_uses++;
2300 if (nloop_uses > 1)
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 "reduction used in loop.\n");
2305 return NULL;
2309 if (TREE_CODE (loop_arg) != SSA_NAME)
2311 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "reduction: not ssa_name: ");
2315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2318 return NULL;
2321 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2322 if (!def_stmt)
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "reduction: no def_stmt.\n");
2327 return NULL;
2330 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2332 if (dump_enabled_p ())
2334 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2335 dump_printf (MSG_NOTE, "\n");
2337 return NULL;
2340 if (is_gimple_assign (def_stmt))
2342 name = gimple_assign_lhs (def_stmt);
2343 phi_def = false;
2345 else
2347 name = PHI_RESULT (def_stmt);
2348 phi_def = true;
2351 nloop_uses = 0;
2352 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2354 gimple use_stmt = USE_STMT (use_p);
2355 if (is_gimple_debug (use_stmt))
2356 continue;
2357 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2358 nloop_uses++;
2359 if (nloop_uses > 1)
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363 "reduction used in loop.\n");
2364 return NULL;
2368 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2369 defined in the inner loop. */
2370 if (phi_def)
2372 op1 = PHI_ARG_DEF (def_stmt, 0);
2374 if (gimple_phi_num_args (def_stmt) != 1
2375 || TREE_CODE (op1) != SSA_NAME)
2377 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379 "unsupported phi node definition.\n");
2381 return NULL;
2384 def1 = SSA_NAME_DEF_STMT (op1);
2385 if (gimple_bb (def1)
2386 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2387 && loop->inner
2388 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2389 && is_gimple_assign (def1))
2391 if (dump_enabled_p ())
2392 report_vect_op (MSG_NOTE, def_stmt,
2393 "detected double reduction: ");
2395 *double_reduc = true;
2396 return def_stmt;
2399 return NULL;
2402 code = orig_code = gimple_assign_rhs_code (def_stmt);
2404 /* We can handle "res -= x[i]", which is non-associative by
2405 simply rewriting this into "res += -x[i]". Avoid changing
2406 gimple instruction for the first simple tests and only do this
2407 if we're allowed to change code at all. */
2408 if (code == MINUS_EXPR
2409 && modify
2410 && (op1 = gimple_assign_rhs1 (def_stmt))
2411 && TREE_CODE (op1) == SSA_NAME
2412 && SSA_NAME_DEF_STMT (op1) == phi)
2413 code = PLUS_EXPR;
2415 if (check_reduction
2416 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2418 if (dump_enabled_p ())
2419 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2420 "reduction: not commutative/associative: ");
2421 return NULL;
2424 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2426 if (code != COND_EXPR)
2428 if (dump_enabled_p ())
2429 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2430 "reduction: not binary operation: ");
2432 return NULL;
2435 op3 = gimple_assign_rhs1 (def_stmt);
2436 if (COMPARISON_CLASS_P (op3))
2438 op4 = TREE_OPERAND (op3, 1);
2439 op3 = TREE_OPERAND (op3, 0);
2442 op1 = gimple_assign_rhs2 (def_stmt);
2443 op2 = gimple_assign_rhs3 (def_stmt);
2445 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2447 if (dump_enabled_p ())
2448 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2449 "reduction: uses not ssa_names: ");
2451 return NULL;
2454 else
2456 op1 = gimple_assign_rhs1 (def_stmt);
2457 op2 = gimple_assign_rhs2 (def_stmt);
2459 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2461 if (dump_enabled_p ())
2462 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2463 "reduction: uses not ssa_names: ");
2465 return NULL;
2469 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2470 if ((TREE_CODE (op1) == SSA_NAME
2471 && !types_compatible_p (type,TREE_TYPE (op1)))
2472 || (TREE_CODE (op2) == SSA_NAME
2473 && !types_compatible_p (type, TREE_TYPE (op2)))
2474 || (op3 && TREE_CODE (op3) == SSA_NAME
2475 && !types_compatible_p (type, TREE_TYPE (op3)))
2476 || (op4 && TREE_CODE (op4) == SSA_NAME
2477 && !types_compatible_p (type, TREE_TYPE (op4))))
2479 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_NOTE, vect_location,
2482 "reduction: multiple types: operation type: ");
2483 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2484 dump_printf (MSG_NOTE, ", operands types: ");
2485 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2486 TREE_TYPE (op1));
2487 dump_printf (MSG_NOTE, ",");
2488 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2489 TREE_TYPE (op2));
2490 if (op3)
2492 dump_printf (MSG_NOTE, ",");
2493 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2494 TREE_TYPE (op3));
2497 if (op4)
2499 dump_printf (MSG_NOTE, ",");
2500 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2501 TREE_TYPE (op4));
2503 dump_printf (MSG_NOTE, "\n");
2506 return NULL;
2509 /* Check that it's ok to change the order of the computation.
2510 Generally, when vectorizing a reduction we change the order of the
2511 computation. This may change the behavior of the program in some
2512 cases, so we need to check that this is ok. One exception is when
2513 vectorizing an outer-loop: the inner-loop is executed sequentially,
2514 and therefore vectorizing reductions in the inner-loop during
2515 outer-loop vectorization is safe. */
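/* For example, a float summation vectorized with VF = 4 effectively
   computes four partial sums (a[0] + a[4] + ..., a[1] + a[5] + ..., etc.)
   that are combined only at the end, which can round differently from
   the sequential sum; hence the -fassociative-math requirement for
   floating-point reductions below.  */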
2517 /* CHECKME: check for !flag_finite_math_only too? */
2518 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2519 && check_reduction)
2521 /* Changing the order of operations changes the semantics. */
2522 if (dump_enabled_p ())
2523 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2524 "reduction: unsafe fp math optimization: ");
2525 return NULL;
2527 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2528 && check_reduction)
2530 /* Changing the order of operations changes the semantics. */
2531 if (dump_enabled_p ())
2532 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2533 "reduction: unsafe int math optimization: ");
2534 return NULL;
2536 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2538 /* Changing the order of operations changes the semantics. */
2539 if (dump_enabled_p ())
2540 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2541 "reduction: unsafe fixed-point math optimization: ");
2542 return NULL;
2545 /* If we detected "res -= x[i]" earlier, rewrite it into
2546 "res += -x[i]" now. If this turns out to be useless reassoc
2547 will clean it up again. */
2548 if (orig_code == MINUS_EXPR)
2550 tree rhs = gimple_assign_rhs2 (def_stmt);
2551 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2552 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2553 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2554 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2555 loop_info, NULL));
2556 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2557 gimple_assign_set_rhs2 (def_stmt, negrhs);
2558 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2559 update_stmt (def_stmt);
2562 /* Reduction is safe. We're dealing with one of the following:
2563 1) integer arithmetic and no trapv
2564 2) floating point arithmetic, and special flags permit this optimization
2565 3) nested cycle (i.e., outer loop vectorization). */
2566 if (TREE_CODE (op1) == SSA_NAME)
2567 def1 = SSA_NAME_DEF_STMT (op1);
2569 if (TREE_CODE (op2) == SSA_NAME)
2570 def2 = SSA_NAME_DEF_STMT (op2);
2572 if (code != COND_EXPR
2573 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2575 if (dump_enabled_p ())
2576 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2577 return NULL;
2580 /* Check that one def is the reduction def, defined by PHI,
2581 the other def is either defined in the loop ("vect_internal_def"),
2582 or it's an induction (defined by a loop-header phi-node). */
2584 if (def2 && def2 == phi
2585 && (code == COND_EXPR
2586 || !def1 || gimple_nop_p (def1)
2587 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2588 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2589 && (is_gimple_assign (def1)
2590 || is_gimple_call (def1)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2592 == vect_induction_def
2593 || (gimple_code (def1) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2598 if (dump_enabled_p ())
2599 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2600 return def_stmt;
2603 if (def1 && def1 == phi
2604 && (code == COND_EXPR
2605 || !def2 || gimple_nop_p (def2)
2606 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2607 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2608 && (is_gimple_assign (def2)
2609 || is_gimple_call (def2)
2610 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2611 == vect_induction_def
2612 || (gimple_code (def2) == GIMPLE_PHI
2613 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2614 == vect_internal_def
2615 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2617 if (check_reduction)
2619 /* Swap operands (just for simplicity - so that the rest of the code
2620 can assume that the reduction variable is always the last (second)
2621 argument). */
2622 if (dump_enabled_p ())
2623 report_vect_op (MSG_NOTE, def_stmt,
2624 "detected reduction: need to swap operands: ");
2626 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2627 gimple_assign_rhs2_ptr (def_stmt));
2629 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2630 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2632 else
2634 if (dump_enabled_p ())
2635 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2638 return def_stmt;
2641 /* Try to find SLP reduction chain. */
2642 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2644 if (dump_enabled_p ())
2645 report_vect_op (MSG_NOTE, def_stmt,
2646 "reduction: detected reduction chain: ");
2648 return def_stmt;
2651 if (dump_enabled_p ())
2652 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2653 "reduction: unknown pattern: ");
2655 return NULL;
2658 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2659 in-place. Arguments as there. */
2661 static gimple
2662 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2663 bool check_reduction, bool *double_reduc)
2665 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2666 double_reduc, false);
2669 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2670 in-place if it enables detection of more reductions. Arguments
2671 as there. */
2673 gimple
2674 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2675 bool check_reduction, bool *double_reduc)
2677 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2678 double_reduc, true);
2681 /* Calculate the cost of one scalar iteration of the loop. */
2683 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2684 stmt_vector_for_cost *scalar_cost_vec)
2686 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2687 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2688 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2689 int innerloop_iters, i;
2691 /* Count statements in scalar loop. Using this as scalar cost for a single
2692 iteration for now.
2694 TODO: Add outer loop support.
2696 TODO: Consider assigning different costs to different scalar
2697 statements. */
2699 /* FORNOW. */
2700 innerloop_iters = 1;
2701 if (loop->inner)
2702 innerloop_iters = 50; /* FIXME */
2704 for (i = 0; i < nbbs; i++)
2706 gimple_stmt_iterator si;
2707 basic_block bb = bbs[i];
2709 if (bb->loop_father == loop->inner)
2710 factor = innerloop_iters;
2711 else
2712 factor = 1;
2714 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2716 gimple stmt = gsi_stmt (si);
2717 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2719 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2720 continue;
2722 /* Skip stmts that are not vectorized inside the loop. */
2723 if (stmt_info
2724 && !STMT_VINFO_RELEVANT_P (stmt_info)
2725 && (!STMT_VINFO_LIVE_P (stmt_info)
2726 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2727 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2728 continue;
2730 vect_cost_for_stmt kind;
2731 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2733 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2734 kind = scalar_load;
2735 else
2736 kind = scalar_store;
2738 else
2739 kind = scalar_stmt;
2741 scalar_single_iter_cost
2742 += record_stmt_cost (scalar_cost_vec, factor, kind,
2743 NULL, 0, vect_prologue);
2746 return scalar_single_iter_cost;
2749 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2751 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2752 int *peel_iters_epilogue,
2753 stmt_vector_for_cost *scalar_cost_vec,
2754 stmt_vector_for_cost *prologue_cost_vec,
2755 stmt_vector_for_cost *epilogue_cost_vec)
2757 int retval = 0;
2758 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2760 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2762 *peel_iters_epilogue = vf/2;
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location,
2765 "cost model: epilogue peel iters set to vf/2 "
2766 "because loop iterations are unknown .\n");
2768 /* If peeled iterations are known but the number of scalar loop
2769 iterations is unknown, count a taken branch per peeled loop. */
2770 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2771 NULL, 0, vect_prologue);
2772 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2773 NULL, 0, vect_epilogue);
2775 else
2777 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2778 peel_iters_prologue = niters < peel_iters_prologue ?
2779 niters : peel_iters_prologue;
2780 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2781 /* If we need to peel for gaps, but no peeling is required, we have to
2782 peel VF iterations. */
2783 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2784 *peel_iters_epilogue = vf;
2787 stmt_info_for_cost *si;
2788 int j;
2789 if (peel_iters_prologue)
2790 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2791 retval += record_stmt_cost (prologue_cost_vec,
2792 si->count * peel_iters_prologue,
2793 si->kind, NULL, si->misalign,
2794 vect_prologue);
2795 if (*peel_iters_epilogue)
2796 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2797 retval += record_stmt_cost (epilogue_cost_vec,
2798 si->count * *peel_iters_epilogue,
2799 si->kind, NULL, si->misalign,
2800 vect_epilogue);
2802 return retval;
2805 /* Function vect_estimate_min_profitable_iters
2807 Return the number of iterations required for the vector version of the
2808 loop to be profitable relative to the cost of the scalar version of the
2809 loop. */
2811 static void
2812 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2813 int *ret_min_profitable_niters,
2814 int *ret_min_profitable_estimate)
2816 int min_profitable_iters;
2817 int min_profitable_estimate;
2818 int peel_iters_prologue;
2819 int peel_iters_epilogue;
2820 unsigned vec_inside_cost = 0;
2821 int vec_outside_cost = 0;
2822 unsigned vec_prologue_cost = 0;
2823 unsigned vec_epilogue_cost = 0;
2824 int scalar_single_iter_cost = 0;
2825 int scalar_outside_cost = 0;
2826 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2827 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2828 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2830 /* Cost model disabled. */
2831 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2833 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2834 *ret_min_profitable_niters = 0;
2835 *ret_min_profitable_estimate = 0;
2836 return;
2839 /* Requires loop versioning tests to handle misalignment. */
2840 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2842 /* FIXME: Make cost depend on complexity of individual check. */
2843 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2844 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2845 vect_prologue);
2846 dump_printf (MSG_NOTE,
2847 "cost model: Adding cost of checks for loop "
2848 "versioning to treat misalignment.\n");
2851 /* Requires loop versioning with alias checks. */
2852 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2854 /* FIXME: Make cost depend on complexity of individual check. */
2855 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2856 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2857 vect_prologue);
2858 dump_printf (MSG_NOTE,
2859 "cost model: Adding cost of checks for loop "
2860 "versioning aliasing.\n");
2863 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2864 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2865 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2866 vect_prologue);
2868 /* Count statements in scalar loop. Using this as scalar cost for a single
2869 iteration for now.
2871 TODO: Add outer loop support.
2873 TODO: Consider assigning different costs to different scalar
2874 statements. */
2876 auto_vec<stmt_info_for_cost> scalar_cost_vec;
2877 scalar_single_iter_cost
2878 = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2880 /* Add additional cost for the peeled instructions in prologue and epilogue
2881 loop.
2883 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2884 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2886 TODO: Build an expression that represents peel_iters for prologue and
2887 epilogue to be used in a run-time test. */
2889 if (npeel < 0)
2891 peel_iters_prologue = vf/2;
2892 dump_printf (MSG_NOTE, "cost model: "
2893 "prologue peel iters set to vf/2.\n");
2895 /* If peeling for alignment is unknown, the loop bound of the main loop
2896 becomes unknown. */
2897 peel_iters_epilogue = vf/2;
2898 dump_printf (MSG_NOTE, "cost model: "
2899 "epilogue peel iters set to vf/2 because "
2900 "peeling for alignment is unknown.\n");
2902 /* If peeled iterations are unknown, count a taken branch and a not taken
2903 branch per peeled loop. Even if scalar loop iterations are known,
2904 vector iterations are not known since peeled prologue iterations are
2905 not known. Hence guards remain the same. */
2906 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2907 NULL, 0, vect_prologue);
2908 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2909 NULL, 0, vect_prologue);
2910 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2911 NULL, 0, vect_epilogue);
2912 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2913 NULL, 0, vect_epilogue);
2914 stmt_info_for_cost *si;
2915 int j;
2916 FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2918 struct _stmt_vec_info *stmt_info
2919 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2920 (void) add_stmt_cost (target_cost_data,
2921 si->count * peel_iters_prologue,
2922 si->kind, stmt_info, si->misalign,
2923 vect_prologue);
2924 (void) add_stmt_cost (target_cost_data,
2925 si->count * peel_iters_epilogue,
2926 si->kind, stmt_info, si->misalign,
2927 vect_epilogue);
2930 else
2932 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2933 stmt_info_for_cost *si;
2934 int j;
2935 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2937 prologue_cost_vec.create (2);
2938 epilogue_cost_vec.create (2);
2939 peel_iters_prologue = npeel;
2941 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2942 &peel_iters_epilogue,
2943 &scalar_cost_vec,
2944 &prologue_cost_vec,
2945 &epilogue_cost_vec);
2947 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2949 struct _stmt_vec_info *stmt_info
2950 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2951 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2952 si->misalign, vect_prologue);
2955 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2957 struct _stmt_vec_info *stmt_info
2958 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2959 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2960 si->misalign, vect_epilogue);
2963 prologue_cost_vec.release ();
2964 epilogue_cost_vec.release ();
2967 /* FORNOW: The scalar outside cost is incremented in one of the
2968 following ways:
2970 1. The vectorizer checks for alignment and aliasing and generates
2971 a condition that allows dynamic vectorization. A cost model
2972 check is ANDED with the versioning condition. Hence scalar code
2973 path now has the added cost of the versioning check.
2975 if (cost > th & versioning_check)
2976 jmp to vector code
2978 Hence run-time scalar is incremented by not-taken branch cost.
2980 2. The vectorizer then checks if a prologue is required. If the
2981 cost model check was not done before during versioning, it has to
2982 be done before the prologue check.
2984 if (cost <= th)
2985 prologue = scalar_iters
2986 if (prologue == 0)
2987 jmp to vector code
2988 else
2989 execute prologue
2990 if (prologue == num_iters)
2991 go to exit
2993 Hence the run-time scalar cost is incremented by a taken branch,
2994 plus a not-taken branch, plus a taken branch cost.
2996 3. The vectorizer then checks if an epilogue is required. If the
2997 cost model check was not done before during prologue check, it
2998 has to be done with the epilogue check.
3000 if (prologue == 0)
3001 jmp to vector code
3002 else
3003 execute prologue
3004 if (prologue == num_iters)
3005 go to exit
3006 vector code:
3007 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3008 jmp to epilogue
3010 Hence the run-time scalar cost should be incremented by 2 taken
3011 branches.
3013 TODO: The back end may reorder the BBS's differently and reverse
3014 conditions/branch directions. Change the estimates below to
3015 something more reasonable. */
3017 /* If the number of iterations is known and we do not do versioning, we can
3018 decide whether to vectorize at compile time. Hence the scalar version
3019 does not carry cost model guard costs.
3020 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3021 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3022 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3024 /* Cost model check occurs at versioning. */
3025 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3026 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3027 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3028 else
3030 /* Cost model check occurs at prologue generation. */
3031 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3032 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3033 + vect_get_stmt_cost (cond_branch_not_taken);
3034 /* Cost model check occurs at epilogue generation. */
3035 else
3036 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3040 /* Complete the target-specific cost calculations. */
3041 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3042 &vec_inside_cost, &vec_epilogue_cost);
3044 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3046 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3049 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3050 vec_inside_cost);
3051 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3052 vec_prologue_cost);
3053 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3054 vec_epilogue_cost);
3055 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3056 scalar_single_iter_cost);
3057 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3058 scalar_outside_cost);
3059 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3060 vec_outside_cost);
3061 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3062 peel_iters_prologue);
3063 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3064 peel_iters_epilogue);
3067 /* Calculate number of iterations required to make the vector version
3068 profitable, relative to the loop bodies only. The following condition
3069 must hold true:
3070 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3071 where
3072 SIC = scalar iteration cost, VIC = vector iteration cost,
3073 VOC = vector outside cost, VF = vectorization factor,
3074 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3075 SOC = scalar outside cost for run time cost model check. */
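/* Worked example: with SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 0 and
   no peeling, the condition reduces to 4 * niters > 6 * niters / 4 + 20,
   which first holds for niters > 8; the code below computes that
   break-even point, clamps it to at least VF and finally decrements it
   because the runtime guard skips the vector loop when
   niters <= threshold.  */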
3077 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3079 if (vec_outside_cost <= 0)
3080 min_profitable_iters = 1;
3081 else
3083 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3084 - vec_inside_cost * peel_iters_prologue
3085 - vec_inside_cost * peel_iters_epilogue)
3086 / ((scalar_single_iter_cost * vf)
3087 - vec_inside_cost);
3089 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3090 <= (((int) vec_inside_cost * min_profitable_iters)
3091 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3092 min_profitable_iters++;
3095 /* vector version will never be profitable. */
3096 else
3098 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3099 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3100 "did not happen for a simd loop");
3102 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3104 "cost model: the vector iteration cost = %d "
3105 "divided by the scalar iteration cost = %d "
3106 "is greater or equal to the vectorization factor = %d"
3107 ".\n",
3108 vec_inside_cost, scalar_single_iter_cost, vf);
3109 *ret_min_profitable_niters = -1;
3110 *ret_min_profitable_estimate = -1;
3111 return;
3114 dump_printf (MSG_NOTE,
3115 " Calculated minimum iters for profitability: %d\n",
3116 min_profitable_iters);
3118 min_profitable_iters =
3119 min_profitable_iters < vf ? vf : min_profitable_iters;
3121 /* Because the condition we create is:
3122 if (niters <= min_profitable_iters)
3123 then skip the vectorized loop. */
3124 min_profitable_iters--;
3126 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 " Runtime profitability threshold = %d\n",
3129 min_profitable_iters);
3131 *ret_min_profitable_niters = min_profitable_iters;
3133 /* Calculate number of iterations required to make the vector version
3134 profitable, relative to the loop bodies only.
3136 Non-vectorized variant is SIC * niters and it must win over vector
3137 variant on the expected loop trip count. The following condition must hold true:
3138 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3140 if (vec_outside_cost <= 0)
3141 min_profitable_estimate = 1;
3142 else
3144 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3145 - vec_inside_cost * peel_iters_prologue
3146 - vec_inside_cost * peel_iters_epilogue)
3147 / ((scalar_single_iter_cost * vf)
3148 - vec_inside_cost);
3150 min_profitable_estimate --;
3151 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3152 if (dump_enabled_p ())
3153 dump_printf_loc (MSG_NOTE, vect_location,
3154 " Static estimate profitability threshold = %d\n",
3155 min_profitable_estimate);
3157 *ret_min_profitable_estimate = min_profitable_estimate;
3160 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3161 vector elements (not bits) for a vector of mode MODE. */
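/* For example, for a 4-element vector and OFFSET = 2 the selector is
   {2, 3, 4, 5}: the two high elements of the first operand followed by
   the two low elements of the second operand, i.e. a shift by two
   elements.  */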
3162 static void
3163 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3164 unsigned char *sel)
3166 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3168 for (i = 0; i < nelt; i++)
3169 sel[i] = (i + offset) & (2*nelt - 1);
3172 /* Checks whether the target supports whole-vector shifts for vectors of mode
3173 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3174 it supports vec_perm_const with masks for all necessary shift amounts. */
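/* For example, for an 8-element mode the loop below verifies that shifts
   by 4, 2 and 1 elements can all be emulated with vec_perm_const, since
   those are the amounts the log2-style reduction epilogue uses.  */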
3175 static bool
3176 have_whole_vector_shift (enum machine_mode mode)
3178 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3179 return true;
3181 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3182 return false;
3184 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3185 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3187 for (i = nelt/2; i >= 1; i/=2)
3189 calc_vec_perm_mask_for_shift (mode, i, sel);
3190 if (!can_vec_perm_p (mode, false, sel))
3191 return false;
3193 return true;
3196 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3198 static tree
3199 get_reduction_op (gimple stmt, int reduc_index)
3201 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3203 case GIMPLE_SINGLE_RHS:
3204 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3205 == ternary_op);
3206 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3207 case GIMPLE_UNARY_RHS:
3208 return gimple_assign_rhs1 (stmt);
3209 case GIMPLE_BINARY_RHS:
3210 return (reduc_index
3211 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3212 case GIMPLE_TERNARY_RHS:
3213 return gimple_op (stmt, reduc_index + 1);
3214 default:
3215 gcc_unreachable ();
3219 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3220 functions. Design better to avoid maintenance issues. */
3222 /* Function vect_model_reduction_cost.
3224 Models cost for a reduction operation, including the vector ops
3225 generated within the strip-mine loop, the initial definition before
3226 the loop, and the epilogue code that must be generated. */
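/* For example, for a 4-element vector type whose reduction has no
   single-statement REDUC_* code, the shift-based epilogue below is
   costed as exact_log2 (4) * 2 = 4 vector statements plus one
   vec_to_scalar extract; without whole-vector shifts it is costed as
   4 + 3 = 7 statements (N extracts and N-1 reduction ops).  */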
3228 static bool
3229 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3230 int ncopies, int reduc_index)
3232 int prologue_cost = 0, epilogue_cost = 0;
3233 enum tree_code code;
3234 optab optab;
3235 tree vectype;
3236 gimple stmt, orig_stmt;
3237 tree reduction_op;
3238 machine_mode mode;
3239 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3240 struct loop *loop = NULL;
3241 void *target_cost_data;
3243 if (loop_vinfo)
3245 loop = LOOP_VINFO_LOOP (loop_vinfo);
3246 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3248 else
3249 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3251 /* Cost of reduction op inside loop. */
3252 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3253 stmt_info, 0, vect_body);
3254 stmt = STMT_VINFO_STMT (stmt_info);
3256 reduction_op = get_reduction_op (stmt, reduc_index);
3258 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3259 if (!vectype)
3261 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3264 "unsupported data-type ");
3265 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3266 TREE_TYPE (reduction_op));
3267 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3269 return false;
3272 mode = TYPE_MODE (vectype);
3273 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3275 if (!orig_stmt)
3276 orig_stmt = STMT_VINFO_STMT (stmt_info);
3278 code = gimple_assign_rhs_code (orig_stmt);
3280 /* Add in cost for initial definition. */
3281 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3282 stmt_info, 0, vect_prologue);
3284 /* Determine cost of epilogue code.
3286 We have a reduction operator that will reduce the vector in one statement.
3287 Also requires scalar extract. */
3289 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3291 if (reduc_code != ERROR_MARK)
3293 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3294 stmt_info, 0, vect_epilogue);
3295 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3296 stmt_info, 0, vect_epilogue);
3298 else
3300 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3301 tree bitsize =
3302 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3303 int element_bitsize = tree_to_uhwi (bitsize);
3304 int nelements = vec_size_in_bits / element_bitsize;
3306 optab = optab_for_tree_code (code, vectype, optab_default);
3308 /* We have a whole vector shift available. */
3309 if (VECTOR_MODE_P (mode)
3310 && optab_handler (optab, mode) != CODE_FOR_nothing
3311 && have_whole_vector_shift (mode))
3313 /* Final reduction via vector shifts and the reduction operator.
3314 Also requires scalar extract. */
3315 epilogue_cost += add_stmt_cost (target_cost_data,
3316 exact_log2 (nelements) * 2,
3317 vector_stmt, stmt_info, 0,
3318 vect_epilogue);
3319 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3320 vec_to_scalar, stmt_info, 0,
3321 vect_epilogue);
3323 else
3324 /* Use extracts and reduction op for final reduction. For N
3325 elements, we have N extracts and N-1 reduction ops. */
3326 epilogue_cost += add_stmt_cost (target_cost_data,
3327 nelements + nelements - 1,
3328 vector_stmt, stmt_info, 0,
3329 vect_epilogue);
3333 if (dump_enabled_p ())
3334 dump_printf (MSG_NOTE,
3335 "vect_model_reduction_cost: inside_cost = %d, "
3336 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3337 prologue_cost, epilogue_cost);
3339 return true;
3343 /* Function vect_model_induction_cost.
3345 Models cost for induction operations. */
3347 static void
3348 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3350 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3351 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3352 unsigned inside_cost, prologue_cost;
3354 /* loop cost for vec_loop. */
3355 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3356 stmt_info, 0, vect_body);
3358 /* prologue cost for vec_init and vec_step. */
3359 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3360 stmt_info, 0, vect_prologue);
3362 if (dump_enabled_p ())
3363 dump_printf_loc (MSG_NOTE, vect_location,
3364 "vect_model_induction_cost: inside_cost = %d, "
3365 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3369 /* Function get_initial_def_for_induction
3371 Input:
3372 STMT - a stmt that performs an induction operation in the loop.
3373 IV_PHI - the initial value of the induction variable
3375 Output:
3376 Return a vector variable, initialized with the first VF values of
3377 the induction variable. E.g., for an iv with IV_PHI='X' and
3378 evolution S, for a vector of 4 units, we want to return:
3379 [X, X + S, X + 2*S, X + 3*S]. */
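/* For example, for an induction with initial value 0, step 1 and a
   4-unit vector type this returns {0, 1, 2, 3}, and the step vector
   used inside the loop body is {4, 4, 4, 4} (VF * S per vector
   iteration).  */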
3381 static tree
3382 get_initial_def_for_induction (gimple iv_phi)
3384 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3385 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3386 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3387 tree vectype;
3388 int nunits;
3389 edge pe = loop_preheader_edge (loop);
3390 struct loop *iv_loop;
3391 basic_block new_bb;
3392 tree new_vec, vec_init, vec_step, t;
3393 tree new_var;
3394 tree new_name;
3395 gimple init_stmt, new_stmt;
3396 gphi *induction_phi;
3397 tree induc_def, vec_def, vec_dest;
3398 tree init_expr, step_expr;
3399 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3400 int i;
3401 int ncopies;
3402 tree expr;
3403 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3404 bool nested_in_vect_loop = false;
3405 gimple_seq stmts = NULL;
3406 imm_use_iterator imm_iter;
3407 use_operand_p use_p;
3408 gimple exit_phi;
3409 edge latch_e;
3410 tree loop_arg;
3411 gimple_stmt_iterator si;
3412 basic_block bb = gimple_bb (iv_phi);
3413 tree stepvectype;
3414 tree resvectype;
3416 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3417 if (nested_in_vect_loop_p (loop, iv_phi))
3419 nested_in_vect_loop = true;
3420 iv_loop = loop->inner;
3422 else
3423 iv_loop = loop;
3424 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3426 latch_e = loop_latch_edge (iv_loop);
3427 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3429 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3430 gcc_assert (step_expr != NULL_TREE);
3432 pe = loop_preheader_edge (iv_loop);
3433 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3434 loop_preheader_edge (iv_loop));
3436 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3437 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3438 gcc_assert (vectype);
3439 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3440 ncopies = vf / nunits;
3442 gcc_assert (phi_info);
3443 gcc_assert (ncopies >= 1);
3445 /* Convert the step to the desired type. */
3446 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3447 step_expr),
3448 &stmts, true, NULL_TREE);
3449 if (stmts)
3451 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3452 gcc_assert (!new_bb);
3455 /* Find the first insertion point in the BB. */
3456 si = gsi_after_labels (bb);
3458 /* Create the vector that holds the initial_value of the induction. */
3459 if (nested_in_vect_loop)
3461 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3462 been created during vectorization of previous stmts. We obtain it
3463 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3464 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3465 /* If the initial value is not of proper type, convert it. */
3466 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3468 new_stmt
3469 = gimple_build_assign (vect_get_new_vect_var (vectype,
3470 vect_simple_var,
3471 "vec_iv_"),
3472 VIEW_CONVERT_EXPR,
3473 build1 (VIEW_CONVERT_EXPR, vectype,
3474 vec_init));
3475 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3476 gimple_assign_set_lhs (new_stmt, vec_init);
3477 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3478 new_stmt);
3479 gcc_assert (!new_bb);
3480 set_vinfo_for_stmt (new_stmt,
3481 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3484 else
3486 vec<constructor_elt, va_gc> *v;
3488 /* iv_loop is the loop to be vectorized. Create:
3489 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3490 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3491 vect_scalar_var, "var_");
3492 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3493 init_expr),
3494 &stmts, false, new_var);
3495 if (stmts)
3497 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3498 gcc_assert (!new_bb);
3501 vec_alloc (v, nunits);
3502 bool constant_p = is_gimple_min_invariant (new_name);
3503 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3504 for (i = 1; i < nunits; i++)
3506 /* Create: new_name_i = new_name + step_expr */
3507 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3508 new_name, step_expr);
3509 if (!is_gimple_min_invariant (new_name))
3511 init_stmt = gimple_build_assign (new_var, new_name);
3512 new_name = make_ssa_name (new_var, init_stmt);
3513 gimple_assign_set_lhs (init_stmt, new_name);
3514 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3515 gcc_assert (!new_bb);
3516 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "created new init_stmt: ");
3520 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3521 dump_printf (MSG_NOTE, "\n");
3523 constant_p = false;
3525 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3527 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3528 if (constant_p)
3529 new_vec = build_vector_from_ctor (vectype, v);
3530 else
3531 new_vec = build_constructor (vectype, v);
3532 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3536 /* Create the vector that holds the step of the induction. */
3537 if (nested_in_vect_loop)
3538 /* iv_loop is nested in the loop to be vectorized. Generate:
3539 vec_step = [S, S, S, S] */
3540 new_name = step_expr;
3541 else
3543 /* iv_loop is the loop to be vectorized. Generate:
3544 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3545 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3547 expr = build_int_cst (integer_type_node, vf);
3548 expr = fold_convert (TREE_TYPE (step_expr), expr);
3550 else
3551 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3552 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3553 expr, step_expr);
3554 if (TREE_CODE (step_expr) == SSA_NAME)
3555 new_name = vect_init_vector (iv_phi, new_name,
3556 TREE_TYPE (step_expr), NULL);
3559 t = unshare_expr (new_name);
3560 gcc_assert (CONSTANT_CLASS_P (new_name)
3561 || TREE_CODE (new_name) == SSA_NAME);
3562 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3563 gcc_assert (stepvectype);
3564 new_vec = build_vector_from_val (stepvectype, t);
3565 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3568 /* Create the following def-use cycle:
3569 loop prolog:
3570 vec_init = ...
3571 vec_step = ...
3572 loop:
3573 vec_iv = PHI <vec_init, vec_loop>
3575 STMT
3577 vec_loop = vec_iv + vec_step; */
3579 /* Create the induction-phi that defines the induction-operand. */
3580 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3581 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3582 set_vinfo_for_stmt (induction_phi,
3583 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3584 induc_def = PHI_RESULT (induction_phi);
3586 /* Create the iv update inside the loop */
3587 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3588 vec_def = make_ssa_name (vec_dest, new_stmt);
3589 gimple_assign_set_lhs (new_stmt, vec_def);
3590 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3591 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3592 NULL));
3594 /* Set the arguments of the phi node: */
3595 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3596 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3597 UNKNOWN_LOCATION);
3600 /* In case that vectorization factor (VF) is bigger than the number
3601 of elements that we can fit in a vectype (nunits), we have to generate
3602 more than one vector stmt - i.e - we need to "unroll" the
3603 vector stmt by a factor VF/nunits. For more details see documentation
3604 in vectorizable_operation. */
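/* For example, VF = 8 with a 4-element vector type gives ncopies = 2:
   the second copy of the induction vector is the first one advanced by
   {4*S, 4*S, 4*S, 4*S}, i.e. nunits * S per copy.  */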
3606 if (ncopies > 1)
3608 stmt_vec_info prev_stmt_vinfo;
3609 /* FORNOW. This restriction should be relaxed. */
3610 gcc_assert (!nested_in_vect_loop);
3612 /* Create the vector that holds the step of the induction. */
3613 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3615 expr = build_int_cst (integer_type_node, nunits);
3616 expr = fold_convert (TREE_TYPE (step_expr), expr);
3618 else
3619 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3620 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3621 expr, step_expr);
3622 if (TREE_CODE (step_expr) == SSA_NAME)
3623 new_name = vect_init_vector (iv_phi, new_name,
3624 TREE_TYPE (step_expr), NULL);
3625 t = unshare_expr (new_name);
3626 gcc_assert (CONSTANT_CLASS_P (new_name)
3627 || TREE_CODE (new_name) == SSA_NAME);
3628 new_vec = build_vector_from_val (stepvectype, t);
3629 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3631 vec_def = induc_def;
3632 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3633 for (i = 1; i < ncopies; i++)
3635 /* vec_i = vec_prev + vec_step */
3636 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3637 vec_def, vec_step);
3638 vec_def = make_ssa_name (vec_dest, new_stmt);
3639 gimple_assign_set_lhs (new_stmt, vec_def);
3641 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3642 if (!useless_type_conversion_p (resvectype, vectype))
3644 new_stmt
3645 = gimple_build_assign
3646 (vect_get_new_vect_var (resvectype, vect_simple_var,
3647 "vec_iv_"),
3648 VIEW_CONVERT_EXPR,
3649 build1 (VIEW_CONVERT_EXPR, resvectype,
3650 gimple_assign_lhs (new_stmt)));
3651 gimple_assign_set_lhs (new_stmt,
3652 make_ssa_name
3653 (gimple_assign_lhs (new_stmt), new_stmt));
3654 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3656 set_vinfo_for_stmt (new_stmt,
3657 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3658 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3659 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3663 if (nested_in_vect_loop)
3665 /* Find the loop-closed exit-phi of the induction, and record
3666 the final vector of induction results: */
3667 exit_phi = NULL;
3668 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3670 gimple use_stmt = USE_STMT (use_p);
3671 if (is_gimple_debug (use_stmt))
3672 continue;
3674 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3676 exit_phi = use_stmt;
3677 break;
3680 if (exit_phi)
3682 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3683 /* FORNOW. Currently not supporting the case that an inner-loop induction
3684 is not used in the outer-loop (i.e. only outside the outer-loop). */
3685 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3686 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3688 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location,
3692 "vector of inductions after inner-loop:");
3693 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3694 dump_printf (MSG_NOTE, "\n");
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "transform induction: created def-use cycle: ");
3704 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3705 dump_printf (MSG_NOTE, "\n");
3706 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3707 SSA_NAME_DEF_STMT (vec_def), 0);
3708 dump_printf (MSG_NOTE, "\n");
3711 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3712 if (!useless_type_conversion_p (resvectype, vectype))
3714 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3715 vect_simple_var,
3716 "vec_iv_"),
3717 VIEW_CONVERT_EXPR,
3718 build1 (VIEW_CONVERT_EXPR, resvectype,
3719 induc_def));
3720 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3721 gimple_assign_set_lhs (new_stmt, induc_def);
3722 si = gsi_after_labels (bb);
3723 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3724 set_vinfo_for_stmt (new_stmt,
3725 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3726 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3727 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3730 return induc_def;
3734 /* Function get_initial_def_for_reduction
3736 Input:
3737 STMT - a stmt that performs a reduction operation in the loop.
3738 INIT_VAL - the initial value of the reduction variable
3740 Output:
3741 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3742 of the reduction (used for adjusting the epilog - see below).
3743 Return a vector variable, initialized according to the operation that STMT
3744 performs. This vector will be used as the initial value of the
3745 vector of partial results.
3747 Option1 (adjust in epilog): Initialize the vector as follows:
3748 add/bit or/xor: [0,0,...,0,0]
3749 mult/bit and: [1,1,...,1,1]
3750 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3751 and when necessary (e.g. add/mult case) let the caller know
3752 that it needs to adjust the result by init_val.
3754 Option2: Initialize the vector as follows:
3755 add/bit or/xor: [init_val,0,0,...,0]
3756 mult/bit and: [init_val,1,1,...,1]
3757 min/max/cond_expr: [init_val,init_val,...,init_val]
3758 and no adjustments are needed.
3760 For example, for the following code:
3762 s = init_val;
3763 for (i=0;i<n;i++)
3764 s = s + a[i];
3766 STMT is 's = s + a[i]', and the reduction variable is 's'.
3767 For a vector of 4 units, we want to return either [0,0,0,init_val],
3768 or [0,0,0,0] and let the caller know that it needs to adjust
3769 the result at the end by 'init_val'.
3771 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3772 is not NULL, because this way the initialization vector is simpler (the
3773 same element in all entries), and Option2 otherwise.
3775 A cost model should help decide between these two schemes. */
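/* Editor's example (hypothetical values): for a product reduction

     s = init_val;  for (i = 0; i < n; i++)  s *= a[i];

   with init_val == 5 and a 4-element vectype, Option1 returns
   { 1, 1, 1, 1 } and sets *ADJUSTMENT_DEF to 5, so the epilog multiplies
   the reduced result by 5; Option2 returns { 5, 1, 1, 1 } and needs no
   adjustment.  */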
3777 tree
3778 get_initial_def_for_reduction (gimple stmt, tree init_val,
3779 tree *adjustment_def)
3781 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3782 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3783 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3784 tree scalar_type = TREE_TYPE (init_val);
3785 tree vectype = get_vectype_for_scalar_type (scalar_type);
3786 int nunits;
3787 enum tree_code code = gimple_assign_rhs_code (stmt);
3788 tree def_for_init;
3789 tree init_def;
3790 tree *elts;
3791 int i;
3792 bool nested_in_vect_loop = false;
3793 tree init_value;
3794 REAL_VALUE_TYPE real_init_val = dconst0;
3795 int int_init_val = 0;
3796 gimple def_stmt = NULL;
3798 gcc_assert (vectype);
3799 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3801 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3802 || SCALAR_FLOAT_TYPE_P (scalar_type));
3804 if (nested_in_vect_loop_p (loop, stmt))
3805 nested_in_vect_loop = true;
3806 else
3807 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3809 /* In case of double reduction we only create a vector variable to be put
3810 in the reduction phi node. The actual statement creation is done in
3811 vect_create_epilog_for_reduction. */
3812 if (adjustment_def && nested_in_vect_loop
3813 && TREE_CODE (init_val) == SSA_NAME
3814 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3815 && gimple_code (def_stmt) == GIMPLE_PHI
3816 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3817 && vinfo_for_stmt (def_stmt)
3818 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3819 == vect_double_reduction_def)
3821 *adjustment_def = NULL;
3822 return vect_create_destination_var (init_val, vectype);
3825 if (TREE_CONSTANT (init_val))
3827 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3828 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3829 else
3830 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3832 else
3833 init_value = init_val;
3835 switch (code)
3837 case WIDEN_SUM_EXPR:
3838 case DOT_PROD_EXPR:
3839 case SAD_EXPR:
3840 case PLUS_EXPR:
3841 case MINUS_EXPR:
3842 case BIT_IOR_EXPR:
3843 case BIT_XOR_EXPR:
3844 case MULT_EXPR:
3845 case BIT_AND_EXPR:
3846 /* ADJUSTMENT_DEF is NULL when called from
3847 vect_create_epilog_for_reduction to vectorize double reduction. */
3848 if (adjustment_def)
3850 if (nested_in_vect_loop)
3851 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3852 NULL);
3853 else
3854 *adjustment_def = init_val;
3857 if (code == MULT_EXPR)
3859 real_init_val = dconst1;
3860 int_init_val = 1;
3863 if (code == BIT_AND_EXPR)
3864 int_init_val = -1;
3866 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3867 def_for_init = build_real (scalar_type, real_init_val);
3868 else
3869 def_for_init = build_int_cst (scalar_type, int_init_val);
3871 /* Create a vector of '0' or '1' in all elements except the first. */
3872 elts = XALLOCAVEC (tree, nunits);
3873 for (i = nunits - 2; i >= 0; --i)
3874 elts[i + 1] = def_for_init;
3876 /* Option1: the first element is '0' or '1' as well. */
3877 if (adjustment_def)
3879 elts[0] = def_for_init;
3880 init_def = build_vector (vectype, elts);
3881 break;
3884 /* Option2: the first element is INIT_VAL. */
3885 elts[0] = init_val;
3886 if (TREE_CONSTANT (init_val))
3887 init_def = build_vector (vectype, elts);
3888 else
3890 vec<constructor_elt, va_gc> *v;
3891 vec_alloc (v, nunits);
3892 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3893 for (i = 1; i < nunits; ++i)
3894 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3895 init_def = build_constructor (vectype, v);
3898 break;
3900 case MIN_EXPR:
3901 case MAX_EXPR:
3902 case COND_EXPR:
3903 if (adjustment_def)
3905 *adjustment_def = NULL_TREE;
3906 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3907 break;
3910 init_def = build_vector_from_val (vectype, init_value);
3911 break;
3913 default:
3914 gcc_unreachable ();
3917 return init_def;
3920 /* Function vect_create_epilog_for_reduction
3922 Create code at the loop-epilog to finalize the result of a reduction
3923 computation.
3925 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3926 reduction statements.
3927 STMT is the scalar reduction stmt that is being vectorized.
3928 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3929 number of elements that we can fit in a vectype (nunits). In this case
3930 we have to generate more than one vector stmt - i.e - we need to "unroll"
3931 the vector stmt by a factor VF/nunits. For more details see documentation
3932 in vectorizable_operation.
3933 REDUC_CODE is the tree-code for the epilog reduction.
3934 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3935 computation.
3936 REDUC_INDEX is the index of the operand in the right hand side of the
3937 statement that is defined by REDUCTION_PHI.
3938 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3939 SLP_NODE is an SLP node containing a group of reduction statements. The
3940 first one in this group is STMT.
3942 This function:
3943 1. Creates the reduction def-use cycles: sets the arguments for
3944 REDUCTION_PHIS:
3945 The loop-entry argument is the vectorized initial-value of the reduction.
3946 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3947 sums.
3948 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3949 by applying the operation specified by REDUC_CODE if available, or by
3950 other means (whole-vector shifts or a scalar loop).
3951 The function also creates a new phi node at the loop exit to preserve
3952 loop-closed form, as illustrated below.
3954 The flow at the entry to this function:
3956 loop:
3957 vec_def = phi <null, null> # REDUCTION_PHI
3958 VECT_DEF = vector_stmt # vectorized form of STMT
3959 s_loop = scalar_stmt # (scalar) STMT
3960 loop_exit:
3961 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3962 use <s_out0>
3963 use <s_out0>
3965 The above is transformed by this function into:
3967 loop:
3968 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3969 VECT_DEF = vector_stmt # vectorized form of STMT
3970 s_loop = scalar_stmt # (scalar) STMT
3971 loop_exit:
3972 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3973 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3974 v_out2 = reduce <v_out1>
3975 s_out3 = extract_field <v_out2, 0>
3976 s_out4 = adjust_result <s_out3>
3977 use <s_out4>
3978 use <s_out4>  */
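/* Editor's sketch (made-up values): if the loop produced partial sums
   VECT_DEF = { 10, 20, 30, 40 }, the epilog reduces them to 100
   (v_out2 / s_out3) and, when the Option1 initial vector was used,
   computes s_out4 = 100 + init_val; every former use of s_out0 then
   reads s_out4 instead.  */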
3981 static void
3982 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3983 int ncopies, enum tree_code reduc_code,
3984 vec<gimple> reduction_phis,
3985 int reduc_index, bool double_reduc,
3986 slp_tree slp_node)
3988 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3989 stmt_vec_info prev_phi_info;
3990 tree vectype;
3991 machine_mode mode;
3992 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3993 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3994 basic_block exit_bb;
3995 tree scalar_dest;
3996 tree scalar_type;
3997 gimple new_phi = NULL, phi;
3998 gimple_stmt_iterator exit_gsi;
3999 tree vec_dest;
4000 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4001 gimple epilog_stmt = NULL;
4002 enum tree_code code = gimple_assign_rhs_code (stmt);
4003 gimple exit_phi;
4004 tree bitsize;
4005 tree adjustment_def = NULL;
4006 tree vec_initial_def = NULL;
4007 tree reduction_op, expr, def;
4008 tree orig_name, scalar_result;
4009 imm_use_iterator imm_iter, phi_imm_iter;
4010 use_operand_p use_p, phi_use_p;
4011 gimple use_stmt, orig_stmt, reduction_phi = NULL;
4012 bool nested_in_vect_loop = false;
4013 auto_vec<gimple> new_phis;
4014 auto_vec<gimple> inner_phis;
4015 enum vect_def_type dt = vect_unknown_def_type;
4016 int j, i;
4017 auto_vec<tree> scalar_results;
4018 unsigned int group_size = 1, k, ratio;
4019 auto_vec<tree> vec_initial_defs;
4020 auto_vec<gimple> phis;
4021 bool slp_reduc = false;
4022 tree new_phi_result;
4023 gimple inner_phi = NULL;
4025 if (slp_node)
4026 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4028 if (nested_in_vect_loop_p (loop, stmt))
4030 outer_loop = loop;
4031 loop = loop->inner;
4032 nested_in_vect_loop = true;
4033 gcc_assert (!slp_node);
4036 reduction_op = get_reduction_op (stmt, reduc_index);
4038 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4039 gcc_assert (vectype);
4040 mode = TYPE_MODE (vectype);
4042 /* 1. Create the reduction def-use cycle:
4043 Set the arguments of REDUCTION_PHIS, i.e., transform
4045 loop:
4046 vec_def = phi <null, null> # REDUCTION_PHI
4047 VECT_DEF = vector_stmt # vectorized form of STMT
4050 into:
4052 loop:
4053 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4054 VECT_DEF = vector_stmt # vectorized form of STMT
4057 (in case of SLP, do it for all the phis). */
4059 /* Get the loop-entry arguments. */
4060 if (slp_node)
4061 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4062 NULL, slp_node, reduc_index);
4063 else
4065 vec_initial_defs.create (1);
4066 /* For the case of reduction, vect_get_vec_def_for_operand returns
4067 the scalar def before the loop, that defines the initial value
4068 of the reduction variable. */
4069 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4070 &adjustment_def);
4071 vec_initial_defs.quick_push (vec_initial_def);
4074 /* Set phi nodes arguments. */
4075 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4077 tree vec_init_def, def;
4078 gimple_seq stmts;
4079 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4080 true, NULL_TREE);
4081 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4082 def = vect_defs[i];
4083 for (j = 0; j < ncopies; j++)
4085 /* Set the loop-entry arg of the reduction-phi. */
4086 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4087 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4089 /* Set the loop-latch arg for the reduction-phi. */
4090 if (j > 0)
4091 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4093 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4094 UNKNOWN_LOCATION);
4096 if (dump_enabled_p ())
4098 dump_printf_loc (MSG_NOTE, vect_location,
4099 "transform reduction: created def-use cycle: ");
4100 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4101 dump_printf (MSG_NOTE, "\n");
4102 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4103 dump_printf (MSG_NOTE, "\n");
4106 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4110 /* 2. Create epilog code.
4111 The reduction epilog code operates across the elements of the vector
4112 of partial results computed by the vectorized loop.
4113 The reduction epilog code consists of:
4115 step 1: compute the scalar result in a vector (v_out2)
4116 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4117 step 3: adjust the scalar result (s_out3) if needed.
4119 Step 1 can be accomplished using one of the following three schemes:
4120 (scheme 1) using reduc_code, if available.
4121 (scheme 2) using whole-vector shifts, if available.
4122 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4123 combined.
4125 The overall epilog code looks like this:
4127 s_out0 = phi <s_loop> # original EXIT_PHI
4128 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4129 v_out2 = reduce <v_out1> # step 1
4130 s_out3 = extract_field <v_out2, 0> # step 2
4131 s_out4 = adjust_result <s_out3> # step 3
4133 (step 3 is optional, and steps 1 and 2 may be combined).
4134 Lastly, the uses of s_out0 are replaced by s_out4. */
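  /* Editor's note on the code below: scheme 1 is used when REDUC_CODE is
     not ERROR_MARK (and this is not an SLP reduction); scheme 2 when the
     target has a whole-vector shift for the mode and a vector handler for
     CODE (again non-SLP); and scheme 3, the scalar extraction loop,
     otherwise.  */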
4137 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4138 v_out1 = phi <VECT_DEF>
4139 Store them in NEW_PHIS. */
4141 exit_bb = single_exit (loop)->dest;
4142 prev_phi_info = NULL;
4143 new_phis.create (vect_defs.length ());
4144 FOR_EACH_VEC_ELT (vect_defs, i, def)
4146 for (j = 0; j < ncopies; j++)
4148 tree new_def = copy_ssa_name (def);
4149 phi = create_phi_node (new_def, exit_bb);
4150 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4151 if (j == 0)
4152 new_phis.quick_push (phi);
4153 else
4155 def = vect_get_vec_def_for_stmt_copy (dt, def);
4156 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4159 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4160 prev_phi_info = vinfo_for_stmt (phi);
4164 /* The epilogue is created for the outer-loop, i.e., for the loop being
4165 vectorized. Create exit phis for the outer loop. */
4166 if (double_reduc)
4168 loop = outer_loop;
4169 exit_bb = single_exit (loop)->dest;
4170 inner_phis.create (vect_defs.length ());
4171 FOR_EACH_VEC_ELT (new_phis, i, phi)
4173 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4174 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4175 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4176 PHI_RESULT (phi));
4177 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4178 loop_vinfo, NULL));
4179 inner_phis.quick_push (phi);
4180 new_phis[i] = outer_phi;
4181 prev_phi_info = vinfo_for_stmt (outer_phi);
4182 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4184 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4185 new_result = copy_ssa_name (PHI_RESULT (phi));
4186 outer_phi = create_phi_node (new_result, exit_bb);
4187 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4188 PHI_RESULT (phi));
4189 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4190 loop_vinfo, NULL));
4191 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4192 prev_phi_info = vinfo_for_stmt (outer_phi);
4197 exit_gsi = gsi_after_labels (exit_bb);
4199 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4200 (i.e. when reduc_code is not available) and in the final adjustment
4201 code (if needed). Also get the original scalar reduction variable as
4202 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4203 represents a reduction pattern), the tree-code and scalar-def are
4204 taken from the original stmt that the pattern-stmt (STMT) replaces.
4205 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4206 are taken from STMT. */
4208 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4209 if (!orig_stmt)
4211 /* Regular reduction */
4212 orig_stmt = stmt;
4214 else
4216 /* Reduction pattern */
4217 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4218 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4219 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4222 code = gimple_assign_rhs_code (orig_stmt);
4223 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4224 partial results are added and not subtracted. */
4225 if (code == MINUS_EXPR)
4226 code = PLUS_EXPR;
4228 scalar_dest = gimple_assign_lhs (orig_stmt);
4229 scalar_type = TREE_TYPE (scalar_dest);
4230 scalar_results.create (group_size);
4231 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4232 bitsize = TYPE_SIZE (scalar_type);
4234 /* In case this is a reduction in an inner-loop while vectorizing an outer
4235 loop - we don't need to extract a single scalar result at the end of the
4236 inner-loop (unless it is double reduction, i.e., the use of reduction is
4237 outside the outer-loop). The final vector of partial results will be used
4238 in the vectorized outer-loop, or reduced to a scalar result at the end of
4239 the outer-loop. */
4240 if (nested_in_vect_loop && !double_reduc)
4241 goto vect_finalize_reduction;
4243 /* SLP reduction without reduction chain, e.g.,
4244 # a1 = phi <a2, a0>
4245 # b1 = phi <b2, b0>
4246 a2 = operation (a1)
4247 b2 = operation (b1) */
4248 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4250 /* In case of reduction chain, e.g.,
4251 # a1 = phi <a3, a0>
4252 a2 = operation (a1)
4253 a3 = operation (a2),
4255 we may end up with more than one vector result. Here we reduce them to
4256 one vector. */
4257 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4259 tree first_vect = PHI_RESULT (new_phis[0]);
4260 tree tmp;
4261 gassign *new_vec_stmt = NULL;
4263 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4264 for (k = 1; k < new_phis.length (); k++)
4266 gimple next_phi = new_phis[k];
4267 tree second_vect = PHI_RESULT (next_phi);
4269 tmp = build2 (code, vectype, first_vect, second_vect);
4270 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4271 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4272 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4273 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4276 new_phi_result = first_vect;
4277 if (new_vec_stmt)
4279 new_phis.truncate (0);
4280 new_phis.safe_push (new_vec_stmt);
4283 else
4284 new_phi_result = PHI_RESULT (new_phis[0]);
4286 /* 2.3 Create the reduction code, using one of the three schemes described
4287 above. In SLP we simply need to extract all the elements from the
4288 vector (without reducing them), so we use scalar shifts. */
4289 if (reduc_code != ERROR_MARK && !slp_reduc)
4291 tree tmp;
4292 tree vec_elem_type;
4294 /*** Case 1: Create:
4295 v_out2 = reduc_expr <v_out1> */
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_NOTE, vect_location,
4299 "Reduce using direct vector reduction.\n");
4301 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4302 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4304 tree tmp_dest =
4305 vect_create_destination_var (scalar_dest, vec_elem_type);
4306 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4307 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4308 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4309 gimple_assign_set_lhs (epilog_stmt, new_temp);
4310 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4312 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4314 else
4315 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4316 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4317 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4318 gimple_assign_set_lhs (epilog_stmt, new_temp);
4319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4320 scalar_results.safe_push (new_temp);
4322 else
4324 bool reduce_with_shift = have_whole_vector_shift (mode);
4325 int element_bitsize = tree_to_uhwi (bitsize);
4326 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4327 tree vec_temp;
4329 /* Regardless of whether we have a whole vector shift, if we're
4330 emulating the operation via tree-vect-generic, we don't want
4331 to use it. Only the first round of the reduction is likely
4332 to still be profitable via emulation. */
4333 /* ??? It might be better to emit a reduction tree code here, so that
4334 tree-vect-generic can expand the first round via bit tricks. */
4335 if (!VECTOR_MODE_P (mode))
4336 reduce_with_shift = false;
4337 else
4339 optab optab = optab_for_tree_code (code, vectype, optab_default);
4340 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4341 reduce_with_shift = false;
4344 if (reduce_with_shift && !slp_reduc)
4346 int nelements = vec_size_in_bits / element_bitsize;
4347 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4349 int elt_offset;
4351 tree zero_vec = build_zero_cst (vectype);
4352 /*** Case 2: Create:
4353 for (offset = nelements/2; offset >= 1; offset/=2)
4355 Create: va' = vec_shift <va, offset>
4356 Create: va = vop <va, va'>
4357 } */
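          /* Editor's example (made-up values, assuming the permutation
             shifts elements toward element 0, where the result is
             extracted below): reducing va = { 1, 2, 3, 4 } with PLUS_EXPR
             takes two rounds:
               offset 2:  va' = { 3, 4, 0, 0 }   va = { 4, 6, 3, 4 }
               offset 1:  va' = { 6, 3, 4, 0 }   va = { 10, 9, 7, 4 }
             and the sum 10 ends up in element 0.  */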
4359 tree rhs;
4361 if (dump_enabled_p ())
4362 dump_printf_loc (MSG_NOTE, vect_location,
4363 "Reduce using vector shifts\n");
4365 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4366 new_temp = new_phi_result;
4367 for (elt_offset = nelements / 2;
4368 elt_offset >= 1;
4369 elt_offset /= 2)
4371 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4372 tree mask = vect_gen_perm_mask_any (vectype, sel);
4373 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4374 new_temp, zero_vec, mask);
4375 new_name = make_ssa_name (vec_dest, epilog_stmt);
4376 gimple_assign_set_lhs (epilog_stmt, new_name);
4377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4379 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4380 new_temp);
4381 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4382 gimple_assign_set_lhs (epilog_stmt, new_temp);
4383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4386 /* 2.4 Extract the final scalar result. Create:
4387 s_out3 = extract_field <v_out2, bitpos> */
4389 if (dump_enabled_p ())
4390 dump_printf_loc (MSG_NOTE, vect_location,
4391 "extract scalar result\n");
4393 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4394 bitsize, bitsize_zero_node);
4395 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4396 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4397 gimple_assign_set_lhs (epilog_stmt, new_temp);
4398 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4399 scalar_results.safe_push (new_temp);
4401 else
4403 /*** Case 3: Create:
4404 s = extract_field <v_out2, 0>
4405 for (offset = element_size;
4406 offset < vector_size;
4407 offset += element_size;)
4409 Create: s' = extract_field <v_out2, offset>
4410 Create: s = op <s, s'> // For non SLP cases
4411 } */
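          /* Editor's example (made-up values): for vec_temp = { 1, 2, 3, 4 }
             and PLUS_EXPR the non-SLP path below computes
               s = 1;  s = s + 2;  s = s + 3;  s = s + 4;   /+ s == 10 +/
             extracting each element with BIT_FIELD_REF, while the SLP path
             only pushes the four extracted values into SCALAR_RESULTS.  */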
4413 if (dump_enabled_p ())
4414 dump_printf_loc (MSG_NOTE, vect_location,
4415 "Reduce using scalar code.\n");
4417 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4418 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4420 int bit_offset;
4421 if (gimple_code (new_phi) == GIMPLE_PHI)
4422 vec_temp = PHI_RESULT (new_phi);
4423 else
4424 vec_temp = gimple_assign_lhs (new_phi);
4425 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4426 bitsize_zero_node);
4427 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4428 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4429 gimple_assign_set_lhs (epilog_stmt, new_temp);
4430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4432 /* In SLP we don't need to apply reduction operation, so we just
4433 collect s' values in SCALAR_RESULTS. */
4434 if (slp_reduc)
4435 scalar_results.safe_push (new_temp);
4437 for (bit_offset = element_bitsize;
4438 bit_offset < vec_size_in_bits;
4439 bit_offset += element_bitsize)
4441 tree bitpos = bitsize_int (bit_offset);
4442 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4443 bitsize, bitpos);
4445 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4446 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4447 gimple_assign_set_lhs (epilog_stmt, new_name);
4448 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4450 if (slp_reduc)
4452 /* In SLP we don't need to apply reduction operation, so
4453 we just collect s' values in SCALAR_RESULTS. */
4454 new_temp = new_name;
4455 scalar_results.safe_push (new_name);
4457 else
4459 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4460 new_name, new_temp);
4461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4462 gimple_assign_set_lhs (epilog_stmt, new_temp);
4463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4468 /* The only case where we need to reduce scalar results in SLP is
4469 unrolling. If the size of SCALAR_RESULTS is greater than
4470 GROUP_SIZE, we reduce them by combining elements modulo
4471 GROUP_SIZE. */
4472 if (slp_reduc)
4474 tree res, first_res, new_res;
4475 gimple new_stmt;
4477 /* Reduce multiple scalar results in case of SLP unrolling. */
4478 for (j = group_size; scalar_results.iterate (j, &res);
4479 j++)
4481 first_res = scalar_results[j % group_size];
4482 new_stmt = gimple_build_assign (new_scalar_dest, code,
4483 first_res, res);
4484 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4485 gimple_assign_set_lhs (new_stmt, new_res);
4486 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4487 scalar_results[j % group_size] = new_res;
4490 else
4491 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4492 scalar_results.safe_push (new_temp);
4496 vect_finalize_reduction:
4498 if (double_reduc)
4499 loop = loop->inner;
4501 /* 2.5 Adjust the final result by the initial value of the reduction
4502 variable. (When such adjustment is not needed, then
4503 'adjustment_def' is zero). For example, if code is PLUS we create:
4504 new_temp = loop_exit_def + adjustment_def */
4506 if (adjustment_def)
4508 gcc_assert (!slp_reduc);
4509 if (nested_in_vect_loop)
4511 new_phi = new_phis[0];
4512 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4513 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4514 new_dest = vect_create_destination_var (scalar_dest, vectype);
4516 else
4518 new_temp = scalar_results[0];
4519 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4520 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4521 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4524 epilog_stmt = gimple_build_assign (new_dest, expr);
4525 new_temp = make_ssa_name (new_dest, epilog_stmt);
4526 gimple_assign_set_lhs (epilog_stmt, new_temp);
4527 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4528 if (nested_in_vect_loop)
4530 set_vinfo_for_stmt (epilog_stmt,
4531 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4532 NULL));
4533 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4534 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4536 if (!double_reduc)
4537 scalar_results.quick_push (new_temp);
4538 else
4539 scalar_results[0] = new_temp;
4541 else
4542 scalar_results[0] = new_temp;
4544 new_phis[0] = epilog_stmt;
4547 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4548 phis with new adjusted scalar results, i.e., replace use <s_out0>
4549 with use <s_out4>.
4551 Transform:
4552 loop_exit:
4553 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4554 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4555 v_out2 = reduce <v_out1>
4556 s_out3 = extract_field <v_out2, 0>
4557 s_out4 = adjust_result <s_out3>
4558 use <s_out0>
4559 use <s_out0>
4561 into:
4563 loop_exit:
4564 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4565 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4566 v_out2 = reduce <v_out1>
4567 s_out3 = extract_field <v_out2, 0>
4568 s_out4 = adjust_result <s_out3>
4569 use <s_out4>
4570 use <s_out4> */
4573 /* In SLP reduction chain we reduce vector results into one vector if
4574 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4575 the last stmt in the reduction chain, since we are looking for the loop
4576 exit phi node. */
4577 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4579 scalar_dest = gimple_assign_lhs (
4580 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4581 group_size = 1;
4584 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4585 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4586 need to match SCALAR_RESULTS with corresponding statements. The first
4587 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4588 the first vector stmt, etc.
4589 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
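  /* Editor's example (made-up numbers): with GROUP_SIZE == 4 and two
     vector stmts in NEW_PHIS, RATIO == 2, so scalar results 0 and 1 are
     matched with the first vector stmt/phi and results 2 and 3 with the
     second one in the loop below.  */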
4590 if (group_size > new_phis.length ())
4592 ratio = group_size / new_phis.length ();
4593 gcc_assert (!(group_size % new_phis.length ()));
4595 else
4596 ratio = 1;
4598 for (k = 0; k < group_size; k++)
4600 if (k % ratio == 0)
4602 epilog_stmt = new_phis[k / ratio];
4603 reduction_phi = reduction_phis[k / ratio];
4604 if (double_reduc)
4605 inner_phi = inner_phis[k / ratio];
4608 if (slp_reduc)
4610 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4612 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4613 /* SLP statements can't participate in patterns. */
4614 gcc_assert (!orig_stmt);
4615 scalar_dest = gimple_assign_lhs (current_stmt);
4618 phis.create (3);
4619 /* Find the loop-closed-use at the loop exit of the original scalar
4620 result. (The reduction result is expected to have two immediate uses -
4621 one at the latch block, and one at the loop exit). */
4622 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4623 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4624 && !is_gimple_debug (USE_STMT (use_p)))
4625 phis.safe_push (USE_STMT (use_p));
4627 /* While we expect to have found an exit_phi because of loop-closed-ssa
4628 form we can end up without one if the scalar cycle is dead. */
4630 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4632 if (outer_loop)
4634 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4635 gphi *vect_phi;
4637 /* FORNOW. Currently not supporting the case that an inner-loop
4638 reduction is not used in the outer-loop (but only outside the
4639 outer-loop), unless it is double reduction. */
4640 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4641 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4642 || double_reduc);
4644 if (double_reduc)
4645 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4646 else
4647 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4648 if (!double_reduc
4649 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4650 != vect_double_reduction_def)
4651 continue;
4653 /* Handle double reduction:
4655 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4656 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4657 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4658 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4660 At that point the regular reduction (stmt2 and stmt3) is
4661 already vectorized, as well as the exit phi node, stmt4.
4662 Here we vectorize the phi node of double reduction, stmt1, and
4663 update all relevant statements. */
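          /* Editor's example of a double reduction at the source level
             (a sketch, not taken from the sources):

               int s = 0;
               for (i = 0; i < n; i++)        <- outer loop: stmt1, stmt4
                 for (j = 0; j < m; j++)      <- inner loop: stmt2, stmt3
                   s += a[i][j];                                          */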
4665 /* Go through all the uses of s2 to find double reduction phi
4666 node, i.e., stmt1 above. */
4667 orig_name = PHI_RESULT (exit_phi);
4668 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4670 stmt_vec_info use_stmt_vinfo;
4671 stmt_vec_info new_phi_vinfo;
4672 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4673 basic_block bb = gimple_bb (use_stmt);
4674 gimple use;
4676 /* Check that USE_STMT is really a double reduction phi
4677 node. */
4678 if (gimple_code (use_stmt) != GIMPLE_PHI
4679 || gimple_phi_num_args (use_stmt) != 2
4680 || bb->loop_father != outer_loop)
4681 continue;
4682 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4683 if (!use_stmt_vinfo
4684 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4685 != vect_double_reduction_def)
4686 continue;
4688 /* Create vector phi node for double reduction:
4689 vs1 = phi <vs0, vs2>
4690 vs1 was created previously in this function by a call to
4691 vect_get_vec_def_for_operand and is stored in
4692 vec_initial_def;
4693 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4694 vs0 is created here. */
4696 /* Create vector phi node. */
4697 vect_phi = create_phi_node (vec_initial_def, bb);
4698 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4699 loop_vec_info_for_loop (outer_loop), NULL);
4700 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4702 /* Create vs0 - initial def of the double reduction phi. */
4703 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4704 loop_preheader_edge (outer_loop));
4705 init_def = get_initial_def_for_reduction (stmt,
4706 preheader_arg, NULL);
4707 vect_phi_init = vect_init_vector (use_stmt, init_def,
4708 vectype, NULL);
4710 /* Update phi node arguments with vs0 and vs2. */
4711 add_phi_arg (vect_phi, vect_phi_init,
4712 loop_preheader_edge (outer_loop),
4713 UNKNOWN_LOCATION);
4714 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4715 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4716 if (dump_enabled_p ())
4718 dump_printf_loc (MSG_NOTE, vect_location,
4719 "created double reduction phi node: ");
4720 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4721 dump_printf (MSG_NOTE, "\n");
4724 vect_phi_res = PHI_RESULT (vect_phi);
4726 /* Replace the use, i.e., set the correct vs1 in the regular
4727 reduction phi node. FORNOW, NCOPIES is always 1, so the
4728 loop is redundant. */
4729 use = reduction_phi;
4730 for (j = 0; j < ncopies; j++)
4732 edge pr_edge = loop_preheader_edge (loop);
4733 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4734 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4740 phis.release ();
4741 if (nested_in_vect_loop)
4743 if (double_reduc)
4744 loop = outer_loop;
4745 else
4746 continue;
4749 phis.create (3);
4750 /* Find the loop-closed-use at the loop exit of the original scalar
4751 result. (The reduction result is expected to have two immediate uses,
4752 one at the latch block, and one at the loop exit). For double
4753 reductions we are looking for exit phis of the outer loop. */
4754 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4756 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4758 if (!is_gimple_debug (USE_STMT (use_p)))
4759 phis.safe_push (USE_STMT (use_p));
4761 else
4763 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4765 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4767 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4769 if (!flow_bb_inside_loop_p (loop,
4770 gimple_bb (USE_STMT (phi_use_p)))
4771 && !is_gimple_debug (USE_STMT (phi_use_p)))
4772 phis.safe_push (USE_STMT (phi_use_p));
4778 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4780 /* Replace the uses: */
4781 orig_name = PHI_RESULT (exit_phi);
4782 scalar_result = scalar_results[k];
4783 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4784 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4785 SET_USE (use_p, scalar_result);
4788 phis.release ();
4793 /* Function vectorizable_reduction.
4795 Check if STMT performs a reduction operation that can be vectorized.
4796 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4797 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4798 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4800 This function also handles reduction idioms (patterns) that have been
4801 recognized in advance during vect_pattern_recog. In this case, STMT may be
4802 of this form:
4803 X = pattern_expr (arg0, arg1, ..., X)
4804 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4805 sequence that had been detected and replaced by the pattern-stmt (STMT).
4807 In some cases of reduction patterns, the type of the reduction variable X is
4808 different than the type of the other arguments of STMT.
4809 In such cases, the vectype that is used when transforming STMT into a vector
4810 stmt is different than the vectype that is used to determine the
4811 vectorization factor, because it consists of a different number of elements
4812 than the actual number of elements that are being operated upon in parallel.
4814 For example, consider an accumulation of shorts into an int accumulator.
4815 On some targets it's possible to vectorize this pattern operating on 8
4816 shorts at a time (hence, the vectype for purposes of determining the
4817 vectorization factor should be V8HI); on the other hand, the vectype that
4818 is used to create the vector form is actually V4SI (the type of the result).
4820 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4821 indicates the actual level of parallelism (V8HI in the example), so
4822 that the right vectorization factor is derived. This vectype
4823 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4824 be used to create the vectorized stmt. The right vectype for the vectorized
4825 stmt is obtained from the type of the result X:
4826 get_vectype_for_scalar_type (TREE_TYPE (X))
4828 This means that, contrary to "regular" reductions (or "regular" stmts in
4829 general), the following equation:
4830 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4831 does *NOT* necessarily hold for reduction patterns. */
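/* Editor's restatement of the example above as source code (a sketch):

     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   The pattern stmt is  sum_1 = WIDEN_SUM_EXPR <a_elt, sum_0>;
   STMT_VINFO_VECTYPE is V8HI (it drives the vectorization factor), while
   the vectorized stmt itself is created with
   get_vectype_for_scalar_type (TREE_TYPE (sum_1)), i.e. V4SI.  */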
4833 bool
4834 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4835 gimple *vec_stmt, slp_tree slp_node)
4837 tree vec_dest;
4838 tree scalar_dest;
4839 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4840 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4841 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4842 tree vectype_in = NULL_TREE;
4843 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4844 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4845 enum tree_code code, orig_code, epilog_reduc_code;
4846 machine_mode vec_mode;
4847 int op_type;
4848 optab optab, reduc_optab;
4849 tree new_temp = NULL_TREE;
4850 tree def;
4851 gimple def_stmt;
4852 enum vect_def_type dt;
4853 gphi *new_phi = NULL;
4854 tree scalar_type;
4855 bool is_simple_use;
4856 gimple orig_stmt;
4857 stmt_vec_info orig_stmt_info;
4858 tree expr = NULL_TREE;
4859 int i;
4860 int ncopies;
4861 int epilog_copies;
4862 stmt_vec_info prev_stmt_info, prev_phi_info;
4863 bool single_defuse_cycle = false;
4864 tree reduc_def = NULL_TREE;
4865 gimple new_stmt = NULL;
4866 int j;
4867 tree ops[3];
4868 bool nested_cycle = false, found_nested_cycle_def = false;
4869 gimple reduc_def_stmt = NULL;
4870 bool double_reduc = false, dummy;
4871 basic_block def_bb;
4872 struct loop * def_stmt_loop, *outer_loop = NULL;
4873 tree def_arg;
4874 gimple def_arg_stmt;
4875 auto_vec<tree> vec_oprnds0;
4876 auto_vec<tree> vec_oprnds1;
4877 auto_vec<tree> vect_defs;
4878 auto_vec<gimple> phis;
4879 int vec_num;
4880 tree def0, def1, tem, op0, op1 = NULL_TREE;
4882 /* In case of reduction chain we switch to the first stmt in the chain, but
4883 we don't update STMT_INFO, since only the last stmt is marked as reduction
4884 and has reduction properties. */
4885 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4886 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4888 if (nested_in_vect_loop_p (loop, stmt))
4890 outer_loop = loop;
4891 loop = loop->inner;
4892 nested_cycle = true;
4895 /* 1. Is vectorizable reduction? */
4896 /* Not supportable if the reduction variable is used in the loop, unless
4897 it's a reduction chain. */
4898 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4899 && !GROUP_FIRST_ELEMENT (stmt_info))
4900 return false;
4902 /* Reductions that are not used even in an enclosing outer-loop,
4903 are expected to be "live" (used out of the loop). */
4904 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4905 && !STMT_VINFO_LIVE_P (stmt_info))
4906 return false;
4908 /* Make sure it was already recognized as a reduction computation. */
4909 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4910 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4911 return false;
4913 /* 2. Has this been recognized as a reduction pattern?
4915 Check if STMT represents a pattern that has been recognized
4916 in earlier analysis stages. For stmts that represent a pattern,
4917 the STMT_VINFO_RELATED_STMT field records the last stmt in
4918 the original sequence that constitutes the pattern. */
4920 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4921 if (orig_stmt)
4923 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4924 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4925 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4928 /* 3. Check the operands of the operation. The first operands are defined
4929 inside the loop body. The last operand is the reduction variable,
4930 which is defined by the loop-header-phi. */
4932 gcc_assert (is_gimple_assign (stmt));
4934 /* Flatten RHS. */
4935 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4937 case GIMPLE_SINGLE_RHS:
4938 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4939 if (op_type == ternary_op)
4941 tree rhs = gimple_assign_rhs1 (stmt);
4942 ops[0] = TREE_OPERAND (rhs, 0);
4943 ops[1] = TREE_OPERAND (rhs, 1);
4944 ops[2] = TREE_OPERAND (rhs, 2);
4945 code = TREE_CODE (rhs);
4947 else
4948 return false;
4949 break;
4951 case GIMPLE_BINARY_RHS:
4952 code = gimple_assign_rhs_code (stmt);
4953 op_type = TREE_CODE_LENGTH (code);
4954 gcc_assert (op_type == binary_op);
4955 ops[0] = gimple_assign_rhs1 (stmt);
4956 ops[1] = gimple_assign_rhs2 (stmt);
4957 break;
4959 case GIMPLE_TERNARY_RHS:
4960 code = gimple_assign_rhs_code (stmt);
4961 op_type = TREE_CODE_LENGTH (code);
4962 gcc_assert (op_type == ternary_op);
4963 ops[0] = gimple_assign_rhs1 (stmt);
4964 ops[1] = gimple_assign_rhs2 (stmt);
4965 ops[2] = gimple_assign_rhs3 (stmt);
4966 break;
4968 case GIMPLE_UNARY_RHS:
4969 return false;
4971 default:
4972 gcc_unreachable ();
4974 /* The default is that the reduction variable is the last in statement. */
4975 int reduc_index = op_type - 1;
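  /* Editor's example: for  s_1 = x_2 + s_0  ops[] is { x_2, s_0 }, so
     reduc_index starts as 1; the operand scan below may override it when
     a vect_nested_cycle definition is found.  */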
4977 if (code == COND_EXPR && slp_node)
4978 return false;
4980 scalar_dest = gimple_assign_lhs (stmt);
4981 scalar_type = TREE_TYPE (scalar_dest);
4982 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4983 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4984 return false;
4986 /* Do not try to vectorize bit-precision reductions. */
4987 if ((TYPE_PRECISION (scalar_type)
4988 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4989 return false;
4991 /* All uses but the last are expected to be defined in the loop.
4992 The last use is the reduction variable. In case of nested cycle this
4993 assumption is not true: we use reduc_index to record the index of the
4994 reduction variable. */
4995 for (i = 0; i < op_type - 1; i++)
4997 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4998 if (i == 0 && code == COND_EXPR)
4999 continue;
5001 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5002 &def_stmt, &def, &dt, &tem);
5003 if (!vectype_in)
5004 vectype_in = tem;
5005 gcc_assert (is_simple_use);
5007 if (dt != vect_internal_def
5008 && dt != vect_external_def
5009 && dt != vect_constant_def
5010 && dt != vect_induction_def
5011 && !(dt == vect_nested_cycle && nested_cycle))
5012 return false;
5014 if (dt == vect_nested_cycle)
5016 found_nested_cycle_def = true;
5017 reduc_def_stmt = def_stmt;
5018 reduc_index = i;
5022 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5023 &def_stmt, &def, &dt, &tem);
5024 if (!vectype_in)
5025 vectype_in = tem;
5026 gcc_assert (is_simple_use);
5027 if (!found_nested_cycle_def)
5028 reduc_def_stmt = def_stmt;
5030 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5031 return false;
5033 if (!(dt == vect_reduction_def
5034 || dt == vect_nested_cycle
5035 || ((dt == vect_internal_def || dt == vect_external_def
5036 || dt == vect_constant_def || dt == vect_induction_def)
5037 && nested_cycle && found_nested_cycle_def)))
5039 /* For pattern recognized stmts, orig_stmt might be a reduction,
5040 but some helper statements for the pattern might not, or
5041 might be COND_EXPRs with reduction uses in the condition. */
5042 gcc_assert (orig_stmt);
5043 return false;
5046 if (orig_stmt)
5047 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5048 reduc_def_stmt,
5049 !nested_cycle,
5050 &dummy));
5051 else
5053 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5054 !nested_cycle, &dummy);
5055 /* We changed STMT to be the first stmt in reduction chain, hence we
5056 check that in this case the first element in the chain is STMT. */
5057 gcc_assert (stmt == tmp
5058 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5061 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5062 return false;
5064 if (slp_node || PURE_SLP_STMT (stmt_info))
5065 ncopies = 1;
5066 else
5067 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5068 / TYPE_VECTOR_SUBPARTS (vectype_in));
5070 gcc_assert (ncopies >= 1);
5072 vec_mode = TYPE_MODE (vectype_in);
5074 if (code == COND_EXPR)
5076 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5078 if (dump_enabled_p ())
5079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5080 "unsupported condition in reduction\n");
5082 return false;
5085 else
5087 /* 4. Supportable by target? */
5089 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5090 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5092 /* Shifts and rotates are only supported by vectorizable_shifts,
5093 not vectorizable_reduction. */
5094 if (dump_enabled_p ())
5095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5096 "unsupported shift or rotation.\n");
5097 return false;
5100 /* 4.1. check support for the operation in the loop */
5101 optab = optab_for_tree_code (code, vectype_in, optab_default);
5102 if (!optab)
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5106 "no optab.\n");
5108 return false;
5111 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5113 if (dump_enabled_p ())
5114 dump_printf (MSG_NOTE, "op not supported by target.\n");
5116 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5117 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5118 < vect_min_worthwhile_factor (code))
5119 return false;
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5125 /* Worthwhile without SIMD support? */
5126 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5127 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5128 < vect_min_worthwhile_factor (code))
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "not worthwhile without SIMD support.\n");
5134 return false;
5138 /* 4.2. Check support for the epilog operation.
5140 If STMT represents a reduction pattern, then the type of the
5141 reduction variable may be different than the type of the rest
5142 of the arguments. For example, consider the case of accumulation
5143 of shorts into an int accumulator. The original code:
5144 S1: int_a = (int) short_a;
5145 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5147 was replaced with:
5148 STMT: int_acc = widen_sum <short_a, int_acc>
5150 This means that:
5151 1. The tree-code that is used to create the vector operation in the
5152 epilog code (that reduces the partial results) is not the
5153 tree-code of STMT, but is rather the tree-code of the original
5154 stmt from the pattern that STMT is replacing. I.e., in the example
5155 above we want to use 'widen_sum' in the loop, but 'plus' in the
5156 epilog.
5157 2. The type (mode) we use to check available target support
5158 for the vector operation to be created in the *epilog*, is
5159 determined by the type of the reduction variable (in the example
5160 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5161 However the type (mode) we use to check available target support
5162 for the vector operation to be created *inside the loop*, is
5163 determined by the type of the other arguments to STMT (in the
5164 example we'd check this: optab_handler (widen_sum_optab,
5165 vect_short_mode)).
5167 This is contrary to "regular" reductions, in which the types of all
5168 the arguments are the same as the type of the reduction variable.
5169 For "regular" reductions we can therefore use the same vector type
5170 (and also the same tree-code) when generating the epilog code and
5171 when generating the code inside the loop. */
5173 if (orig_stmt)
5175 /* This is a reduction pattern: get the vectype from the type of the
5176 reduction variable, and get the tree-code from orig_stmt. */
5177 orig_code = gimple_assign_rhs_code (orig_stmt);
5178 gcc_assert (vectype_out);
5179 vec_mode = TYPE_MODE (vectype_out);
5181 else
5183 /* Regular reduction: the same vectype and tree-code that are used for
5184 the vector code inside the loop can also be used for the epilog code. */
5185 orig_code = code;
5188 if (nested_cycle)
5190 def_bb = gimple_bb (reduc_def_stmt);
5191 def_stmt_loop = def_bb->loop_father;
5192 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5193 loop_preheader_edge (def_stmt_loop));
5194 if (TREE_CODE (def_arg) == SSA_NAME
5195 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5196 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5197 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5198 && vinfo_for_stmt (def_arg_stmt)
5199 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5200 == vect_double_reduction_def)
5201 double_reduc = true;
5204 epilog_reduc_code = ERROR_MARK;
5205 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5207 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5208 optab_default);
5209 if (!reduc_optab)
5211 if (dump_enabled_p ())
5212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5213 "no optab for reduction.\n");
5215 epilog_reduc_code = ERROR_MARK;
5217 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5219 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5220 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5222 if (dump_enabled_p ())
5223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5224 "reduc op not supported by target.\n");
5226 epilog_reduc_code = ERROR_MARK;
5230 else
5232 if (!nested_cycle || double_reduc)
5234 if (dump_enabled_p ())
5235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5236 "no reduc code for scalar code.\n");
5238 return false;
5242 if (double_reduc && ncopies > 1)
5244 if (dump_enabled_p ())
5245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5246 "multiple types in double reduction\n");
5248 return false;
5251 /* In case of widening multiplication by a constant, we update the type
5252 of the constant to be the type of the other operand. We check that the
5253 constant fits the type in the pattern recognition pass. */
5254 if (code == DOT_PROD_EXPR
5255 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5257 if (TREE_CODE (ops[0]) == INTEGER_CST)
5258 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5259 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5260 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5261 else
5263 if (dump_enabled_p ())
5264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5265 "invalid types in dot-prod\n");
5267 return false;
5271 if (!vec_stmt) /* transformation not required. */
5273 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5274 reduc_index))
5275 return false;
5276 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5277 return true;
5280 /** Transform. **/
5282 if (dump_enabled_p ())
5283 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5285 /* FORNOW: Multiple types are not supported for condition. */
5286 if (code == COND_EXPR)
5287 gcc_assert (ncopies == 1);
5289 /* Create the destination vector */
5290 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5292 /* In case the vectorization factor (VF) is bigger than the number
5293 of elements that we can fit in a vectype (nunits), we have to generate
5294 more than one vector stmt - i.e - we need to "unroll" the
5295 vector stmt by a factor VF/nunits. For more details see documentation
5296 in vectorizable_operation. */
5298 /* If the reduction is used in an outer loop we need to generate
5299 VF intermediate results, like so (e.g. for ncopies=2):
5300 r0 = phi (init, r0)
5301 r1 = phi (init, r1)
5302 r0 = x0 + r0;
5303 r1 = x1 + r1;
5304 (i.e. we generate VF results in 2 registers).
5305 In this case we have a separate def-use cycle for each copy, and therefore
5306 for each copy we get the vector def for the reduction variable from the
5307 respective phi node created for this copy.
5309 Otherwise (the reduction is unused in the loop nest), we can combine
5310 together intermediate results, like so (e.g. for ncopies=2):
5311 r = phi (init, r)
5312 r = x0 + r;
5313 r = x1 + r;
5314 (i.e. we generate VF/2 results in a single register).
5315 In this case for each copy we get the vector def for the reduction variable
5316 from the vectorized reduction operation generated in the previous iteration. */
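  /* Editor's note: the test below keys off STMT_VINFO_RELEVANT; a
     reduction that is vect_unused_in_scope uses the single def-use cycle
     (epilog_copies == 1), otherwise epilog_copies == ncopies.  */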
5319 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5321 single_defuse_cycle = true;
5322 epilog_copies = 1;
5324 else
5325 epilog_copies = ncopies;
5327 prev_stmt_info = NULL;
5328 prev_phi_info = NULL;
5329 if (slp_node)
5331 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5332 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5333 == TYPE_VECTOR_SUBPARTS (vectype_in));
5335 else
5337 vec_num = 1;
5338 vec_oprnds0.create (1);
5339 if (op_type == ternary_op)
5340 vec_oprnds1.create (1);
5343 phis.create (vec_num);
5344 vect_defs.create (vec_num);
5345 if (!slp_node)
5346 vect_defs.quick_push (NULL_TREE);
5348 for (j = 0; j < ncopies; j++)
5350 if (j == 0 || !single_defuse_cycle)
5352 for (i = 0; i < vec_num; i++)
5354 /* Create the reduction-phi that defines the reduction
5355 operand. */
5356 new_phi = create_phi_node (vec_dest, loop->header);
5357 set_vinfo_for_stmt (new_phi,
5358 new_stmt_vec_info (new_phi, loop_vinfo,
5359 NULL));
5360 if (j == 0 || slp_node)
5361 phis.quick_push (new_phi);
5365 if (code == COND_EXPR)
5367 gcc_assert (!slp_node);
5368 vectorizable_condition (stmt, gsi, vec_stmt,
5369 PHI_RESULT (phis[0]),
5370 reduc_index, NULL);
5371 /* Multiple types are not supported for condition. */
5372 break;
5375 /* Handle uses. */
5376 if (j == 0)
5378 op0 = ops[!reduc_index];
5379 if (op_type == ternary_op)
5381 if (reduc_index == 0)
5382 op1 = ops[2];
5383 else
5384 op1 = ops[1];
5387 if (slp_node)
5388 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5389 slp_node, -1);
5390 else
5392 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5393 stmt, NULL);
5394 vec_oprnds0.quick_push (loop_vec_def0);
5395 if (op_type == ternary_op)
5397 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5398 NULL);
5399 vec_oprnds1.quick_push (loop_vec_def1);
5403 else
5405 if (!slp_node)
5407 enum vect_def_type dt;
5408 gimple dummy_stmt;
5409 tree dummy;
5411 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5412 &dummy_stmt, &dummy, &dt);
5413 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5414 loop_vec_def0);
5415 vec_oprnds0[0] = loop_vec_def0;
5416 if (op_type == ternary_op)
5418 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5419 &dummy, &dt);
5420 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5421 loop_vec_def1);
5422 vec_oprnds1[0] = loop_vec_def1;
5426 if (single_defuse_cycle)
5427 reduc_def = gimple_assign_lhs (new_stmt);
5429 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5432 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5434 if (slp_node)
5435 reduc_def = PHI_RESULT (phis[i]);
5436 else
5438 if (!single_defuse_cycle || j == 0)
5439 reduc_def = PHI_RESULT (new_phi);
5442 def1 = ((op_type == ternary_op)
5443 ? vec_oprnds1[i] : NULL);
5444 if (op_type == binary_op)
5446 if (reduc_index == 0)
5447 expr = build2 (code, vectype_out, reduc_def, def0);
5448 else
5449 expr = build2 (code, vectype_out, def0, reduc_def);
5451 else
5453 if (reduc_index == 0)
5454 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5455 else
5457 if (reduc_index == 1)
5458 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5459 else
5460 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5464 new_stmt = gimple_build_assign (vec_dest, expr);
5465 new_temp = make_ssa_name (vec_dest, new_stmt);
5466 gimple_assign_set_lhs (new_stmt, new_temp);
5467 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5469 if (slp_node)
5471 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5472 vect_defs.quick_push (new_temp);
5474 else
5475 vect_defs[0] = new_temp;
5478 if (slp_node)
5479 continue;
5481 if (j == 0)
5482 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5483 else
5484 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5486 prev_stmt_info = vinfo_for_stmt (new_stmt);
5487 prev_phi_info = vinfo_for_stmt (new_phi);
5490 /* Finalize the reduction-phi (set its arguments) and create the
5491 epilog reduction code. */
5492 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5494 new_temp = gimple_assign_lhs (*vec_stmt);
5495 vect_defs[0] = new_temp;
5498 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5499 epilog_reduc_code, phis, reduc_index,
5500 double_reduc, slp_node);
5502 return true;
5505 /* Function vect_min_worthwhile_factor.
5507 For a loop where we could vectorize the operation indicated by CODE,
5508 return the minimum vectorization factor that makes it worthwhile
5509 to use generic vectors. */
5511 vect_min_worthwhile_factor (enum tree_code code)
5513 switch (code)
5515 case PLUS_EXPR:
5516 case MINUS_EXPR:
5517 case NEGATE_EXPR:
5518 return 4;
5520 case BIT_AND_EXPR:
5521 case BIT_IOR_EXPR:
5522 case BIT_XOR_EXPR:
5523 case BIT_NOT_EXPR:
5524 return 2;
5526 default:
5527 return INT_MAX;
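/* A sketch of the intended use of the factor above (hypothetical caller
   code, for illustration only):

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
            < vect_min_worthwhile_factor (code))
       return false;

   i.e. when the operation would have to be emulated with generic vectors,
   vectorization is rejected unless the vectorization factor reaches the
   value returned here.  */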
5532 /* Function vectorizable_induction
5534 Check if PHI performs an induction computation that can be vectorized.
5535 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5536 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5537 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
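/* An illustrative example (a sketch, assuming a 4-lane vector type): the
   induction variable in

     for (i = 0; i < n; i++)
       a[i] = i;

   is vectorized by replacing the scalar PHI for I with a vector PHI whose
   initial value is { 0, 1, 2, 3 } and which is advanced by { 4, 4, 4, 4 }
   in every vector iteration; get_initial_def_for_induction (used in the
   transform code below) builds these defs.  */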
5539 bool
5540 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5541 gimple *vec_stmt)
5543 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5544 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5545 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5546 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5547 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5548 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5549 tree vec_def;
5551 gcc_assert (ncopies >= 1);
5552 /* FORNOW. These restrictions should be relaxed. */
5553 if (nested_in_vect_loop_p (loop, phi))
5555 imm_use_iterator imm_iter;
5556 use_operand_p use_p;
5557 gimple exit_phi;
5558 edge latch_e;
5559 tree loop_arg;
5561 if (ncopies > 1)
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "multiple types in nested loop.\n");
5566 return false;
5569 exit_phi = NULL;
5570 latch_e = loop_latch_edge (loop->inner);
5571 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5572 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5574 gimple use_stmt = USE_STMT (use_p);
5575 if (is_gimple_debug (use_stmt))
5576 continue;
5578 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5580 exit_phi = use_stmt;
5581 break;
5584 if (exit_phi)
5586 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5587 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5588 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5590 if (dump_enabled_p ())
5591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5592 "inner-loop induction only used outside "
5593 "of the outer vectorized loop.\n");
5594 return false;
5599 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5600 return false;
5602 /* FORNOW: SLP not supported. */
5603 if (STMT_SLP_TYPE (stmt_info))
5604 return false;
5606 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5608 if (gimple_code (phi) != GIMPLE_PHI)
5609 return false;
5611 if (!vec_stmt) /* transformation not required. */
5613 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_NOTE, vect_location,
5616 "=== vectorizable_induction ===\n");
5617 vect_model_induction_cost (stmt_info, ncopies);
5618 return true;
5621 /** Transform. **/
5623 if (dump_enabled_p ())
5624 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5626 vec_def = get_initial_def_for_induction (phi);
5627 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5628 return true;
5631 /* Function vectorizable_live_operation.
5633 STMT computes a value that is used outside the loop. Check if
5634 it can be supported. */
5636 bool
5637 vectorizable_live_operation (gimple stmt,
5638 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5639 gimple *vec_stmt)
5641 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5642 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5643 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5644 int i;
5645 int op_type;
5646 tree op;
5647 tree def;
5648 gimple def_stmt;
5649 enum vect_def_type dt;
5650 enum tree_code code;
5651 enum gimple_rhs_class rhs_class;
5653 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5655 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5656 return false;
5658 if (!is_gimple_assign (stmt))
5660 if (gimple_call_internal_p (stmt)
5661 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5662 && gimple_call_lhs (stmt)
5663 && loop->simduid
5664 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5665 && loop->simduid
5666 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5668 edge e = single_exit (loop);
5669 basic_block merge_bb = e->dest;
5670 imm_use_iterator imm_iter;
5671 use_operand_p use_p;
5672 tree lhs = gimple_call_lhs (stmt);
5674 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5676 gimple use_stmt = USE_STMT (use_p);
5677 if (gimple_code (use_stmt) == GIMPLE_PHI
5678 && gimple_bb (use_stmt) == merge_bb)
5680 if (vec_stmt)
5682 tree vfm1
5683 = build_int_cst (unsigned_type_node,
5684 loop_vinfo->vectorization_factor - 1);
5685 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5687 return true;
5692 return false;
5695 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5696 return false;
5698 /* FORNOW. CHECKME. */
5699 if (nested_in_vect_loop_p (loop, stmt))
5700 return false;
5702 code = gimple_assign_rhs_code (stmt);
5703 op_type = TREE_CODE_LENGTH (code);
5704 rhs_class = get_gimple_rhs_class (code);
5705 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5706 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5708 /* FORNOW: support only if all uses are invariant. This means
5709 that the scalar operations can remain in place, unvectorized.
5710 The original last scalar value that they compute will be used. */
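   /* For example (an illustrative sketch), in

        for (i = 0; i < n; i++)
          last = x + y;
        ... = last;

      where X and Y are defined before the loop, the statement can stay
      scalar: the value it computes in the final iteration is exactly what
      the code after the loop consumes.  */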
5712 for (i = 0; i < op_type; i++)
5714 if (rhs_class == GIMPLE_SINGLE_RHS)
5715 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5716 else
5717 op = gimple_op (stmt, i + 1);
5718 if (op
5719 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5720 &dt))
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724 "use not simple.\n");
5725 return false;
5728 if (dt != vect_external_def && dt != vect_constant_def)
5729 return false;
5732 /* No transformation is required for the cases we currently support. */
5733 return true;
5736 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5738 static void
5739 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5741 ssa_op_iter op_iter;
5742 imm_use_iterator imm_iter;
5743 def_operand_p def_p;
5744 gimple ustmt;
5746 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5748 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5750 basic_block bb;
5752 if (!is_gimple_debug (ustmt))
5753 continue;
5755 bb = gimple_bb (ustmt);
5757 if (!flow_bb_inside_loop_p (loop, bb))
5759 if (gimple_debug_bind_p (ustmt))
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_NOTE, vect_location,
5763 "killing debug use\n");
5765 gimple_debug_bind_reset_value (ustmt);
5766 update_stmt (ustmt);
5768 else
5769 gcc_unreachable ();
5776 /* This function builds ni_name = number of iterations. Statements
5777 are emitted on the loop preheader edge. */
5779 static tree
5780 vect_build_loop_niters (loop_vec_info loop_vinfo)
5782 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5783 if (TREE_CODE (ni) == INTEGER_CST)
5784 return ni;
5785 else
5787 tree ni_name, var;
5788 gimple_seq stmts = NULL;
5789 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5791 var = create_tmp_var (TREE_TYPE (ni), "niters");
5792 ni_name = force_gimple_operand (ni, &stmts, false, var);
5793 if (stmts)
5794 gsi_insert_seq_on_edge_immediate (pe, stmts);
5796 return ni_name;
5801 /* This function generates the following statements:
5803 ni_name = number of iterations loop executes
5804 ratio = ni_name / vf
5805 ratio_mult_vf_name = ratio * vf
5807 and places them on the loop preheader edge. */
5809 static void
5810 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5811 tree ni_name,
5812 tree *ratio_mult_vf_name_ptr,
5813 tree *ratio_name_ptr)
5815 tree ni_minus_gap_name;
5816 tree var;
5817 tree ratio_name;
5818 tree ratio_mult_vf_name;
5819 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5820 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5821 tree log_vf;
5823 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5825 /* If an epilogue loop is required because of data accesses with gaps, we
5826 subtract one iteration from the total number of iterations here for
5827 correct calculation of RATIO. */
5828 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5830 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5831 ni_name,
5832 build_one_cst (TREE_TYPE (ni_name)));
5833 if (!is_gimple_val (ni_minus_gap_name))
5835 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5836 gimple stmts = NULL;
5837 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5838 true, var);
5839 gsi_insert_seq_on_edge_immediate (pe, stmts);
5842 else
5843 ni_minus_gap_name = ni_name;
5845 /* Create: ratio = ni >> log2(vf) */
5846 /* ??? As we have ni == number of latch executions + 1, ni could
5847 have overflowed to zero. So avoid computing the ratio based on ni
5848 but compute it using the fact that we know ratio will be at least
5849 one, thus via (ni - vf) >> log2(vf) + 1. */
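  /* Worked example (illustrative numbers): with vf == 4 and ni == 10,
     log_vf == 2 and

       ratio         = ((10 - 4) >> 2) + 1 = 2
       ratio_mult_vf = 2 << 2 = 8

     so the vectorized loop executes 2 iterations covering 8 scalar
     iterations and the epilogue loop handles the remaining 2.  */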
5850 ratio_name
5851 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5852 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5853 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5854 ni_minus_gap_name,
5855 build_int_cst
5856 (TREE_TYPE (ni_name), vf)),
5857 log_vf),
5858 build_int_cst (TREE_TYPE (ni_name), 1));
5859 if (!is_gimple_val (ratio_name))
5861 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5862 gimple stmts = NULL;
5863 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5864 gsi_insert_seq_on_edge_immediate (pe, stmts);
5866 *ratio_name_ptr = ratio_name;
5868 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5870 if (ratio_mult_vf_name_ptr)
5872 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5873 ratio_name, log_vf);
5874 if (!is_gimple_val (ratio_mult_vf_name))
5876 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5877 gimple stmts = NULL;
5878 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5879 true, var);
5880 gsi_insert_seq_on_edge_immediate (pe, stmts);
5882 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5885 return;
5889 /* Function vect_transform_loop.
5891 The analysis phase has determined that the loop is vectorizable.
5892 Vectorize the loop - create vectorized stmts to replace the scalar
5893 stmts in the loop, and update the loop exit condition. */
5895 void
5896 vect_transform_loop (loop_vec_info loop_vinfo)
5898 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5899 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5900 int nbbs = loop->num_nodes;
5901 int i;
5902 tree ratio = NULL;
5903 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5904 bool grouped_store;
5905 bool slp_scheduled = false;
5906 gimple stmt, pattern_stmt;
5907 gimple_seq pattern_def_seq = NULL;
5908 gimple_stmt_iterator pattern_def_si = gsi_none ();
5909 bool transform_pattern_stmt = false;
5910 bool check_profitability = false;
5911 int th;
5912 /* Record the number of iterations before we start tampering with the profile. */
5913 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5915 if (dump_enabled_p ())
5916 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5918 /* If the profile is imprecise, we have a chance to fix it up. */
5919 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5920 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5922 /* Use the more conservative vectorization threshold. If the number
5923 of iterations is constant, assume the cost check has been performed
5924 by our caller. If the threshold makes all loops profitable that
5925 run at least the vectorization factor number of times, checking
5926 is pointless, too. */
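  /* Illustrative numbers (a sketch): with a vectorization factor of 8,
     th == 20 means a loop with an unknown iteration count may enter the
     vectorized code yet run fewer than 20 iterations, so a runtime check
     is emitted; with th == 6 (< VF - 1) every loop that runs at least VF
     iterations already meets the threshold and the check is skipped.  */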
5927 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5928 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5929 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5931 if (dump_enabled_p ())
5932 dump_printf_loc (MSG_NOTE, vect_location,
5933 "Profitability threshold is %d loop iterations.\n",
5934 th);
5935 check_profitability = true;
5938 /* Version the loop first, if required, so the profitability check
5939 comes first. */
5941 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5942 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5944 vect_loop_versioning (loop_vinfo, th, check_profitability);
5945 check_profitability = false;
5948 tree ni_name = vect_build_loop_niters (loop_vinfo);
5949 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5951 /* Peel the loop if there are data refs with unknown alignment.
5952 Only one data ref with unknown alignment is allowed. */
5954 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5956 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5957 th, check_profitability);
5958 check_profitability = false;
5959 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5960 be re-computed. */
5961 ni_name = NULL_TREE;
5964 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5965 compile-time constant), or it is a constant that is not divisible by the
5966 vectorization factor, then an epilog loop needs to be created.
5967 We therefore duplicate the loop: the original loop will be vectorized,
5968 and will compute the first (n/VF) iterations. The second copy of the loop
5969 will remain scalar and will compute the remaining (n%VF) iterations.
5970 (VF is the vectorization factor). */
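  /* For instance (illustrative numbers), with n == 103 and VF == 8 the
     vectorized copy executes 103 / 8 == 12 iterations, covering 96 scalar
     iterations, and the scalar epilogue loop executes the remaining
     103 % 8 == 7 iterations.  */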
5972 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5973 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5975 tree ratio_mult_vf;
5976 if (!ni_name)
5977 ni_name = vect_build_loop_niters (loop_vinfo);
5978 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5979 &ratio);
5980 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5981 th, check_profitability);
5983 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5984 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5985 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5986 else
5988 if (!ni_name)
5989 ni_name = vect_build_loop_niters (loop_vinfo);
5990 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5993 /* 1) Make sure the loop header has exactly two entries
5994 2) Make sure we have a preheader basic block. */
5996 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5998 split_edge (loop_preheader_edge (loop));
6000 /* FORNOW: the vectorizer supports only loops whose body consists
6001 of one basic block (header + empty latch). When the vectorizer
6002 supports more involved loop forms, the order in which the BBs are
6003 traversed will need to be reconsidered. */
6005 for (i = 0; i < nbbs; i++)
6007 basic_block bb = bbs[i];
6008 stmt_vec_info stmt_info;
6010 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6011 gsi_next (&si))
6013 gphi *phi = si.phi ();
6014 if (dump_enabled_p ())
6016 dump_printf_loc (MSG_NOTE, vect_location,
6017 "------>vectorizing phi: ");
6018 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6019 dump_printf (MSG_NOTE, "\n");
6021 stmt_info = vinfo_for_stmt (phi);
6022 if (!stmt_info)
6023 continue;
6025 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6026 vect_loop_kill_debug_uses (loop, phi);
6028 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6029 && !STMT_VINFO_LIVE_P (stmt_info))
6030 continue;
6032 if (STMT_VINFO_VECTYPE (stmt_info)
6033 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6034 != (unsigned HOST_WIDE_INT) vectorization_factor)
6035 && dump_enabled_p ())
6036 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6038 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6040 if (dump_enabled_p ())
6041 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6042 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6046 pattern_stmt = NULL;
6047 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6048 !gsi_end_p (si) || transform_pattern_stmt;)
6050 bool is_store;
6052 if (transform_pattern_stmt)
6053 stmt = pattern_stmt;
6054 else
6056 stmt = gsi_stmt (si);
6057 /* During vectorization remove existing clobber stmts. */
6058 if (gimple_clobber_p (stmt))
6060 unlink_stmt_vdef (stmt);
6061 gsi_remove (&si, true);
6062 release_defs (stmt);
6063 continue;
6067 if (dump_enabled_p ())
6069 dump_printf_loc (MSG_NOTE, vect_location,
6070 "------>vectorizing statement: ");
6071 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6072 dump_printf (MSG_NOTE, "\n");
6075 stmt_info = vinfo_for_stmt (stmt);
6077 /* vector stmts created in the outer-loop during vectorization of
6078 stmts in an inner-loop may not have a stmt_info, and do not
6079 need to be vectorized. */
6080 if (!stmt_info)
6082 gsi_next (&si);
6083 continue;
6086 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6087 vect_loop_kill_debug_uses (loop, stmt);
6089 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6090 && !STMT_VINFO_LIVE_P (stmt_info))
6092 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6093 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6094 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6095 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6097 stmt = pattern_stmt;
6098 stmt_info = vinfo_for_stmt (stmt);
6100 else
6102 gsi_next (&si);
6103 continue;
6106 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6107 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6108 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6109 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6110 transform_pattern_stmt = true;
6112 /* If pattern statement has def stmts, vectorize them too. */
6113 if (is_pattern_stmt_p (stmt_info))
6115 if (pattern_def_seq == NULL)
6117 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6118 pattern_def_si = gsi_start (pattern_def_seq);
6120 else if (!gsi_end_p (pattern_def_si))
6121 gsi_next (&pattern_def_si);
6122 if (pattern_def_seq != NULL)
6124 gimple pattern_def_stmt = NULL;
6125 stmt_vec_info pattern_def_stmt_info = NULL;
6127 while (!gsi_end_p (pattern_def_si))
6129 pattern_def_stmt = gsi_stmt (pattern_def_si);
6130 pattern_def_stmt_info
6131 = vinfo_for_stmt (pattern_def_stmt);
6132 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6133 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6134 break;
6135 gsi_next (&pattern_def_si);
6138 if (!gsi_end_p (pattern_def_si))
6140 if (dump_enabled_p ())
6142 dump_printf_loc (MSG_NOTE, vect_location,
6143 "==> vectorizing pattern def "
6144 "stmt: ");
6145 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6146 pattern_def_stmt, 0);
6147 dump_printf (MSG_NOTE, "\n");
6150 stmt = pattern_def_stmt;
6151 stmt_info = pattern_def_stmt_info;
6153 else
6155 pattern_def_si = gsi_none ();
6156 transform_pattern_stmt = false;
6159 else
6160 transform_pattern_stmt = false;
6163 if (STMT_VINFO_VECTYPE (stmt_info))
6165 unsigned int nunits
6166 = (unsigned int)
6167 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6168 if (!STMT_SLP_TYPE (stmt_info)
6169 && nunits != (unsigned int) vectorization_factor
6170 && dump_enabled_p ())
6171 /* For SLP, VF is set according to the unrolling factor, and not
6172 to the vector size, hence for SLP this print is not valid. */
6173 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6176 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6177 reached. */
6178 if (STMT_SLP_TYPE (stmt_info))
6180 if (!slp_scheduled)
6182 slp_scheduled = true;
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_NOTE, vect_location,
6186 "=== scheduling SLP instances ===\n");
6188 vect_schedule_slp (loop_vinfo, NULL);
6191 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6192 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6194 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6196 pattern_def_seq = NULL;
6197 gsi_next (&si);
6199 continue;
6203 /* -------- vectorize statement ------------ */
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6207 grouped_store = false;
6208 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6209 if (is_store)
6211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6213 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6214 interleaving chain was completed - free all the stores in
6215 the chain. */
6216 gsi_next (&si);
6217 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6219 else
6221 /* Free the attached stmt_vec_info and remove the stmt. */
6222 gimple store = gsi_stmt (si);
6223 free_stmt_vec_info (store);
6224 unlink_stmt_vdef (store);
6225 gsi_remove (&si, true);
6226 release_defs (store);
6229 /* Stores can only appear at the end of pattern statements. */
6230 gcc_assert (!transform_pattern_stmt);
6231 pattern_def_seq = NULL;
6233 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6235 pattern_def_seq = NULL;
6236 gsi_next (&si);
6238 } /* stmts in BB */
6239 } /* BBs in loop */
6241 slpeel_make_loop_iterate_ntimes (loop, ratio);
6243 /* Reduce loop iterations by the vectorization factor. */
6244 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6245 expected_iterations / vectorization_factor);
6246 loop->nb_iterations_upper_bound
6247 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6248 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6249 && loop->nb_iterations_upper_bound != 0)
6250 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6251 if (loop->any_estimate)
6253 loop->nb_iterations_estimate
6254 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6255 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6256 && loop->nb_iterations_estimate != 0)
6257 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
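  /* E.g. (illustrative numbers), a loop with an estimated 100 iterations
     vectorized with a factor of 4 has its profile scaled by 1/4 and its
     iteration estimate and upper bound reduced to 25, minus one more when
     an epilogue is required because of gaps.  */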
6260 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_NOTE, vect_location,
6263 "LOOP VECTORIZED\n");
6264 if (loop->inner)
6265 dump_printf_loc (MSG_NOTE, vect_location,
6266 "OUTER LOOP VECTORIZED\n");
6267 dump_printf (MSG_NOTE, "\n");