gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-ssa.h"
  32 #include "tree-pass.h"
  33 #include "cfgloop.h"
  34 #include "expr.h"
  35 #include "recog.h"
  36 #include "optabs.h"
  37 #include "params.h"
  38 #include "diagnostic-core.h"
  39 #include "tree-chrec.h"
  40 #include "tree-scalar-evolution.h"
  41 #include "tree-vectorizer.h"
  42 #include "target.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs").  These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized.  It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S).  This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127    Targets that can support different sizes of vectors, for now will need
 128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 129    flexibility will be added in the future.
 130
 131         Since we only vectorize operations which vector form can be
 132    expressed using existing tree codes, to verify that an operation is
 133    supported, the vectorizer checks the relevant optab at the relevant
 134    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 135    the value found is CODE_FOR_nothing, then there's no target support, and
 136    we can't vectorize the stmt.
 137
 138    For additional information on this project see:
 139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 140 */
 141
 142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    We currently support vectorization of loops in which all types operated upon
 153    are of the same size.  Therefore this function currently sets VF according to
 154    the size of the types operated upon, and fails if there are multiple sizes
 155    in the loop.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
 184   gimple stmt, pattern_stmt = NULL;
 185   gimple_seq pattern_def_seq = NULL;
 186   gimple_stmt_iterator pattern_def_si = gsi_none ();
 187   bool analyze_pattern_stmt = false;
 188
 189   if (dump_enabled_p ())
 190     dump_printf_loc (MSG_NOTE, vect_location,
 191                      "=== vect_determine_vectorization_factor ===\n");
 192
 193   for (i = 0; i < nbbs; i++)
 194     {
 195       basic_block bb = bbs[i];
 196
 197       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 198         {
 199           phi = gsi_stmt (si);
 200           stmt_info = vinfo_for_stmt (phi);
 201           if (dump_enabled_p ())
 202             {
 203               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 204               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 205               dump_printf (MSG_NOTE, "\n");
 206             }
 207
 208           gcc_assert (stmt_info);
 209
 210           if (STMT_VINFO_RELEVANT_P (stmt_info))
 211             {
 212               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 213               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 214
 215               if (dump_enabled_p ())
 216                 {
 217                   dump_printf_loc (MSG_NOTE, vect_location,
 218                                    "get vectype for scalar type:  ");
 219                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 220                   dump_printf (MSG_NOTE, "\n");
 221                 }
 222
 223               vectype = get_vectype_for_scalar_type (scalar_type);
 224               if (!vectype)
 225                 {
 226                   if (dump_enabled_p ())
 227                     {
 228                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 229                                        "not vectorized: unsupported "
 230                                        "data-type ");
 231                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 232                                          scalar_type);
 233                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 234                     }
 235                   return false;
 236                 }
 237               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 238
 239               if (dump_enabled_p ())
 240                 {
 241                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 242                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 243                   dump_printf (MSG_NOTE, "\n");
 244                 }
 245
 246               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 247               if (dump_enabled_p ())
 248                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 249                                  nunits);
 250
 251               if (!vectorization_factor
 252                   || (nunits > vectorization_factor))
 253                 vectorization_factor = nunits;
 254             }
 255         }
 256
 257       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 258         {
 259           tree vf_vectype;
 260
 261           if (analyze_pattern_stmt)
 262             stmt = pattern_stmt;
 263           else
 264             stmt = gsi_stmt (si);
 265
 266           stmt_info = vinfo_for_stmt (stmt);
 267
 268           if (dump_enabled_p ())
 269             {
 270               dump_printf_loc (MSG_NOTE, vect_location,
 271                                "==> examining statement: ");
 272               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 273               dump_printf (MSG_NOTE, "\n");
 274             }
 275
 276           gcc_assert (stmt_info);
 277
 278           /* Skip stmts which do not need to be vectorized.  */
 279           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 280                && !STMT_VINFO_LIVE_P (stmt_info))
 281               || gimple_clobber_p (stmt))
 282             {
 283               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 284                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 285                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 286                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 287                 {
 288                   stmt = pattern_stmt;
 289                   stmt_info = vinfo_for_stmt (pattern_stmt);
 290                   if (dump_enabled_p ())
 291                     {
 292                       dump_printf_loc (MSG_NOTE, vect_location,
 293                                        "==> examining pattern statement: ");
 294                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 295                       dump_printf (MSG_NOTE, "\n");
 296                     }
 297                 }
 298               else
 299                 {
 300                   if (dump_enabled_p ())
 301                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 302                   gsi_next (&si);
 303                   continue;
 304                 }
 305             }
 306           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 307                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 308                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 309                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 310             analyze_pattern_stmt = true;
 311
 312           /* If a pattern statement has def stmts, analyze them too.  */
 313           if (is_pattern_stmt_p (stmt_info))
 314             {
 315               if (pattern_def_seq == NULL)
 316                 {
 317                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 318                   pattern_def_si = gsi_start (pattern_def_seq);
 319                 }
 320               else if (!gsi_end_p (pattern_def_si))
 321                 gsi_next (&pattern_def_si);
 322               if (pattern_def_seq != NULL)
 323                 {
 324                   gimple pattern_def_stmt = NULL;
 325                   stmt_vec_info pattern_def_stmt_info = NULL;
 326
 327                   while (!gsi_end_p (pattern_def_si))
 328                     {
 329                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 330                       pattern_def_stmt_info
 331                         = vinfo_for_stmt (pattern_def_stmt);
 332                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 333                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 334                         break;
 335                       gsi_next (&pattern_def_si);
 336                     }
 337
 338                   if (!gsi_end_p (pattern_def_si))
 339                     {
 340                       if (dump_enabled_p ())
 341                         {
 342                           dump_printf_loc (MSG_NOTE, vect_location,
 343                                            "==> examining pattern def stmt: ");
 344                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 345                                             pattern_def_stmt, 0);
 346                           dump_printf (MSG_NOTE, "\n");
 347                         }
 348
 349                       stmt = pattern_def_stmt;
 350                       stmt_info = pattern_def_stmt_info;
 351                     }
 352                   else
 353                     {
 354                       pattern_def_si = gsi_none ();
 355                       analyze_pattern_stmt = false;
 356                     }
 357                 }
 358               else
 359                 analyze_pattern_stmt = false;
 360             }
 361
 362           if (gimple_get_lhs (stmt) == NULL_TREE)
 363             {
 364               if (dump_enabled_p ())
 365                 {
 366                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 367                                    "not vectorized: irregular stmt.");
 368                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 369                                     0);
 370                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 371                 }
 372               return false;
 373             }
 374
 375           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 376             {
 377               if (dump_enabled_p ())
 378                 {
 379                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 380                                    "not vectorized: vector stmt in loop:");
 381                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 382                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 383                 }
 384               return false;
 385             }
 386
 387           if (STMT_VINFO_VECTYPE (stmt_info))
 388             {
 389               /* The only case when a vectype had been already set is for stmts
 390                  that contain a dataref, or for "pattern-stmts" (stmts
 391                  generated by the vectorizer to represent/replace a certain
 392                  idiom).  */
 393               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 394                           || is_pattern_stmt_p (stmt_info)
 395                           || !gsi_end_p (pattern_def_si));
 396               vectype = STMT_VINFO_VECTYPE (stmt_info);
 397             }
 398           else
 399             {
 400               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 401               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 402               if (dump_enabled_p ())
 403                 {
 404                   dump_printf_loc (MSG_NOTE, vect_location,
 405                                    "get vectype for scalar type:  ");
 406                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 407                   dump_printf (MSG_NOTE, "\n");
 408                 }
 409               vectype = get_vectype_for_scalar_type (scalar_type);
 410               if (!vectype)
 411                 {
 412                   if (dump_enabled_p ())
 413                     {
 414                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 415                                        "not vectorized: unsupported "
 416                                        "data-type ");
 417                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 418                                          scalar_type);
 419                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 420                     }
 421                   return false;
 422                 }
 423
 424               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 425
 426               if (dump_enabled_p ())
 427                 {
 428                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 429                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 430                   dump_printf (MSG_NOTE, "\n");
 431                 }
 432             }
 433
 434           /* The vectorization factor is according to the smallest
 435              scalar type (or the largest vector size, but we only
 436              support one vector size per loop).  */
 437           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 438                                                        &dummy);
 439           if (dump_enabled_p ())
 440             {
 441               dump_printf_loc (MSG_NOTE, vect_location,
 442                                "get vectype for scalar type:  ");
 443               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 444               dump_printf (MSG_NOTE, "\n");
 445             }
 446           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 447           if (!vf_vectype)
 448             {
 449               if (dump_enabled_p ())
 450                 {
 451                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 452                                    "not vectorized: unsupported data-type ");
 453                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 454                                      scalar_type);
 455                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 456                 }
 457               return false;
 458             }
 459
 460           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 461                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 462             {
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 466                                    "not vectorized: different sized vector "
 467                                    "types in statement, ");
 468                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 469                                      vectype);
 470                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 471                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 472                                      vf_vectype);
 473                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 474                 }
 475               return false;
 476             }
 477
 478           if (dump_enabled_p ())
 479             {
 480               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 481               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 482               dump_printf (MSG_NOTE, "\n");
 483             }
 484
 485           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 486           if (dump_enabled_p ())
 487             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 488           if (!vectorization_factor
 489               || (nunits > vectorization_factor))
 490             vectorization_factor = nunits;
 491
 492           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 493             {
 494               pattern_def_seq = NULL;
 495               gsi_next (&si);
 496             }
 497         }
 498     }
 499
 500   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 501   if (dump_enabled_p ())
 502     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 503                      vectorization_factor);
 504   if (vectorization_factor <= 1)
 505     {
 506       if (dump_enabled_p ())
 507         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 508                          "not vectorized: unsupported data-type\n");
 509       return false;
 510     }
 511   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 512
 513   return true;
 514 }
 515
 516
 517 /* Function vect_is_simple_iv_evolution.
 518
 519    FORNOW: A simple evolution of an induction variables in the loop is
 520    considered a polynomial evolution.  */
 521
 522 static bool
 523 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 524                              tree * step)
 525 {
 526   tree init_expr;
 527   tree step_expr;
 528   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 529   basic_block bb;
 530
 531   /* When there is no evolution in this loop, the evolution function
 532      is not "simple".  */
 533   if (evolution_part == NULL_TREE)
 534     return false;
 535
 536   /* When the evolution is a polynomial of degree >= 2
 537      the evolution function is not "simple".  */
 538   if (tree_is_chrec (evolution_part))
 539     return false;
 540
 541   step_expr = evolution_part;
 542   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 543
 544   if (dump_enabled_p ())
 545     {
 546       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 547       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 548       dump_printf (MSG_NOTE, ",  init: ");
 549       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 550       dump_printf (MSG_NOTE, "\n");
 551     }
 552
 553   *init = init_expr;
 554   *step = step_expr;
 555
 556   if (TREE_CODE (step_expr) != INTEGER_CST
 557       && (TREE_CODE (step_expr) != SSA_NAME
 558           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 559               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 560           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 561               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 562                   || !flag_associative_math)))
 563       && (TREE_CODE (step_expr) != REAL_CST
 564           || !flag_associative_math))
 565     {
 566       if (dump_enabled_p ())
 567         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 568                          "step unknown.\n");
 569       return false;
 570     }
 571
 572   return true;
 573 }
 574
 575 /* Function vect_analyze_scalar_cycles_1.
 576
 577    Examine the cross iteration def-use cycles of scalar variables
 578    in LOOP.  LOOP_VINFO represents the loop that is now being
 579    considered for vectorization (can be LOOP, or an outer-loop
 580    enclosing LOOP).  */
 581
 582 static void
 583 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 584 {
 585   basic_block bb = loop->header;
 586   tree init, step;
 587   vec<gimple> worklist;
 588   worklist.create (64);
 589   gimple_stmt_iterator gsi;
 590   bool double_reduc;
 591
 592   if (dump_enabled_p ())
 593     dump_printf_loc (MSG_NOTE, vect_location,
 594                      "=== vect_analyze_scalar_cycles ===\n");
 595
 596   /* First - identify all inductions.  Reduction detection assumes that all the
 597      inductions have been identified, therefore, this order must not be
 598      changed.  */
 599   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 600     {
 601       gimple phi = gsi_stmt (gsi);
 602       tree access_fn = NULL;
 603       tree def = PHI_RESULT (phi);
 604       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 605
 606       if (dump_enabled_p ())
 607         {
 608           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 609           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 610           dump_printf (MSG_NOTE, "\n");
 611         }
 612
 613       /* Skip virtual phi's.  The data dependences that are associated with
 614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 615       if (virtual_operand_p (def))
 616         continue;
 617
 618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 619
 620       /* Analyze the evolution function.  */
 621       access_fn = analyze_scalar_evolution (loop, def);
 622       if (access_fn)
 623         {
 624           STRIP_NOPS (access_fn);
 625           if (dump_enabled_p ())
 626             {
 627               dump_printf_loc (MSG_NOTE, vect_location,
 628                                "Access function of PHI: ");
 629               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 630               dump_printf (MSG_NOTE, "\n");
 631             }
 632           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 633             = evolution_part_in_loop_num (access_fn, loop->num);
 634         }
 635
 636       if (!access_fn
 637           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 638           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 639               && TREE_CODE (step) != INTEGER_CST))
 640         {
 641           worklist.safe_push (phi);
 642           continue;
 643         }
 644
 645       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 646
 647       if (dump_enabled_p ())
 648         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 649       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 650     }
 651
 652
 653   /* Second - identify all reductions and nested cycles.  */
 654   while (worklist.length () > 0)
 655     {
 656       gimple phi = worklist.pop ();
 657       tree def = PHI_RESULT (phi);
 658       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 659       gimple reduc_stmt;
 660       bool nested_cycle;
 661
 662       if (dump_enabled_p ())
 663         {
 664           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 665           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 666           dump_printf (MSG_NOTE, "\n");
 667         }
 668
 669       gcc_assert (!virtual_operand_p (def)
 670                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 671
 672       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 673       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 674                                                 &double_reduc);
 675       if (reduc_stmt)
 676         {
 677           if (double_reduc)
 678             {
 679               if (dump_enabled_p ())
 680                 dump_printf_loc (MSG_NOTE, vect_location,
 681                                  "Detected double reduction.\n");
 682
 683               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 684               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 685                                                     vect_double_reduction_def;
 686             }
 687           else
 688             {
 689               if (nested_cycle)
 690                 {
 691                   if (dump_enabled_p ())
 692                     dump_printf_loc (MSG_NOTE, vect_location,
 693                                      "Detected vectorizable nested cycle.\n");
 694
 695                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 696                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 697                                                              vect_nested_cycle;
 698                 }
 699               else
 700                 {
 701                   if (dump_enabled_p ())
 702                     dump_printf_loc (MSG_NOTE, vect_location,
 703                                      "Detected reduction.\n");
 704
 705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 706                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 707                                                            vect_reduction_def;
 708                   /* Store the reduction cycles for possible vectorization in
 709                      loop-aware SLP.  */
 710                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 711                 }
 712             }
 713         }
 714       else
 715         if (dump_enabled_p ())
 716           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 717                            "Unknown def-use cycle pattern.\n");
 718     }
 719
 720   worklist.release ();
 721 }
 722
 723
 724 /* Function vect_analyze_scalar_cycles.
 725
 726    Examine the cross iteration def-use cycles of scalar variables, by
 727    analyzing the loop-header PHIs of scalar variables.  Classify each
 728    cycle as one of the following: invariant, induction, reduction, unknown.
 729    We do that for the loop represented by LOOP_VINFO, and also to its
 730    inner-loop, if exists.
 731    Examples for scalar cycles:
 732
 733    Example1: reduction:
 734
 735               loop1:
 736               for (i=0; i<N; i++)
 737                  sum += a[i];
 738
 739    Example2: induction:
 740
 741               loop2:
 742               for (i=0; i<N; i++)
 743                  a[i] = i;  */
 744
 745 static void
 746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 747 {
 748   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 749
 750   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 751
 752   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 753      Reductions in such inner-loop therefore have different properties than
 754      the reductions in the nest that gets vectorized:
 755      1. When vectorized, they are executed in the same order as in the original
 756         scalar loop, so we can't change the order of computation when
 757         vectorizing them.
 758      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 759         current checks are too strict.  */
 760
 761   if (loop->inner)
 762     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 763 }
 764
 765 /* Function vect_get_loop_niters.
 766
 767    Determine how many iterations the loop is executed.
 768    If an expression that represents the number of iterations
 769    can be constructed, place it in NUMBER_OF_ITERATIONS.
 770    Return the loop exit condition.  */
 771
 772 static gimple
 773 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 774 {
 775   tree niters;
 776
 777   if (dump_enabled_p ())
 778     dump_printf_loc (MSG_NOTE, vect_location,
 779                      "=== get_loop_niters ===\n");
 780   niters = number_of_exit_cond_executions (loop);
 781
 782   if (niters != NULL_TREE
 783       && niters != chrec_dont_know)
 784     {
 785       *number_of_iterations = niters;
 786
 787       if (dump_enabled_p ())
 788         {
 789           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
 790           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
 791           dump_printf (MSG_NOTE, "\n");
 792         }
 793     }
 794
 795   return get_loop_exit_condition (loop);
 796 }
 797
 798
 799 /* Function bb_in_loop_p
 800
 801    Used as predicate for dfs order traversal of the loop bbs.  */
 802
 803 static bool
 804 bb_in_loop_p (const_basic_block bb, const void *data)
 805 {
 806   const struct loop *const loop = (const struct loop *)data;
 807   if (flow_bb_inside_loop_p (loop, bb))
 808     return true;
 809   return false;
 810 }
 811
 812
 813 /* Function new_loop_vec_info.
 814
 815    Create and initialize a new loop_vec_info struct for LOOP, as well as
 816    stmt_vec_info structs for all the stmts in LOOP.  */
 817
 818 static loop_vec_info
 819 new_loop_vec_info (struct loop *loop)
 820 {
 821   loop_vec_info res;
 822   basic_block *bbs;
 823   gimple_stmt_iterator si;
 824   unsigned int i, nbbs;
 825
 826   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 827   LOOP_VINFO_LOOP (res) = loop;
 828
 829   bbs = get_loop_body (loop);
 830
 831   /* Create/Update stmt_info for all stmts in the loop.  */
 832   for (i = 0; i < loop->num_nodes; i++)
 833     {
 834       basic_block bb = bbs[i];
 835
 836       /* BBs in a nested inner-loop will have been already processed (because
 837          we will have called vect_analyze_loop_form for any nested inner-loop).
 838          Therefore, for stmts in an inner-loop we just want to update the
 839          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 840          loop_info of the outer-loop we are currently considering to vectorize
 841          (instead of the loop_info of the inner-loop).
 842          For stmts in other BBs we need to create a stmt_info from scratch.  */
 843       if (bb->loop_father != loop)
 844         {
 845           /* Inner-loop bb.  */
 846           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 847           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 848             {
 849               gimple phi = gsi_stmt (si);
 850               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 851               loop_vec_info inner_loop_vinfo =
 852                 STMT_VINFO_LOOP_VINFO (stmt_info);
 853               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 854               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 855             }
 856           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 857            {
 858               gimple stmt = gsi_stmt (si);
 859               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 860               loop_vec_info inner_loop_vinfo =
 861                  STMT_VINFO_LOOP_VINFO (stmt_info);
 862               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 863               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 864            }
 865         }
 866       else
 867         {
 868           /* bb in current nest.  */
 869           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 870             {
 871               gimple phi = gsi_stmt (si);
 872               gimple_set_uid (phi, 0);
 873               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 874             }
 875
 876           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 877             {
 878               gimple stmt = gsi_stmt (si);
 879               gimple_set_uid (stmt, 0);
 880               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 881             }
 882         }
 883     }
 884
 885   /* CHECKME: We want to visit all BBs before their successors (except for
 886      latch blocks, for which this assertion wouldn't hold).  In the simple
 887      case of the loop forms we allow, a dfs order of the BBs would the same
 888      as reversed postorder traversal, so we are safe.  */
 889
 890    free (bbs);
 891    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 892    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 893                               bbs, loop->num_nodes, loop);
 894    gcc_assert (nbbs == loop->num_nodes);
 895
 896   LOOP_VINFO_BBS (res) = bbs;
 897   LOOP_VINFO_NITERS (res) = NULL;
 898   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 899   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 900   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 901   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 902   LOOP_VINFO_VECT_FACTOR (res) = 0;
 903   LOOP_VINFO_LOOP_NEST (res).create (3);
 904   LOOP_VINFO_DATAREFS (res).create (10);
 905   LOOP_VINFO_DDRS (res).create (10 * 10);
 906   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 907   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 908              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 909   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 910              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 911   LOOP_VINFO_GROUPED_STORES (res).create (10);
 912   LOOP_VINFO_REDUCTIONS (res).create (10);
 913   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 914   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 915   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 916   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 917   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 918   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 919
 920   return res;
 921 }
 922
 923
 924 /* Function destroy_loop_vec_info.
 925
 926    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 927    stmts in the loop.  */
 928
 929 void
 930 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 931 {
 932   struct loop *loop;
 933   basic_block *bbs;
 934   int nbbs;
 935   gimple_stmt_iterator si;
 936   int j;
 937   vec<slp_instance> slp_instances;
 938   slp_instance instance;
 939   bool swapped;
 940
 941   if (!loop_vinfo)
 942     return;
 943
 944   loop = LOOP_VINFO_LOOP (loop_vinfo);
 945
 946   bbs = LOOP_VINFO_BBS (loop_vinfo);
 947   nbbs = clean_stmts ? loop->num_nodes : 0;
 948   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 949
 950   for (j = 0; j < nbbs; j++)
 951     {
 952       basic_block bb = bbs[j];
 953       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 954         free_stmt_vec_info (gsi_stmt (si));
 955
 956       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 957         {
 958           gimple stmt = gsi_stmt (si);
 959
 960           /* We may have broken canonical form by moving a constant
 961              into RHS1 of a commutative op.  Fix such occurrences.  */
 962           if (swapped && is_gimple_assign (stmt))
 963             {
 964               enum tree_code code = gimple_assign_rhs_code (stmt);
 965
 966               if ((code == PLUS_EXPR
 967                    || code == POINTER_PLUS_EXPR
 968                    || code == MULT_EXPR)
 969                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 970                 swap_tree_operands (stmt,
 971                                     gimple_assign_rhs1_ptr (stmt),
 972                                     gimple_assign_rhs2_ptr (stmt));
 973             }
 974
 975           /* Free stmt_vec_info.  */
 976           free_stmt_vec_info (stmt);
 977           gsi_next (&si);
 978         }
 979     }
 980
 981   free (LOOP_VINFO_BBS (loop_vinfo));
 982   vect_destroy_datarefs (loop_vinfo, NULL);
 983   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 984   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 985   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 986   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 987   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 988   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 989     vect_free_slp_instance (instance);
 990
 991   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 992   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
 993   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
 994   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 995
 996   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
 997     LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
 998
 999   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1000
1001   free (loop_vinfo);
1002   loop->aux = NULL;
1003 }
1004
1005
1006 /* Function vect_analyze_loop_1.
1007
1008    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1009    for it. The different analyses will record information in the
1010    loop_vec_info struct.  This is a subset of the analyses applied in
1011    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1012    that is now considered for (outer-loop) vectorization.  */
1013
1014 static loop_vec_info
1015 vect_analyze_loop_1 (struct loop *loop)
1016 {
1017   loop_vec_info loop_vinfo;
1018
1019   if (dump_enabled_p ())
1020     dump_printf_loc (MSG_NOTE, vect_location,
1021                      "===== analyze_loop_nest_1 =====\n");
1022
1023   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1024
1025   loop_vinfo = vect_analyze_loop_form (loop);
1026   if (!loop_vinfo)
1027     {
1028       if (dump_enabled_p ())
1029         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030                          "bad inner-loop form.\n");
1031       return NULL;
1032     }
1033
1034   return loop_vinfo;
1035 }
1036
1037
1038 /* Function vect_analyze_loop_form.
1039
1040    Verify that certain CFG restrictions hold, including:
1041    - the loop has a pre-header
1042    - the loop has a single entry and exit
1043    - the loop exit condition is simple enough, and the number of iterations
1044      can be analyzed (a countable loop).  */
1045
1046 loop_vec_info
1047 vect_analyze_loop_form (struct loop *loop)
1048 {
1049   loop_vec_info loop_vinfo;
1050   gimple loop_cond;
1051   tree number_of_iterations = NULL;
1052   loop_vec_info inner_loop_vinfo = NULL;
1053
1054   if (dump_enabled_p ())
1055     dump_printf_loc (MSG_NOTE, vect_location,
1056                      "=== vect_analyze_loop_form ===\n");
1057
1058   /* Different restrictions apply when we are considering an inner-most loop,
1059      vs. an outer (nested) loop.
1060      (FORNOW. May want to relax some of these restrictions in the future).  */
1061
1062   if (!loop->inner)
1063     {
1064       /* Inner-most loop.  We currently require that the number of BBs is
1065          exactly 2 (the header and latch).  Vectorizable inner-most loops
1066          look like this:
1067
1068                         (pre-header)
1069                            |
1070                           header <--------+
1071                            | |            |
1072                            | +--> latch --+
1073                            |
1074                         (exit-bb)  */
1075
1076       if (loop->num_nodes != 2)
1077         {
1078           if (dump_enabled_p ())
1079             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080                              "not vectorized: control flow in loop.\n");
1081           return NULL;
1082         }
1083
1084       if (empty_block_p (loop->header))
1085     {
1086           if (dump_enabled_p ())
1087             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1088                              "not vectorized: empty loop.\n");
1089       return NULL;
1090     }
1091     }
1092   else
1093     {
1094       struct loop *innerloop = loop->inner;
1095       edge entryedge;
1096
1097       /* Nested loop. We currently require that the loop is doubly-nested,
1098          contains a single inner loop, and the number of BBs is exactly 5.
1099          Vectorizable outer-loops look like this:
1100
1101                         (pre-header)
1102                            |
1103                           header <---+
1104                            |         |
1105                           inner-loop |
1106                            |         |
1107                           tail ------+
1108                            |
1109                         (exit-bb)
1110
1111          The inner-loop has the properties expected of inner-most loops
1112          as described above.  */
1113
1114       if ((loop->inner)->inner || (loop->inner)->next)
1115         {
1116           if (dump_enabled_p ())
1117             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118                              "not vectorized: multiple nested loops.\n");
1119           return NULL;
1120         }
1121
1122       /* Analyze the inner-loop.  */
1123       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1124       if (!inner_loop_vinfo)
1125         {
1126           if (dump_enabled_p ())
1127             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128                              "not vectorized: Bad inner loop.\n");
1129           return NULL;
1130         }
1131
1132       if (!expr_invariant_in_loop_p (loop,
1133                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1134         {
1135           if (dump_enabled_p ())
1136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137                              "not vectorized: inner-loop count not"
1138                              " invariant.\n");
1139           destroy_loop_vec_info (inner_loop_vinfo, true);
1140           return NULL;
1141         }
1142
1143       if (loop->num_nodes != 5)
1144         {
1145           if (dump_enabled_p ())
1146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1147                              "not vectorized: control flow in loop.\n");
1148           destroy_loop_vec_info (inner_loop_vinfo, true);
1149           return NULL;
1150         }
1151
1152       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1153       entryedge = EDGE_PRED (innerloop->header, 0);
1154       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1155         entryedge = EDGE_PRED (innerloop->header, 1);
1156
1157       if (entryedge->src != loop->header
1158           || !single_exit (innerloop)
1159           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1160         {
1161           if (dump_enabled_p ())
1162             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163                              "not vectorized: unsupported outerloop form.\n");
1164           destroy_loop_vec_info (inner_loop_vinfo, true);
1165           return NULL;
1166         }
1167
1168       if (dump_enabled_p ())
1169         dump_printf_loc (MSG_NOTE, vect_location,
1170                          "Considering outer-loop vectorization.\n");
1171     }
1172
1173   if (!single_exit (loop)
1174       || EDGE_COUNT (loop->header->preds) != 2)
1175     {
1176       if (dump_enabled_p ())
1177         {
1178           if (!single_exit (loop))
1179             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180                              "not vectorized: multiple exits.\n");
1181           else if (EDGE_COUNT (loop->header->preds) != 2)
1182             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1183                              "not vectorized: too many incoming edges.\n");
1184         }
1185       if (inner_loop_vinfo)
1186         destroy_loop_vec_info (inner_loop_vinfo, true);
1187       return NULL;
1188     }
1189
1190   /* We assume that the loop exit condition is at the end of the loop. i.e,
1191      that the loop is represented as a do-while (with a proper if-guard
1192      before the loop if needed), where the loop header contains all the
1193      executable statements, and the latch is empty.  */
1194   if (!empty_block_p (loop->latch)
1195       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1196     {
1197       if (dump_enabled_p ())
1198         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                          "not vectorized: latch block not empty.\n");
1200       if (inner_loop_vinfo)
1201         destroy_loop_vec_info (inner_loop_vinfo, true);
1202       return NULL;
1203     }
1204
1205   /* Make sure there exists a single-predecessor exit bb:  */
1206   if (!single_pred_p (single_exit (loop)->dest))
1207     {
1208       edge e = single_exit (loop);
1209       if (!(e->flags & EDGE_ABNORMAL))
1210         {
1211           split_loop_exit_edge (e);
1212           if (dump_enabled_p ())
1213             dump_printf (MSG_NOTE, "split exit edge.\n");
1214         }
1215       else
1216         {
1217           if (dump_enabled_p ())
1218             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219                              "not vectorized: abnormal loop exit edge.\n");
1220           if (inner_loop_vinfo)
1221             destroy_loop_vec_info (inner_loop_vinfo, true);
1222           return NULL;
1223         }
1224     }
1225
1226   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1227   if (!loop_cond)
1228     {
1229       if (dump_enabled_p ())
1230         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231                          "not vectorized: complicated exit condition.\n");
1232       if (inner_loop_vinfo)
1233         destroy_loop_vec_info (inner_loop_vinfo, true);
1234       return NULL;
1235     }
1236
1237   if (!number_of_iterations)
1238     {
1239       if (dump_enabled_p ())
1240         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1241                          "not vectorized: number of iterations cannot be "
1242                          "computed.\n");
1243       if (inner_loop_vinfo)
1244         destroy_loop_vec_info (inner_loop_vinfo, true);
1245       return NULL;
1246     }
1247
1248   if (chrec_contains_undetermined (number_of_iterations))
1249     {
1250       if (dump_enabled_p ())
1251             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1252                              "Infinite number of iterations.\n");
1253       if (inner_loop_vinfo)
1254         destroy_loop_vec_info (inner_loop_vinfo, true);
1255       return NULL;
1256     }
1257
1258   if (!NITERS_KNOWN_P (number_of_iterations))
1259     {
1260       if (dump_enabled_p ())
1261         {
1262           dump_printf_loc (MSG_NOTE, vect_location,
1263                            "Symbolic number of iterations is ");
1264           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1265           dump_printf (MSG_NOTE, "\n");
1266         }
1267     }
1268   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1269     {
1270       if (dump_enabled_p ())
1271         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272                          "not vectorized: number of iterations = 0.\n");
1273       if (inner_loop_vinfo)
1274         destroy_loop_vec_info (inner_loop_vinfo, true);
1275       return NULL;
1276     }
1277
1278   loop_vinfo = new_loop_vec_info (loop);
1279   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1280   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1281
1282   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1283
1284   /* CHECKME: May want to keep it around it in the future.  */
1285   if (inner_loop_vinfo)
1286     destroy_loop_vec_info (inner_loop_vinfo, false);
1287
1288   gcc_assert (!loop->aux);
1289   loop->aux = loop_vinfo;
1290   return loop_vinfo;
1291 }
1292
1293
1294 /* Function vect_analyze_loop_operations.
1295
1296    Scan the loop stmts and make sure they are all vectorizable.  */
1297
1298 static bool
1299 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1300 {
1301   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1302   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1303   int nbbs = loop->num_nodes;
1304   gimple_stmt_iterator si;
1305   unsigned int vectorization_factor = 0;
1306   int i;
1307   gimple phi;
1308   stmt_vec_info stmt_info;
1309   bool need_to_vectorize = false;
1310   int min_profitable_iters;
1311   int min_scalar_loop_bound;
1312   unsigned int th;
1313   bool only_slp_in_loop = true, ok;
1314   HOST_WIDE_INT max_niter;
1315   HOST_WIDE_INT estimated_niter;
1316   int min_profitable_estimate;
1317
1318   if (dump_enabled_p ())
1319     dump_printf_loc (MSG_NOTE, vect_location,
1320                      "=== vect_analyze_loop_operations ===\n");
1321
1322   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1323   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1324   if (slp)
1325     {
1326       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1327          vectorization factor of the loop is the unrolling factor required by
1328          the SLP instances.  If that unrolling factor is 1, we say, that we
1329          perform pure SLP on loop - cross iteration parallelism is not
1330          exploited.  */
1331       for (i = 0; i < nbbs; i++)
1332         {
1333           basic_block bb = bbs[i];
1334           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1335             {
1336               gimple stmt = gsi_stmt (si);
1337               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1338               gcc_assert (stmt_info);
1339               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1340                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1341                   && !PURE_SLP_STMT (stmt_info))
1342                 /* STMT needs both SLP and loop-based vectorization.  */
1343                 only_slp_in_loop = false;
1344             }
1345         }
1346
1347       if (only_slp_in_loop)
1348         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1349       else
1350         vectorization_factor = least_common_multiple (vectorization_factor,
1351                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1352
1353       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1354       if (dump_enabled_p ())
1355         dump_printf_loc (MSG_NOTE, vect_location,
1356                          "Updating vectorization factor to %d\n",
1357                          vectorization_factor);
1358     }
1359
1360   for (i = 0; i < nbbs; i++)
1361     {
1362       basic_block bb = bbs[i];
1363
1364       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1365         {
1366           phi = gsi_stmt (si);
1367           ok = true;
1368
1369           stmt_info = vinfo_for_stmt (phi);
1370           if (dump_enabled_p ())
1371             {
1372               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1373               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1374               dump_printf (MSG_NOTE, "\n");
1375             }
1376
1377           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1378              (i.e., a phi in the tail of the outer-loop).  */
1379           if (! is_loop_header_bb_p (bb))
1380             {
1381               /* FORNOW: we currently don't support the case that these phis
1382                  are not used in the outerloop (unless it is double reduction,
1383                  i.e., this phi is vect_reduction_def), cause this case
1384                  requires to actually do something here.  */
1385               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1386                    || STMT_VINFO_LIVE_P (stmt_info))
1387                   && STMT_VINFO_DEF_TYPE (stmt_info)
1388                      != vect_double_reduction_def)
1389                 {
1390                   if (dump_enabled_p ())
1391                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392                                      "Unsupported loop-closed phi in "
1393                                      "outer-loop.\n");
1394                   return false;
1395                 }
1396
1397               /* If PHI is used in the outer loop, we check that its operand
1398                  is defined in the inner loop.  */
1399               if (STMT_VINFO_RELEVANT_P (stmt_info))
1400                 {
1401                   tree phi_op;
1402                   gimple op_def_stmt;
1403
1404                   if (gimple_phi_num_args (phi) != 1)
1405                     return false;
1406
1407                   phi_op = PHI_ARG_DEF (phi, 0);
1408                   if (TREE_CODE (phi_op) != SSA_NAME)
1409                     return false;
1410
1411                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1412                   if (gimple_nop_p (op_def_stmt)
1413                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1414                       || !vinfo_for_stmt (op_def_stmt))
1415                     return false;
1416
1417                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1418                         != vect_used_in_outer
1419                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1420                            != vect_used_in_outer_by_reduction)
1421                     return false;
1422                 }
1423
1424               continue;
1425             }
1426
1427           gcc_assert (stmt_info);
1428
1429           if (STMT_VINFO_LIVE_P (stmt_info))
1430             {
1431               /* FORNOW: not yet supported.  */
1432               if (dump_enabled_p ())
1433                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1434                                  "not vectorized: value used after loop.\n");
1435               return false;
1436             }
1437
1438           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1439               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1440             {
1441               /* A scalar-dependence cycle that we don't support.  */
1442               if (dump_enabled_p ())
1443                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1444                                  "not vectorized: scalar dependence cycle.\n");
1445               return false;
1446             }
1447
1448           if (STMT_VINFO_RELEVANT_P (stmt_info))
1449             {
1450               need_to_vectorize = true;
1451               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1452                 ok = vectorizable_induction (phi, NULL, NULL);
1453             }
1454
1455           if (!ok)
1456             {
1457               if (dump_enabled_p ())
1458                 {
1459                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1460                                    "not vectorized: relevant phi not "
1461                                    "supported: ");
1462                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1463                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1464                 }
1465               return false;
1466             }
1467         }
1468
1469       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1470         {
1471           gimple stmt = gsi_stmt (si);
1472           if (!gimple_clobber_p (stmt)
1473               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1474             return false;
1475         }
1476     } /* bbs */
1477
1478   /* All operations in the loop are either irrelevant (deal with loop
1479      control, or dead), or only used outside the loop and can be moved
1480      out of the loop (e.g. invariants, inductions).  The loop can be
1481      optimized away by scalar optimizations.  We're better off not
1482      touching this loop.  */
1483   if (!need_to_vectorize)
1484     {
1485       if (dump_enabled_p ())
1486         dump_printf_loc (MSG_NOTE, vect_location,
1487                          "All the computation can be taken out of the loop.\n");
1488       if (dump_enabled_p ())
1489         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490                          "not vectorized: redundant loop. no profit to "
1491                          "vectorize.\n");
1492       return false;
1493     }
1494
1495   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1496     dump_printf_loc (MSG_NOTE, vect_location,
1497                      "vectorization_factor = %d, niters = "
1498                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1499                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1500
1501   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1502        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1503       || ((max_niter = max_stmt_executions_int (loop)) != -1
1504           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1505     {
1506       if (dump_enabled_p ())
1507         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1508                          "not vectorized: iteration count too small.\n");
1509       if (dump_enabled_p ())
1510         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1511                          "not vectorized: iteration count smaller than "
1512                          "vectorization factor.\n");
1513       return false;
1514     }
1515
1516   /* Analyze cost.  Decide if worth while to vectorize.  */
1517
1518   /* Once VF is set, SLP costs should be updated since the number of created
1519      vector stmts depends on VF.  */
1520   vect_update_slp_costs_according_to_vf (loop_vinfo);
1521
1522   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1523                                       &min_profitable_estimate);
1524   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1525
1526   if (min_profitable_iters < 0)
1527     {
1528       if (dump_enabled_p ())
1529         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530                          "not vectorized: vectorization not profitable.\n");
1531       if (dump_enabled_p ())
1532         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                          "not vectorized: vector version will never be "
1534                          "profitable.\n");
1535       return false;
1536     }
1537
1538   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1539                             * vectorization_factor) - 1);
1540
1541
1542   /* Use the cost model only if it is more conservative than user specified
1543      threshold.  */
1544
1545   th = (unsigned) min_scalar_loop_bound;
1546   if (min_profitable_iters
1547       && (!min_scalar_loop_bound
1548           || min_profitable_iters > min_scalar_loop_bound))
1549     th = (unsigned) min_profitable_iters;
1550
1551   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1552       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1553     {
1554       if (dump_enabled_p ())
1555         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556                          "not vectorized: vectorization not profitable.\n");
1557       if (dump_enabled_p ())
1558         dump_printf_loc (MSG_NOTE, vect_location,
1559                          "not vectorized: iteration count smaller than user "
1560                          "specified loop bound parameter or minimum profitable "
1561                          "iterations (whichever is more conservative).\n");
1562       return false;
1563     }
1564
1565   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1566       && ((unsigned HOST_WIDE_INT) estimated_niter
1567           <= MAX (th, (unsigned)min_profitable_estimate)))
1568     {
1569       if (dump_enabled_p ())
1570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                          "not vectorized: estimated iteration count too "
1572                          "small.\n");
1573       if (dump_enabled_p ())
1574         dump_printf_loc (MSG_NOTE, vect_location,
1575                          "not vectorized: estimated iteration count smaller "
1576                          "than specified loop bound parameter or minimum "
1577                          "profitable iterations (whichever is more "
1578                          "conservative).\n");
1579       return false;
1580     }
1581
1582   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1583       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1584       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1585     {
1586       if (dump_enabled_p ())
1587         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.\n");
1588       if (!vect_can_advance_ivs_p (loop_vinfo))
1589         {
1590           if (dump_enabled_p ())
1591             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1592                              "not vectorized: can't create epilog loop 1.\n");
1593           return false;
1594         }
1595       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1596         {
1597           if (dump_enabled_p ())
1598             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599                              "not vectorized: can't create epilog loop 2.\n");
1600           return false;
1601         }
1602     }
1603
1604   return true;
1605 }
1606
1607
1608 /* Function vect_analyze_loop_2.
1609
1610    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1611    for it.  The different analyses will record information in the
1612    loop_vec_info struct.  */
1613 static bool
1614 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1615 {
1616   bool ok, slp = false;
1617   int max_vf = MAX_VECTORIZATION_FACTOR;
1618   int min_vf = 2;
1619
1620   /* Find all data references in the loop (which correspond to vdefs/vuses)
1621      and analyze their evolution in the loop.  Also adjust the minimal
1622      vectorization factor according to the loads and stores.
1623
1624      FORNOW: Handle only simple, array references, which
1625      alignment can be forced, and aligned pointer-references.  */
1626
1627   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1628   if (!ok)
1629     {
1630       if (dump_enabled_p ())
1631         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632                          "bad data references.\n");
1633       return false;
1634     }
1635
1636   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1637      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1638
1639   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1640   if (!ok)
1641     {
1642       if (dump_enabled_p ())
1643         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644                          "bad data access.\n");
1645       return false;
1646     }
1647
1648   /* Classify all cross-iteration scalar data-flow cycles.
1649      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1650
1651   vect_analyze_scalar_cycles (loop_vinfo);
1652
1653   vect_pattern_recog (loop_vinfo, NULL);
1654
1655   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1656
1657   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1658   if (!ok)
1659     {
1660       if (dump_enabled_p ())
1661         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662                          "unexpected pattern.\n");
1663       return false;
1664     }
1665
1666   /* Analyze data dependences between the data-refs in the loop
1667      and adjust the maximum vectorization factor according to
1668      the dependences.
1669      FORNOW: fail at the first data dependence that we encounter.  */
1670
1671   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1672   if (!ok
1673       || max_vf < min_vf)
1674     {
1675       if (dump_enabled_p ())
1676             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677                              "bad data dependence.\n");
1678       return false;
1679     }
1680
1681   ok = vect_determine_vectorization_factor (loop_vinfo);
1682   if (!ok)
1683     {
1684       if (dump_enabled_p ())
1685         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                          "can't determine vectorization factor.\n");
1687       return false;
1688     }
1689   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1690     {
1691       if (dump_enabled_p ())
1692         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1693                          "bad data dependence.\n");
1694       return false;
1695     }
1696
1697   /* Analyze the alignment of the data-refs in the loop.
1698      Fail if a data reference is found that cannot be vectorized.  */
1699
1700   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1701   if (!ok)
1702     {
1703       if (dump_enabled_p ())
1704         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1705                          "bad data alignment.\n");
1706       return false;
1707     }
1708
1709   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1710      It is important to call pruning after vect_analyze_data_ref_accesses,
1711      since we use grouping information gathered by interleaving analysis.  */
1712   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1713   if (!ok)
1714     {
1715       if (dump_enabled_p ())
1716         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717                          "too long list of versioning for alias "
1718                          "run-time tests.\n");
1719       return false;
1720     }
1721
1722   /* This pass will decide on using loop versioning and/or loop peeling in
1723      order to enhance the alignment of data references in the loop.  */
1724
1725   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1726   if (!ok)
1727     {
1728       if (dump_enabled_p ())
1729         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                          "bad data alignment.\n");
1731       return false;
1732     }
1733
1734   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1735   ok = vect_analyze_slp (loop_vinfo, NULL);
1736   if (ok)
1737     {
1738       /* Decide which possible SLP instances to SLP.  */
1739       slp = vect_make_slp_decision (loop_vinfo);
1740
1741       /* Find stmts that need to be both vectorized and SLPed.  */
1742       vect_detect_hybrid_slp (loop_vinfo);
1743     }
1744   else
1745     return false;
1746
1747   /* Scan all the operations in the loop and make sure they are
1748      vectorizable.  */
1749
1750   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1751   if (!ok)
1752     {
1753       if (dump_enabled_p ())
1754         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755                          "bad operation or unsupported loop bound.\n");
1756       return false;
1757     }
1758
1759   return true;
1760 }
1761
1762 /* Function vect_analyze_loop.
1763
1764    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1765    for it.  The different analyses will record information in the
1766    loop_vec_info struct.  */
1767 loop_vec_info
1768 vect_analyze_loop (struct loop *loop)
1769 {
1770   loop_vec_info loop_vinfo;
1771   unsigned int vector_sizes;
1772
1773   /* Autodetect first vector size we try.  */
1774   current_vector_size = 0;
1775   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1776
1777   if (dump_enabled_p ())
1778     dump_printf_loc (MSG_NOTE, vect_location,
1779                      "===== analyze_loop_nest =====\n");
1780
1781   if (loop_outer (loop)
1782       && loop_vec_info_for_loop (loop_outer (loop))
1783       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1784     {
1785       if (dump_enabled_p ())
1786         dump_printf_loc (MSG_NOTE, vect_location,
1787                          "outer-loop already vectorized.\n");
1788       return NULL;
1789     }
1790
1791   while (1)
1792     {
1793       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1794       loop_vinfo = vect_analyze_loop_form (loop);
1795       if (!loop_vinfo)
1796         {
1797           if (dump_enabled_p ())
1798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799                              "bad loop form.\n");
1800           return NULL;
1801         }
1802
1803       if (vect_analyze_loop_2 (loop_vinfo))
1804         {
1805           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1806
1807           return loop_vinfo;
1808         }
1809
1810       destroy_loop_vec_info (loop_vinfo, true);
1811
1812       vector_sizes &= ~current_vector_size;
1813       if (vector_sizes == 0
1814           || current_vector_size == 0)
1815         return NULL;
1816
1817       /* Try the next biggest vector size.  */
1818       current_vector_size = 1 << floor_log2 (vector_sizes);
1819       if (dump_enabled_p ())
1820         dump_printf_loc (MSG_NOTE, vect_location,
1821                          "***** Re-trying analysis with "
1822                          "vector size %d\n", current_vector_size);
1823     }
1824 }
1825
1826
1827 /* Function reduction_code_for_scalar_code
1828
1829    Input:
1830    CODE - tree_code of a reduction operations.
1831
1832    Output:
1833    REDUC_CODE - the corresponding tree-code to be used to reduce the
1834       vector of partial results into a single scalar result (which
1835       will also reside in a vector) or ERROR_MARK if the operation is
1836       a supported reduction operation, but does not have such tree-code.
1837
1838    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1839
1840 static bool
1841 reduction_code_for_scalar_code (enum tree_code code,
1842                                 enum tree_code *reduc_code)
1843 {
1844   switch (code)
1845     {
1846       case MAX_EXPR:
1847         *reduc_code = REDUC_MAX_EXPR;
1848         return true;
1849
1850       case MIN_EXPR:
1851         *reduc_code = REDUC_MIN_EXPR;
1852         return true;
1853
1854       case PLUS_EXPR:
1855         *reduc_code = REDUC_PLUS_EXPR;
1856         return true;
1857
1858       case MULT_EXPR:
1859       case MINUS_EXPR:
1860       case BIT_IOR_EXPR:
1861       case BIT_XOR_EXPR:
1862       case BIT_AND_EXPR:
1863         *reduc_code = ERROR_MARK;
1864         return true;
1865
1866       default:
1867        return false;
1868     }
1869 }
1870
1871
1872 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1873    STMT is printed with a message MSG. */
1874
1875 static void
1876 report_vect_op (int msg_type, gimple stmt, const char *msg)
1877 {
1878   dump_printf_loc (msg_type, vect_location, "%s", msg);
1879   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1880   dump_printf (msg_type, "\n");
1881 }
1882
1883
1884 /* Detect SLP reduction of the form:
1885
1886    #a1 = phi <a5, a0>
1887    a2 = operation (a1)
1888    a3 = operation (a2)
1889    a4 = operation (a3)
1890    a5 = operation (a4)
1891
1892    #a = phi <a5>
1893
1894    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1895    FIRST_STMT is the first reduction stmt in the chain
1896    (a2 = operation (a1)).
1897
1898    Return TRUE if a reduction chain was detected.  */
1899
1900 static bool
1901 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1902 {
1903   struct loop *loop = (gimple_bb (phi))->loop_father;
1904   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1905   enum tree_code code;
1906   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1907   stmt_vec_info use_stmt_info, current_stmt_info;
1908   tree lhs;
1909   imm_use_iterator imm_iter;
1910   use_operand_p use_p;
1911   int nloop_uses, size = 0, n_out_of_loop_uses;
1912   bool found = false;
1913
1914   if (loop != vect_loop)
1915     return false;
1916
1917   lhs = PHI_RESULT (phi);
1918   code = gimple_assign_rhs_code (first_stmt);
1919   while (1)
1920     {
1921       nloop_uses = 0;
1922       n_out_of_loop_uses = 0;
1923       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1924         {
1925           gimple use_stmt = USE_STMT (use_p);
1926           if (is_gimple_debug (use_stmt))
1927             continue;
1928
1929           use_stmt = USE_STMT (use_p);
1930
1931           /* Check if we got back to the reduction phi.  */
1932           if (use_stmt == phi)
1933             {
1934               loop_use_stmt = use_stmt;
1935               found = true;
1936               break;
1937             }
1938
1939           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1940             {
1941               if (vinfo_for_stmt (use_stmt)
1942                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1943                 {
1944                   loop_use_stmt = use_stmt;
1945                   nloop_uses++;
1946                 }
1947             }
1948            else
1949              n_out_of_loop_uses++;
1950
1951            /* There are can be either a single use in the loop or two uses in
1952               phi nodes.  */
1953            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1954              return false;
1955         }
1956
1957       if (found)
1958         break;
1959
1960       /* We reached a statement with no loop uses.  */
1961       if (nloop_uses == 0)
1962         return false;
1963
1964       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1965       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1966         return false;
1967
1968       if (!is_gimple_assign (loop_use_stmt)
1969           || code != gimple_assign_rhs_code (loop_use_stmt)
1970           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1971         return false;
1972
1973       /* Insert USE_STMT into reduction chain.  */
1974       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1975       if (current_stmt)
1976         {
1977           current_stmt_info = vinfo_for_stmt (current_stmt);
1978           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1979           GROUP_FIRST_ELEMENT (use_stmt_info)
1980             = GROUP_FIRST_ELEMENT (current_stmt_info);
1981         }
1982       else
1983         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1984
1985       lhs = gimple_assign_lhs (loop_use_stmt);
1986       current_stmt = loop_use_stmt;
1987       size++;
1988    }
1989
1990   if (!found || loop_use_stmt != phi || size < 2)
1991     return false;
1992
1993   /* Swap the operands, if needed, to make the reduction operand be the second
1994      operand.  */
1995   lhs = PHI_RESULT (phi);
1996   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1997   while (next_stmt)
1998     {
1999       if (gimple_assign_rhs2 (next_stmt) == lhs)
2000         {
2001           tree op = gimple_assign_rhs1 (next_stmt);
2002           gimple def_stmt = NULL;
2003
2004           if (TREE_CODE (op) == SSA_NAME)
2005             def_stmt = SSA_NAME_DEF_STMT (op);
2006
2007           /* Check that the other def is either defined in the loop
2008              ("vect_internal_def"), or it's an induction (defined by a
2009              loop-header phi-node).  */
2010           if (def_stmt
2011               && gimple_bb (def_stmt)
2012               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2013               && (is_gimple_assign (def_stmt)
2014                   || is_gimple_call (def_stmt)
2015                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2016                            == vect_induction_def
2017                   || (gimple_code (def_stmt) == GIMPLE_PHI
2018                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2019                                   == vect_internal_def
2020                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2021             {
2022               lhs = gimple_assign_lhs (next_stmt);
2023               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2024               continue;
2025             }
2026
2027           return false;
2028         }
2029       else
2030         {
2031           tree op = gimple_assign_rhs2 (next_stmt);
2032           gimple def_stmt = NULL;
2033
2034           if (TREE_CODE (op) == SSA_NAME)
2035             def_stmt = SSA_NAME_DEF_STMT (op);
2036
2037           /* Check that the other def is either defined in the loop
2038             ("vect_internal_def"), or it's an induction (defined by a
2039             loop-header phi-node).  */
2040           if (def_stmt
2041               && gimple_bb (def_stmt)
2042               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2043               && (is_gimple_assign (def_stmt)
2044                   || is_gimple_call (def_stmt)
2045                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2046                               == vect_induction_def
2047                   || (gimple_code (def_stmt) == GIMPLE_PHI
2048                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2049                                   == vect_internal_def
2050                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2051             {
2052               if (dump_enabled_p ())
2053                 {
2054                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2055                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2056                   dump_printf (MSG_NOTE, "\n");
2057                 }
2058
2059               swap_tree_operands (next_stmt,
2060                                   gimple_assign_rhs1_ptr (next_stmt),
2061                                   gimple_assign_rhs2_ptr (next_stmt));
2062               update_stmt (next_stmt);
2063
2064               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2065                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2066             }
2067           else
2068             return false;
2069         }
2070
2071       lhs = gimple_assign_lhs (next_stmt);
2072       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2073     }
2074
2075   /* Save the chain for further analysis in SLP detection.  */
2076   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2077   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2078   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2079
2080   return true;
2081 }
2082
2083
2084 /* Function vect_is_simple_reduction_1
2085
2086    (1) Detect a cross-iteration def-use cycle that represents a simple
2087    reduction computation.  We look for the following pattern:
2088
2089    loop_header:
2090      a1 = phi < a0, a2 >
2091      a3 = ...
2092      a2 = operation (a3, a1)
2093
2094    such that:
2095    1. operation is commutative and associative and it is safe to
2096       change the order of the computation (if CHECK_REDUCTION is true)
2097    2. no uses for a2 in the loop (a2 is used out of the loop)
2098    3. no uses of a1 in the loop besides the reduction operation
2099    4. no uses of a1 outside the loop.
2100
2101    Conditions 1,4 are tested here.
2102    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2103
2104    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2105    nested cycles, if CHECK_REDUCTION is false.
2106
2107    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2108    reductions:
2109
2110      a1 = phi < a0, a2 >
2111      inner loop (def of a3)
2112      a2 = phi < a3 >
2113
2114    If MODIFY is true it tries also to rework the code in-place to enable
2115    detection of more reduction patterns.  For the time being we rewrite
2116    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2117 */
2118
2119 static gimple
2120 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2121                             bool check_reduction, bool *double_reduc,
2122                             bool modify)
2123 {
2124   struct loop *loop = (gimple_bb (phi))->loop_father;
2125   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2126   edge latch_e = loop_latch_edge (loop);
2127   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2128   gimple def_stmt, def1 = NULL, def2 = NULL;
2129   enum tree_code orig_code, code;
2130   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2131   tree type;
2132   int nloop_uses;
2133   tree name;
2134   imm_use_iterator imm_iter;
2135   use_operand_p use_p;
2136   bool phi_def;
2137
2138   *double_reduc = false;
2139
2140   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2141      otherwise, we assume outer loop vectorization.  */
2142   gcc_assert ((check_reduction && loop == vect_loop)
2143               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2144
2145   name = PHI_RESULT (phi);
2146   nloop_uses = 0;
2147   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2148     {
2149       gimple use_stmt = USE_STMT (use_p);
2150       if (is_gimple_debug (use_stmt))
2151         continue;
2152
2153       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2154         {
2155           if (dump_enabled_p ())
2156             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157                              "intermediate value used outside loop.\n");
2158
2159           return NULL;
2160         }
2161
2162       if (vinfo_for_stmt (use_stmt)
2163           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2164         nloop_uses++;
2165       if (nloop_uses > 1)
2166         {
2167           if (dump_enabled_p ())
2168             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169                              "reduction used in loop.\n");
2170           return NULL;
2171         }
2172     }
2173
2174   if (TREE_CODE (loop_arg) != SSA_NAME)
2175     {
2176       if (dump_enabled_p ())
2177         {
2178           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2179                            "reduction: not ssa_name: ");
2180           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2181           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2182         }
2183       return NULL;
2184     }
2185
2186   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2187   if (!def_stmt)
2188     {
2189       if (dump_enabled_p ())
2190         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191                          "reduction: no def_stmt.\n");
2192       return NULL;
2193     }
2194
2195   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2196     {
2197       if (dump_enabled_p ())
2198         {
2199           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2200           dump_printf (MSG_NOTE, "\n");
2201         }
2202       return NULL;
2203     }
2204
2205   if (is_gimple_assign (def_stmt))
2206     {
2207       name = gimple_assign_lhs (def_stmt);
2208       phi_def = false;
2209     }
2210   else
2211     {
2212       name = PHI_RESULT (def_stmt);
2213       phi_def = true;
2214     }
2215
2216   nloop_uses = 0;
2217   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2218     {
2219       gimple use_stmt = USE_STMT (use_p);
2220       if (is_gimple_debug (use_stmt))
2221         continue;
2222       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2223           && vinfo_for_stmt (use_stmt)
2224           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2225         nloop_uses++;
2226       if (nloop_uses > 1)
2227         {
2228           if (dump_enabled_p ())
2229             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230                              "reduction used in loop.\n");
2231           return NULL;
2232         }
2233     }
2234
2235   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2236      defined in the inner loop.  */
2237   if (phi_def)
2238     {
2239       op1 = PHI_ARG_DEF (def_stmt, 0);
2240
2241       if (gimple_phi_num_args (def_stmt) != 1
2242           || TREE_CODE (op1) != SSA_NAME)
2243         {
2244           if (dump_enabled_p ())
2245             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246                              "unsupported phi node definition.\n");
2247
2248           return NULL;
2249         }
2250
2251       def1 = SSA_NAME_DEF_STMT (op1);
2252       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2253           && loop->inner
2254           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2255           && is_gimple_assign (def1))
2256         {
2257           if (dump_enabled_p ())
2258             report_vect_op (MSG_NOTE, def_stmt,
2259                             "detected double reduction: ");
2260
2261           *double_reduc = true;
2262           return def_stmt;
2263         }
2264
2265       return NULL;
2266     }
2267
2268   code = orig_code = gimple_assign_rhs_code (def_stmt);
2269
2270   /* We can handle "res -= x[i]", which is non-associative by
2271      simply rewriting this into "res += -x[i]".  Avoid changing
2272      gimple instruction for the first simple tests and only do this
2273      if we're allowed to change code at all.  */
2274   if (code == MINUS_EXPR
2275       && modify
2276       && (op1 = gimple_assign_rhs1 (def_stmt))
2277       && TREE_CODE (op1) == SSA_NAME
2278       && SSA_NAME_DEF_STMT (op1) == phi)
2279     code = PLUS_EXPR;
2280
2281   if (check_reduction
2282       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2283     {
2284       if (dump_enabled_p ())
2285         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2286                         "reduction: not commutative/associative: ");
2287       return NULL;
2288     }
2289
2290   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2291     {
2292       if (code != COND_EXPR)
2293         {
2294           if (dump_enabled_p ())
2295             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2296                             "reduction: not binary operation: ");
2297
2298           return NULL;
2299         }
2300
2301       op3 = gimple_assign_rhs1 (def_stmt);
2302       if (COMPARISON_CLASS_P (op3))
2303         {
2304           op4 = TREE_OPERAND (op3, 1);
2305           op3 = TREE_OPERAND (op3, 0);
2306         }
2307
2308       op1 = gimple_assign_rhs2 (def_stmt);
2309       op2 = gimple_assign_rhs3 (def_stmt);
2310
2311       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2312         {
2313           if (dump_enabled_p ())
2314             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2315                             "reduction: uses not ssa_names: ");
2316
2317           return NULL;
2318         }
2319     }
2320   else
2321     {
2322       op1 = gimple_assign_rhs1 (def_stmt);
2323       op2 = gimple_assign_rhs2 (def_stmt);
2324
2325       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2326         {
2327           if (dump_enabled_p ())
2328             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2329                             "reduction: uses not ssa_names: ");
2330
2331           return NULL;
2332         }
2333    }
2334
2335   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2336   if ((TREE_CODE (op1) == SSA_NAME
2337        && !types_compatible_p (type,TREE_TYPE (op1)))
2338       || (TREE_CODE (op2) == SSA_NAME
2339           && !types_compatible_p (type, TREE_TYPE (op2)))
2340       || (op3 && TREE_CODE (op3) == SSA_NAME
2341           && !types_compatible_p (type, TREE_TYPE (op3)))
2342       || (op4 && TREE_CODE (op4) == SSA_NAME
2343           && !types_compatible_p (type, TREE_TYPE (op4))))
2344     {
2345       if (dump_enabled_p ())
2346         {
2347           dump_printf_loc (MSG_NOTE, vect_location,
2348                            "reduction: multiple types: operation type: ");
2349           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2350           dump_printf (MSG_NOTE, ", operands types: ");
2351           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2352                              TREE_TYPE (op1));
2353           dump_printf (MSG_NOTE, ",");
2354           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2355                              TREE_TYPE (op2));
2356           if (op3)
2357             {
2358               dump_printf (MSG_NOTE, ",");
2359               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2360                                  TREE_TYPE (op3));
2361             }
2362
2363           if (op4)
2364             {
2365               dump_printf (MSG_NOTE, ",");
2366               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2367                                  TREE_TYPE (op4));
2368             }
2369           dump_printf (MSG_NOTE, "\n");
2370         }
2371
2372       return NULL;
2373     }
2374
2375   /* Check that it's ok to change the order of the computation.
2376      Generally, when vectorizing a reduction we change the order of the
2377      computation.  This may change the behavior of the program in some
2378      cases, so we need to check that this is ok.  One exception is when
2379      vectorizing an outer-loop: the inner-loop is executed sequentially,
2380      and therefore vectorizing reductions in the inner-loop during
2381      outer-loop vectorization is safe.  */
2382
2383   /* CHECKME: check for !flag_finite_math_only too?  */
2384   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2385       && check_reduction)
2386     {
2387       /* Changing the order of operations changes the semantics.  */
2388       if (dump_enabled_p ())
2389         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2390                         "reduction: unsafe fp math optimization: ");
2391       return NULL;
2392     }
2393   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2394            && check_reduction)
2395     {
2396       /* Changing the order of operations changes the semantics.  */
2397       if (dump_enabled_p ())
2398         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2399                         "reduction: unsafe int math optimization: ");
2400       return NULL;
2401     }
2402   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2403     {
2404       /* Changing the order of operations changes the semantics.  */
2405       if (dump_enabled_p ())
2406         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2407                         "reduction: unsafe fixed-point math optimization: ");
2408       return NULL;
2409     }
2410
2411   /* If we detected "res -= x[i]" earlier, rewrite it into
2412      "res += -x[i]" now.  If this turns out to be useless reassoc
2413      will clean it up again.  */
2414   if (orig_code == MINUS_EXPR)
2415     {
2416       tree rhs = gimple_assign_rhs2 (def_stmt);
2417       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2418       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2419                                                          rhs, NULL);
2420       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2421       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2422                                                           loop_info, NULL));
2423       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2424       gimple_assign_set_rhs2 (def_stmt, negrhs);
2425       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2426       update_stmt (def_stmt);
2427     }
2428
2429   /* Reduction is safe. We're dealing with one of the following:
2430      1) integer arithmetic and no trapv
2431      2) floating point arithmetic, and special flags permit this optimization
2432      3) nested cycle (i.e., outer loop vectorization).  */
2433   if (TREE_CODE (op1) == SSA_NAME)
2434     def1 = SSA_NAME_DEF_STMT (op1);
2435
2436   if (TREE_CODE (op2) == SSA_NAME)
2437     def2 = SSA_NAME_DEF_STMT (op2);
2438
2439   if (code != COND_EXPR
2440       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2441     {
2442       if (dump_enabled_p ())
2443         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2444       return NULL;
2445     }
2446
2447   /* Check that one def is the reduction def, defined by PHI,
2448      the other def is either defined in the loop ("vect_internal_def"),
2449      or it's an induction (defined by a loop-header phi-node).  */
2450
2451   if (def2 && def2 == phi
2452       && (code == COND_EXPR
2453           || !def1 || gimple_nop_p (def1)
2454           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2455               && (is_gimple_assign (def1)
2456                   || is_gimple_call (def1)
2457                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2458                       == vect_induction_def
2459                   || (gimple_code (def1) == GIMPLE_PHI
2460                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2461                           == vect_internal_def
2462                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2463     {
2464       if (dump_enabled_p ())
2465         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2466       return def_stmt;
2467     }
2468
2469   if (def1 && def1 == phi
2470       && (code == COND_EXPR
2471           || !def2 || gimple_nop_p (def2)
2472           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2473               && (is_gimple_assign (def2)
2474                   || is_gimple_call (def2)
2475                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2476                       == vect_induction_def
2477                   || (gimple_code (def2) == GIMPLE_PHI
2478                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2479                           == vect_internal_def
2480                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2481     {
2482       if (check_reduction)
2483         {
2484           /* Swap operands (just for simplicity - so that the rest of the code
2485              can assume that the reduction variable is always the last (second)
2486              argument).  */
2487           if (dump_enabled_p ())
2488             report_vect_op (MSG_NOTE, def_stmt,
2489                             "detected reduction: need to swap operands: ");
2490
2491           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2492                               gimple_assign_rhs2_ptr (def_stmt));
2493
2494           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2495             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2496         }
2497       else
2498         {
2499           if (dump_enabled_p ())
2500             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2501         }
2502
2503       return def_stmt;
2504     }
2505
2506   /* Try to find SLP reduction chain.  */
2507   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2508     {
2509       if (dump_enabled_p ())
2510         report_vect_op (MSG_NOTE, def_stmt,
2511                         "reduction: detected reduction chain: ");
2512
2513       return def_stmt;
2514     }
2515
2516   if (dump_enabled_p ())
2517     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2518                     "reduction: unknown pattern: ");
2519
2520   return NULL;
2521 }
2522
2523 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2524    in-place.  Arguments as there.  */
2525
2526 static gimple
2527 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2528                           bool check_reduction, bool *double_reduc)
2529 {
2530   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2531                                      double_reduc, false);
2532 }
2533
2534 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2535    in-place if it enables detection of more reductions.  Arguments
2536    as there.  */
2537
2538 gimple
2539 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2540                           bool check_reduction, bool *double_reduc)
2541 {
2542   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2543                                      double_reduc, true);
2544 }
2545
2546 /* Calculate the cost of one scalar iteration of the loop.  */
2547 int
2548 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2549 {
2550   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2551   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2552   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2553   int innerloop_iters, i, stmt_cost;
2554
2555   /* Count statements in scalar loop.  Using this as scalar cost for a single
2556      iteration for now.
2557
2558      TODO: Add outer loop support.
2559
2560      TODO: Consider assigning different costs to different scalar
2561      statements.  */
2562
2563   /* FORNOW.  */
2564   innerloop_iters = 1;
2565   if (loop->inner)
2566     innerloop_iters = 50; /* FIXME */
2567
2568   for (i = 0; i < nbbs; i++)
2569     {
2570       gimple_stmt_iterator si;
2571       basic_block bb = bbs[i];
2572
2573       if (bb->loop_father == loop->inner)
2574         factor = innerloop_iters;
2575       else
2576         factor = 1;
2577
2578       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2579         {
2580           gimple stmt = gsi_stmt (si);
2581           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2582
2583           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2584             continue;
2585
2586           /* Skip stmts that are not vectorized inside the loop.  */
2587           if (stmt_info
2588               && !STMT_VINFO_RELEVANT_P (stmt_info)
2589               && (!STMT_VINFO_LIVE_P (stmt_info)
2590                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2591               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2592             continue;
2593
2594           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2595             {
2596               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2597                stmt_cost = vect_get_stmt_cost (scalar_load);
2598              else
2599                stmt_cost = vect_get_stmt_cost (scalar_store);
2600             }
2601           else
2602             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2603
2604           scalar_single_iter_cost += stmt_cost * factor;
2605         }
2606     }
2607   return scalar_single_iter_cost;
2608 }
2609
2610 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2611 int
2612 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2613                              int *peel_iters_epilogue,
2614                              int scalar_single_iter_cost,
2615                              stmt_vector_for_cost *prologue_cost_vec,
2616                              stmt_vector_for_cost *epilogue_cost_vec)
2617 {
2618   int retval = 0;
2619   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2620
2621   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2622     {
2623       *peel_iters_epilogue = vf/2;
2624       if (dump_enabled_p ())
2625         dump_printf_loc (MSG_NOTE, vect_location,
2626                          "cost model: epilogue peel iters set to vf/2 "
2627                          "because loop iterations are unknown .\n");
2628
2629       /* If peeled iterations are known but number of scalar loop
2630          iterations are unknown, count a taken branch per peeled loop.  */
2631       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2632                                  NULL, 0, vect_prologue);
2633     }
2634   else
2635     {
2636       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2637       peel_iters_prologue = niters < peel_iters_prologue ?
2638                             niters : peel_iters_prologue;
2639       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2640       /* If we need to peel for gaps, but no peeling is required, we have to
2641          peel VF iterations.  */
2642       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2643         *peel_iters_epilogue = vf;
2644     }
2645
2646   if (peel_iters_prologue)
2647     retval += record_stmt_cost (prologue_cost_vec,
2648                                 peel_iters_prologue * scalar_single_iter_cost,
2649                                 scalar_stmt, NULL, 0, vect_prologue);
2650   if (*peel_iters_epilogue)
2651     retval += record_stmt_cost (epilogue_cost_vec,
2652                                 *peel_iters_epilogue * scalar_single_iter_cost,
2653                                 scalar_stmt, NULL, 0, vect_epilogue);
2654   return retval;
2655 }
2656
2657 /* Function vect_estimate_min_profitable_iters
2658
2659    Return the number of iterations required for the vector version of the
2660    loop to be profitable relative to the cost of the scalar version of the
2661    loop.  */
2662
2663 static void
2664 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2665                                     int *ret_min_profitable_niters,
2666                                     int *ret_min_profitable_estimate)
2667 {
2668   int min_profitable_iters;
2669   int min_profitable_estimate;
2670   int peel_iters_prologue;
2671   int peel_iters_epilogue;
2672   unsigned vec_inside_cost = 0;
2673   int vec_outside_cost = 0;
2674   unsigned vec_prologue_cost = 0;
2675   unsigned vec_epilogue_cost = 0;
2676   int scalar_single_iter_cost = 0;
2677   int scalar_outside_cost = 0;
2678   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2679   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2680   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2681
2682   /* Cost model disabled.  */
2683   if (!flag_vect_cost_model)
2684     {
2685       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2686       *ret_min_profitable_niters = 0;
2687       *ret_min_profitable_estimate = 0;
2688       return;
2689     }
2690
2691   /* Requires loop versioning tests to handle misalignment.  */
2692   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2693     {
2694       /*  FIXME: Make cost depend on complexity of individual check.  */
2695       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2696       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2697                             vect_prologue);
2698       dump_printf (MSG_NOTE,
2699                    "cost model: Adding cost of checks for loop "
2700                    "versioning to treat misalignment.\n");
2701     }
2702
2703   /* Requires loop versioning with alias checks.  */
2704   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2705     {
2706       /*  FIXME: Make cost depend on complexity of individual check.  */
2707       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2708       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2709                             vect_prologue);
2710       dump_printf (MSG_NOTE,
2711                    "cost model: Adding cost of checks for loop "
2712                    "versioning aliasing.\n");
2713     }
2714
2715   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2716       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2717     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2718                           vect_prologue);
2719
2720   /* Count statements in scalar loop.  Using this as scalar cost for a single
2721      iteration for now.
2722
2723      TODO: Add outer loop support.
2724
2725      TODO: Consider assigning different costs to different scalar
2726      statements.  */
2727
2728   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2729
2730   /* Add additional cost for the peeled instructions in prologue and epilogue
2731      loop.
2732
2733      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2734      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2735
2736      TODO: Build an expression that represents peel_iters for prologue and
2737      epilogue to be used in a run-time test.  */
2738
2739   if (npeel  < 0)
2740     {
2741       peel_iters_prologue = vf/2;
2742       dump_printf (MSG_NOTE, "cost model: "
2743                    "prologue peel iters set to vf/2.\n");
2744
2745       /* If peeling for alignment is unknown, loop bound of main loop becomes
2746          unknown.  */
2747       peel_iters_epilogue = vf/2;
2748       dump_printf (MSG_NOTE, "cost model: "
2749                    "epilogue peel iters set to vf/2 because "
2750                    "peeling for alignment is unknown.\n");
2751
2752       /* If peeled iterations are unknown, count a taken branch and a not taken
2753          branch per peeled loop. Even if scalar loop iterations are known,
2754          vector iterations are not known since peeled prologue iterations are
2755          not known. Hence guards remain the same.  */
2756       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2757                             NULL, 0, vect_prologue);
2758       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2759                             NULL, 0, vect_prologue);
2760       /* FORNOW: Don't attempt to pass individual scalar instructions to
2761          the model; just assume linear cost for scalar iterations.  */
2762       (void) add_stmt_cost (target_cost_data,
2763                             peel_iters_prologue * scalar_single_iter_cost,
2764                             scalar_stmt, NULL, 0, vect_prologue);
2765       (void) add_stmt_cost (target_cost_data,
2766                             peel_iters_epilogue * scalar_single_iter_cost,
2767                             scalar_stmt, NULL, 0, vect_epilogue);
2768     }
2769   else
2770     {
2771       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2772       stmt_info_for_cost *si;
2773       int j;
2774       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2775
2776       prologue_cost_vec.create (2);
2777       epilogue_cost_vec.create (2);
2778       peel_iters_prologue = npeel;
2779
2780       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2781                                           &peel_iters_epilogue,
2782                                           scalar_single_iter_cost,
2783                                           &prologue_cost_vec,
2784                                           &epilogue_cost_vec);
2785
2786       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2787         {
2788           struct _stmt_vec_info *stmt_info
2789             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2790           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2791                                 si->misalign, vect_prologue);
2792         }
2793
2794       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2795         {
2796           struct _stmt_vec_info *stmt_info
2797             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2798           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2799                                 si->misalign, vect_epilogue);
2800         }
2801
2802       prologue_cost_vec.release ();
2803       epilogue_cost_vec.release ();
2804     }
2805
2806   /* FORNOW: The scalar outside cost is incremented in one of the
2807      following ways:
2808
2809      1. The vectorizer checks for alignment and aliasing and generates
2810      a condition that allows dynamic vectorization.  A cost model
2811      check is ANDED with the versioning condition.  Hence scalar code
2812      path now has the added cost of the versioning check.
2813
2814        if (cost > th & versioning_check)
2815          jmp to vector code
2816
2817      Hence run-time scalar is incremented by not-taken branch cost.
2818
2819      2. The vectorizer then checks if a prologue is required.  If the
2820      cost model check was not done before during versioning, it has to
2821      be done before the prologue check.
2822
2823        if (cost <= th)
2824          prologue = scalar_iters
2825        if (prologue == 0)
2826          jmp to vector code
2827        else
2828          execute prologue
2829        if (prologue == num_iters)
2830          go to exit
2831
2832      Hence the run-time scalar cost is incremented by a taken branch,
2833      plus a not-taken branch, plus a taken branch cost.
2834
2835      3. The vectorizer then checks if an epilogue is required.  If the
2836      cost model check was not done before during prologue check, it
2837      has to be done with the epilogue check.
2838
2839        if (prologue == 0)
2840          jmp to vector code
2841        else
2842          execute prologue
2843        if (prologue == num_iters)
2844          go to exit
2845        vector code:
2846          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2847            jmp to epilogue
2848
2849      Hence the run-time scalar cost should be incremented by 2 taken
2850      branches.
2851
2852      TODO: The back end may reorder the BBS's differently and reverse
2853      conditions/branch directions.  Change the estimates below to
2854      something more reasonable.  */
2855
2856   /* If the number of iterations is known and we do not do versioning, we can
2857      decide whether to vectorize at compile time.  Hence the scalar version
2858      do not carry cost model guard costs.  */
2859   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2860       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2861       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2862     {
2863       /* Cost model check occurs at versioning.  */
2864       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2865           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2866         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2867       else
2868         {
2869           /* Cost model check occurs at prologue generation.  */
2870           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2871             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2872               + vect_get_stmt_cost (cond_branch_not_taken);
2873           /* Cost model check occurs at epilogue generation.  */
2874           else
2875             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2876         }
2877     }
2878
2879   /* Complete the target-specific cost calculations.  */
2880   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2881                &vec_inside_cost, &vec_epilogue_cost);
2882
2883   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2884
2885   /* Calculate number of iterations required to make the vector version
2886      profitable, relative to the loop bodies only.  The following condition
2887      must hold true:
2888      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2889      where
2890      SIC = scalar iteration cost, VIC = vector iteration cost,
2891      VOC = vector outside cost, VF = vectorization factor,
2892      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2893      SOC = scalar outside cost for run time cost model check.  */
2894
2895   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2896     {
2897       if (vec_outside_cost <= 0)
2898         min_profitable_iters = 1;
2899       else
2900         {
2901           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2902                                   - vec_inside_cost * peel_iters_prologue
2903                                   - vec_inside_cost * peel_iters_epilogue)
2904                                  / ((scalar_single_iter_cost * vf)
2905                                     - vec_inside_cost);
2906
2907           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2908               <= (((int) vec_inside_cost * min_profitable_iters)
2909                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2910             min_profitable_iters++;
2911         }
2912     }
2913   /* vector version will never be profitable.  */
2914   else
2915     {
2916       if (dump_enabled_p ())
2917         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2918                          "cost model: the vector iteration cost = %d "
2919                          "divided by the scalar iteration cost = %d "
2920                          "is greater or equal to the vectorization factor = %d"
2921                          ".\n",
2922                          vec_inside_cost, scalar_single_iter_cost, vf);
2923       *ret_min_profitable_niters = -1;
2924       *ret_min_profitable_estimate = -1;
2925       return;
2926     }
2927
2928   if (dump_enabled_p ())
2929     {
2930       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2931       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2932                    vec_inside_cost);
2933       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2934                    vec_prologue_cost);
2935       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2936                    vec_epilogue_cost);
2937       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2938                    scalar_single_iter_cost);
2939       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2940                    scalar_outside_cost);
2941       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2942                    vec_outside_cost);
2943       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2944                    peel_iters_prologue);
2945       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2946                    peel_iters_epilogue);
2947       dump_printf (MSG_NOTE,
2948                    "  Calculated minimum iters for profitability: %d\n",
2949                    min_profitable_iters);
2950       dump_printf (MSG_NOTE, "\n");
2951     }
2952
2953   min_profitable_iters =
2954         min_profitable_iters < vf ? vf : min_profitable_iters;
2955
2956   /* Because the condition we create is:
2957      if (niters <= min_profitable_iters)
2958        then skip the vectorized loop.  */
2959   min_profitable_iters--;
2960
2961   if (dump_enabled_p ())
2962     dump_printf_loc (MSG_NOTE, vect_location,
2963                      "  Runtime profitability threshold = %d\n",
2964                      min_profitable_iters);
2965
2966   *ret_min_profitable_niters = min_profitable_iters;
2967
2968   /* Calculate number of iterations required to make the vector version
2969      profitable, relative to the loop bodies only.
2970
2971      Non-vectorized variant is SIC * niters and it must win over vector
2972      variant on the expected loop trip count.  The following condition must hold true:
2973      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2974
2975   if (vec_outside_cost <= 0)
2976     min_profitable_estimate = 1;
2977   else
2978     {
2979       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2980                                  - vec_inside_cost * peel_iters_prologue
2981                                  - vec_inside_cost * peel_iters_epilogue)
2982                                  / ((scalar_single_iter_cost * vf)
2983                                    - vec_inside_cost);
2984     }
2985   min_profitable_estimate --;
2986   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2987   if (dump_enabled_p ())
2988     dump_printf_loc (MSG_NOTE, vect_location,
2989                      "  Static estimate profitability threshold = %d\n",
2990                       min_profitable_iters);
2991
2992   *ret_min_profitable_estimate = min_profitable_estimate;
2993 }
2994
2995
2996 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2997    functions. Design better to avoid maintenance issues.  */
2998
2999 /* Function vect_model_reduction_cost.
3000
3001    Models cost for a reduction operation, including the vector ops
3002    generated within the strip-mine loop, the initial definition before
3003    the loop, and the epilogue code that must be generated.  */
3004
3005 static bool
3006 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3007                            int ncopies)
3008 {
3009   int prologue_cost = 0, epilogue_cost = 0;
3010   enum tree_code code;
3011   optab optab;
3012   tree vectype;
3013   gimple stmt, orig_stmt;
3014   tree reduction_op;
3015   enum machine_mode mode;
3016   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3017   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3018   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3019
3020   /* Cost of reduction op inside loop.  */
3021   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3022                                         stmt_info, 0, vect_body);
3023   stmt = STMT_VINFO_STMT (stmt_info);
3024
3025   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3026     {
3027     case GIMPLE_SINGLE_RHS:
3028       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3029       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3030       break;
3031     case GIMPLE_UNARY_RHS:
3032       reduction_op = gimple_assign_rhs1 (stmt);
3033       break;
3034     case GIMPLE_BINARY_RHS:
3035       reduction_op = gimple_assign_rhs2 (stmt);
3036       break;
3037     case GIMPLE_TERNARY_RHS:
3038       reduction_op = gimple_assign_rhs3 (stmt);
3039       break;
3040     default:
3041       gcc_unreachable ();
3042     }
3043
3044   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3045   if (!vectype)
3046     {
3047       if (dump_enabled_p ())
3048         {
3049           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3050                            "unsupported data-type ");
3051           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3052                              TREE_TYPE (reduction_op));
3053           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3054         }
3055       return false;
3056    }
3057
3058   mode = TYPE_MODE (vectype);
3059   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3060
3061   if (!orig_stmt)
3062     orig_stmt = STMT_VINFO_STMT (stmt_info);
3063
3064   code = gimple_assign_rhs_code (orig_stmt);
3065
3066   /* Add in cost for initial definition.  */
3067   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3068                                   stmt_info, 0, vect_prologue);
3069
3070   /* Determine cost of epilogue code.
3071
3072      We have a reduction operator that will reduce the vector in one statement.
3073      Also requires scalar extract.  */
3074
3075   if (!nested_in_vect_loop_p (loop, orig_stmt))
3076     {
3077       if (reduc_code != ERROR_MARK)
3078         {
3079           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3080                                           stmt_info, 0, vect_epilogue);
3081           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3082                                           stmt_info, 0, vect_epilogue);
3083         }
3084       else
3085         {
3086           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3087           tree bitsize =
3088             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3089           int element_bitsize = tree_low_cst (bitsize, 1);
3090           int nelements = vec_size_in_bits / element_bitsize;
3091
3092           optab = optab_for_tree_code (code, vectype, optab_default);
3093
3094           /* We have a whole vector shift available.  */
3095           if (VECTOR_MODE_P (mode)
3096               && optab_handler (optab, mode) != CODE_FOR_nothing
3097               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3098             {
3099               /* Final reduction via vector shifts and the reduction operator.
3100                  Also requires scalar extract.  */
3101               epilogue_cost += add_stmt_cost (target_cost_data,
3102                                               exact_log2 (nelements) * 2,
3103                                               vector_stmt, stmt_info, 0,
3104                                               vect_epilogue);
3105               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3106                                               vec_to_scalar, stmt_info, 0,
3107                                               vect_epilogue);
3108             }
3109           else
3110             /* Use extracts and reduction op for final reduction.  For N
3111                elements, we have N extracts and N-1 reduction ops.  */
3112             epilogue_cost += add_stmt_cost (target_cost_data,
3113                                             nelements + nelements - 1,
3114                                             vector_stmt, stmt_info, 0,
3115                                             vect_epilogue);
3116         }
3117     }
3118
3119   if (dump_enabled_p ())
3120     dump_printf (MSG_NOTE,
3121                  "vect_model_reduction_cost: inside_cost = %d, "
3122                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3123                  prologue_cost, epilogue_cost);
3124
3125   return true;
3126 }
3127
3128
3129 /* Function vect_model_induction_cost.
3130
3131    Models cost for induction operations.  */
3132
3133 static void
3134 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3135 {
3136   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3137   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3138   unsigned inside_cost, prologue_cost;
3139
3140   /* loop cost for vec_loop.  */
3141   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3142                                stmt_info, 0, vect_body);
3143
3144   /* prologue cost for vec_init and vec_step.  */
3145   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3146                                  stmt_info, 0, vect_prologue);
3147
3148   if (dump_enabled_p ())
3149     dump_printf_loc (MSG_NOTE, vect_location,
3150                      "vect_model_induction_cost: inside_cost = %d, "
3151                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3152 }
3153
3154
3155 /* Function get_initial_def_for_induction
3156
3157    Input:
3158    STMT - a stmt that performs an induction operation in the loop.
3159    IV_PHI - the initial value of the induction variable
3160
3161    Output:
3162    Return a vector variable, initialized with the first VF values of
3163    the induction variable.  E.g., for an iv with IV_PHI='X' and
3164    evolution S, for a vector of 4 units, we want to return:
3165    [X, X + S, X + 2*S, X + 3*S].  */
3166
3167 static tree
3168 get_initial_def_for_induction (gimple iv_phi)
3169 {
3170   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3171   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3173   tree vectype;
3174   int nunits;
3175   edge pe = loop_preheader_edge (loop);
3176   struct loop *iv_loop;
3177   basic_block new_bb;
3178   tree new_vec, vec_init, vec_step, t;
3179   tree access_fn;
3180   tree new_var;
3181   tree new_name;
3182   gimple init_stmt, induction_phi, new_stmt;
3183   tree induc_def, vec_def, vec_dest;
3184   tree init_expr, step_expr;
3185   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3186   int i;
3187   bool ok;
3188   int ncopies;
3189   tree expr;
3190   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3191   bool nested_in_vect_loop = false;
3192   gimple_seq stmts = NULL;
3193   imm_use_iterator imm_iter;
3194   use_operand_p use_p;
3195   gimple exit_phi;
3196   edge latch_e;
3197   tree loop_arg;
3198   gimple_stmt_iterator si;
3199   basic_block bb = gimple_bb (iv_phi);
3200   tree stepvectype;
3201   tree resvectype;
3202
3203   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3204   if (nested_in_vect_loop_p (loop, iv_phi))
3205     {
3206       nested_in_vect_loop = true;
3207       iv_loop = loop->inner;
3208     }
3209   else
3210     iv_loop = loop;
3211   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3212
3213   latch_e = loop_latch_edge (iv_loop);
3214   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3215
3216   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3217   gcc_assert (access_fn);
3218   STRIP_NOPS (access_fn);
3219   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3220                                     &init_expr, &step_expr);
3221   gcc_assert (ok);
3222   pe = loop_preheader_edge (iv_loop);
3223
3224   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3225   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3226   gcc_assert (vectype);
3227   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3228   ncopies = vf / nunits;
3229
3230   gcc_assert (phi_info);
3231   gcc_assert (ncopies >= 1);
3232
3233   /* Find the first insertion point in the BB.  */
3234   si = gsi_after_labels (bb);
3235
3236   /* Create the vector that holds the initial_value of the induction.  */
3237   if (nested_in_vect_loop)
3238     {
3239       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3240          been created during vectorization of previous stmts.  We obtain it
3241          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3242       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3243                                            loop_preheader_edge (iv_loop));
3244       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3245       /* If the initial value is not of proper type, convert it.  */
3246       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3247         {
3248           new_stmt = gimple_build_assign_with_ops
3249               (VIEW_CONVERT_EXPR,
3250                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3251                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3252           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3253           gimple_assign_set_lhs (new_stmt, vec_init);
3254           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3255                                                  new_stmt);
3256           gcc_assert (!new_bb);
3257           set_vinfo_for_stmt (new_stmt,
3258                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3259         }
3260     }
3261   else
3262     {
3263       vec<constructor_elt, va_gc> *v;
3264
3265       /* iv_loop is the loop to be vectorized. Create:
3266          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3267       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3268                                        vect_scalar_var, "var_");
3269       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3270                                                      init_expr),
3271                                        &stmts, false, new_var);
3272       if (stmts)
3273         {
3274           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3275           gcc_assert (!new_bb);
3276         }
3277
3278       vec_alloc (v, nunits);
3279       bool constant_p = is_gimple_min_invariant (new_name);
3280       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3281       for (i = 1; i < nunits; i++)
3282         {
3283           /* Create: new_name_i = new_name + step_expr  */
3284           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3285                                   new_name, step_expr);
3286           if (!is_gimple_min_invariant (new_name))
3287             {
3288               init_stmt = gimple_build_assign (new_var, new_name);
3289               new_name = make_ssa_name (new_var, init_stmt);
3290               gimple_assign_set_lhs (init_stmt, new_name);
3291               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3292               gcc_assert (!new_bb);
3293               if (dump_enabled_p ())
3294                 {
3295                   dump_printf_loc (MSG_NOTE, vect_location,
3296                                    "created new init_stmt: ");
3297                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3298                   dump_printf (MSG_NOTE, "\n");
3299                 }
3300               constant_p = false;
3301             }
3302           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3303         }
3304       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3305       if (constant_p)
3306         new_vec = build_vector_from_ctor (vectype, v);
3307       else
3308         new_vec = build_constructor (vectype, v);
3309       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3310     }
3311
3312
3313   /* Create the vector that holds the step of the induction.  */
3314   if (nested_in_vect_loop)
3315     /* iv_loop is nested in the loop to be vectorized. Generate:
3316        vec_step = [S, S, S, S]  */
3317     new_name = step_expr;
3318   else
3319     {
3320       /* iv_loop is the loop to be vectorized. Generate:
3321           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3322       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3323         {
3324           expr = build_int_cst (integer_type_node, vf);
3325           expr = fold_convert (TREE_TYPE (step_expr), expr);
3326         }
3327       else
3328         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3329       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3330                               expr, step_expr);
3331       if (TREE_CODE (step_expr) == SSA_NAME)
3332         new_name = vect_init_vector (iv_phi, new_name,
3333                                      TREE_TYPE (step_expr), NULL);
3334     }
3335
3336   t = unshare_expr (new_name);
3337   gcc_assert (CONSTANT_CLASS_P (new_name)
3338               || TREE_CODE (new_name) == SSA_NAME);
3339   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3340   gcc_assert (stepvectype);
3341   new_vec = build_vector_from_val (stepvectype, t);
3342   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3343
3344
3345   /* Create the following def-use cycle:
3346      loop prolog:
3347          vec_init = ...
3348          vec_step = ...
3349      loop:
3350          vec_iv = PHI <vec_init, vec_loop>
3351          ...
3352          STMT
3353          ...
3354          vec_loop = vec_iv + vec_step;  */
3355
3356   /* Create the induction-phi that defines the induction-operand.  */
3357   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3358   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3359   set_vinfo_for_stmt (induction_phi,
3360                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3361   induc_def = PHI_RESULT (induction_phi);
3362
3363   /* Create the iv update inside the loop  */
3364   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3365                                            induc_def, vec_step);
3366   vec_def = make_ssa_name (vec_dest, new_stmt);
3367   gimple_assign_set_lhs (new_stmt, vec_def);
3368   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3369   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3370                                                    NULL));
3371
3372   /* Set the arguments of the phi node:  */
3373   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3374   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3375                UNKNOWN_LOCATION);
3376
3377
3378   /* In case that vectorization factor (VF) is bigger than the number
3379      of elements that we can fit in a vectype (nunits), we have to generate
3380      more than one vector stmt - i.e - we need to "unroll" the
3381      vector stmt by a factor VF/nunits.  For more details see documentation
3382      in vectorizable_operation.  */
3383
3384   if (ncopies > 1)
3385     {
3386       stmt_vec_info prev_stmt_vinfo;
3387       /* FORNOW. This restriction should be relaxed.  */
3388       gcc_assert (!nested_in_vect_loop);
3389
3390       /* Create the vector that holds the step of the induction.  */
3391       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3392         {
3393           expr = build_int_cst (integer_type_node, nunits);
3394           expr = fold_convert (TREE_TYPE (step_expr), expr);
3395         }
3396       else
3397         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3398       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3399                               expr, step_expr);
3400       if (TREE_CODE (step_expr) == SSA_NAME)
3401         new_name = vect_init_vector (iv_phi, new_name,
3402                                      TREE_TYPE (step_expr), NULL);
3403       t = unshare_expr (new_name);
3404       gcc_assert (CONSTANT_CLASS_P (new_name)
3405                   || TREE_CODE (new_name) == SSA_NAME);
3406       new_vec = build_vector_from_val (stepvectype, t);
3407       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3408
3409       vec_def = induc_def;
3410       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3411       for (i = 1; i < ncopies; i++)
3412         {
3413           /* vec_i = vec_prev + vec_step  */
3414           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3415                                                    vec_def, vec_step);
3416           vec_def = make_ssa_name (vec_dest, new_stmt);
3417           gimple_assign_set_lhs (new_stmt, vec_def);
3418
3419           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3420           if (!useless_type_conversion_p (resvectype, vectype))
3421             {
3422               new_stmt = gimple_build_assign_with_ops
3423                   (VIEW_CONVERT_EXPR,
3424                    vect_get_new_vect_var (resvectype, vect_simple_var,
3425                                           "vec_iv_"),
3426                    build1 (VIEW_CONVERT_EXPR, resvectype,
3427                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3428               gimple_assign_set_lhs (new_stmt,
3429                                      make_ssa_name
3430                                        (gimple_assign_lhs (new_stmt), new_stmt));
3431               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3432             }
3433           set_vinfo_for_stmt (new_stmt,
3434                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3435           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3436           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3437         }
3438     }
3439
3440   if (nested_in_vect_loop)
3441     {
3442       /* Find the loop-closed exit-phi of the induction, and record
3443          the final vector of induction results:  */
3444       exit_phi = NULL;
3445       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3446         {
3447           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3448             {
3449               exit_phi = USE_STMT (use_p);
3450               break;
3451             }
3452         }
3453       if (exit_phi)
3454         {
3455           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3456           /* FORNOW. Currently not supporting the case that an inner-loop induction
3457              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3458           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3459                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3460
3461           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3462           if (dump_enabled_p ())
3463             {
3464               dump_printf_loc (MSG_NOTE, vect_location,
3465                                "vector of inductions after inner-loop:");
3466               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3467               dump_printf (MSG_NOTE, "\n");
3468             }
3469         }
3470     }
3471
3472
3473   if (dump_enabled_p ())
3474     {
3475       dump_printf_loc (MSG_NOTE, vect_location,
3476                        "transform induction: created def-use cycle: ");
3477       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3478       dump_printf (MSG_NOTE, "\n");
3479       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3480                         SSA_NAME_DEF_STMT (vec_def), 0);
3481       dump_printf (MSG_NOTE, "\n");
3482     }
3483
3484   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3485   if (!useless_type_conversion_p (resvectype, vectype))
3486     {
3487       new_stmt = gimple_build_assign_with_ops
3488          (VIEW_CONVERT_EXPR,
3489           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3490           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3491       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3492       gimple_assign_set_lhs (new_stmt, induc_def);
3493       si = gsi_after_labels (bb);
3494       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3495       set_vinfo_for_stmt (new_stmt,
3496                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3497       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3498         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3499     }
3500
3501   return induc_def;
3502 }
3503
3504
3505 /* Function get_initial_def_for_reduction
3506
3507    Input:
3508    STMT - a stmt that performs a reduction operation in the loop.
3509    INIT_VAL - the initial value of the reduction variable
3510
3511    Output:
3512    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3513         of the reduction (used for adjusting the epilog - see below).
3514    Return a vector variable, initialized according to the operation that STMT
3515         performs. This vector will be used as the initial value of the
3516         vector of partial results.
3517
3518    Option1 (adjust in epilog): Initialize the vector as follows:
3519      add/bit or/xor:    [0,0,...,0,0]
3520      mult/bit and:      [1,1,...,1,1]
3521      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3522    and when necessary (e.g. add/mult case) let the caller know
3523    that it needs to adjust the result by init_val.
3524
3525    Option2: Initialize the vector as follows:
3526      add/bit or/xor:    [init_val,0,0,...,0]
3527      mult/bit and:      [init_val,1,1,...,1]
3528      min/max/cond_expr: [init_val,init_val,...,init_val]
3529    and no adjustments are needed.
3530
3531    For example, for the following code:
3532
3533    s = init_val;
3534    for (i=0;i<n;i++)
3535      s = s + a[i];
3536
3537    STMT is 's = s + a[i]', and the reduction variable is 's'.
3538    For a vector of 4 units, we want to return either [0,0,0,init_val],
3539    or [0,0,0,0] and let the caller know that it needs to adjust
3540    the result at the end by 'init_val'.
3541
3542    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3543    initialization vector is simpler (same element in all entries), if
3544    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3545
3546    A cost model should help decide between these two schemes.  */
3547
3548 tree
3549 get_initial_def_for_reduction (gimple stmt, tree init_val,
3550                                tree *adjustment_def)
3551 {
3552   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3553   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3554   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3555   tree scalar_type = TREE_TYPE (init_val);
3556   tree vectype = get_vectype_for_scalar_type (scalar_type);
3557   int nunits;
3558   enum tree_code code = gimple_assign_rhs_code (stmt);
3559   tree def_for_init;
3560   tree init_def;
3561   tree *elts;
3562   int i;
3563   bool nested_in_vect_loop = false;
3564   tree init_value;
3565   REAL_VALUE_TYPE real_init_val = dconst0;
3566   int int_init_val = 0;
3567   gimple def_stmt = NULL;
3568
3569   gcc_assert (vectype);
3570   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3571
3572   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3573               || SCALAR_FLOAT_TYPE_P (scalar_type));
3574
3575   if (nested_in_vect_loop_p (loop, stmt))
3576     nested_in_vect_loop = true;
3577   else
3578     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3579
3580   /* In case of double reduction we only create a vector variable to be put
3581      in the reduction phi node.  The actual statement creation is done in
3582      vect_create_epilog_for_reduction.  */
3583   if (adjustment_def && nested_in_vect_loop
3584       && TREE_CODE (init_val) == SSA_NAME
3585       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3586       && gimple_code (def_stmt) == GIMPLE_PHI
3587       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3588       && vinfo_for_stmt (def_stmt)
3589       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3590           == vect_double_reduction_def)
3591     {
3592       *adjustment_def = NULL;
3593       return vect_create_destination_var (init_val, vectype);
3594     }
3595
3596   if (TREE_CONSTANT (init_val))
3597     {
3598       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3599         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3600       else
3601         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3602     }
3603   else
3604     init_value = init_val;
3605
3606   switch (code)
3607     {
3608       case WIDEN_SUM_EXPR:
3609       case DOT_PROD_EXPR:
3610       case PLUS_EXPR:
3611       case MINUS_EXPR:
3612       case BIT_IOR_EXPR:
3613       case BIT_XOR_EXPR:
3614       case MULT_EXPR:
3615       case BIT_AND_EXPR:
3616         /* ADJUSMENT_DEF is NULL when called from
3617            vect_create_epilog_for_reduction to vectorize double reduction.  */
3618         if (adjustment_def)
3619           {
3620             if (nested_in_vect_loop)
3621               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3622                                                               NULL);
3623             else
3624               *adjustment_def = init_val;
3625           }
3626
3627         if (code == MULT_EXPR)
3628           {
3629             real_init_val = dconst1;
3630             int_init_val = 1;
3631           }
3632
3633         if (code == BIT_AND_EXPR)
3634           int_init_val = -1;
3635
3636         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3637           def_for_init = build_real (scalar_type, real_init_val);
3638         else
3639           def_for_init = build_int_cst (scalar_type, int_init_val);
3640
3641         /* Create a vector of '0' or '1' except the first element.  */
3642         elts = XALLOCAVEC (tree, nunits);
3643         for (i = nunits - 2; i >= 0; --i)
3644           elts[i + 1] = def_for_init;
3645
3646         /* Option1: the first element is '0' or '1' as well.  */
3647         if (adjustment_def)
3648           {
3649             elts[0] = def_for_init;
3650             init_def = build_vector (vectype, elts);
3651             break;
3652           }
3653
3654         /* Option2: the first element is INIT_VAL.  */
3655         elts[0] = init_val;
3656         if (TREE_CONSTANT (init_val))
3657           init_def = build_vector (vectype, elts);
3658         else
3659           {
3660             vec<constructor_elt, va_gc> *v;
3661             vec_alloc (v, nunits);
3662             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3663             for (i = 1; i < nunits; ++i)
3664               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3665             init_def = build_constructor (vectype, v);
3666           }
3667
3668         break;
3669
3670       case MIN_EXPR:
3671       case MAX_EXPR:
3672       case COND_EXPR:
3673         if (adjustment_def)
3674           {
3675             *adjustment_def = NULL_TREE;
3676             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3677             break;
3678           }
3679
3680         init_def = build_vector_from_val (vectype, init_value);
3681         break;
3682
3683       default:
3684         gcc_unreachable ();
3685     }
3686
3687   return init_def;
3688 }
3689
3690
3691 /* Function vect_create_epilog_for_reduction
3692
3693    Create code at the loop-epilog to finalize the result of a reduction
3694    computation.
3695
3696    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3697      reduction statements.
3698    STMT is the scalar reduction stmt that is being vectorized.
3699    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3700      number of elements that we can fit in a vectype (nunits).  In this case
3701      we have to generate more than one vector stmt - i.e - we need to "unroll"
3702      the vector stmt by a factor VF/nunits.  For more details see documentation
3703      in vectorizable_operation.
3704    REDUC_CODE is the tree-code for the epilog reduction.
3705    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3706      computation.
3707    REDUC_INDEX is the index of the operand in the right hand side of the
3708      statement that is defined by REDUCTION_PHI.
3709    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3710    SLP_NODE is an SLP node containing a group of reduction statements. The
3711      first one in this group is STMT.
3712
3713    This function:
3714    1. Creates the reduction def-use cycles: sets the arguments for
3715       REDUCTION_PHIS:
3716       The loop-entry argument is the vectorized initial-value of the reduction.
3717       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3718       sums.
3719    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3720       by applying the operation specified by REDUC_CODE if available, or by
3721       other means (whole-vector shifts or a scalar loop).
3722       The function also creates a new phi node at the loop exit to preserve
3723       loop-closed form, as illustrated below.
3724
3725      The flow at the entry to this function:
3726
3727         loop:
3728           vec_def = phi <null, null>            # REDUCTION_PHI
3729           VECT_DEF = vector_stmt                # vectorized form of STMT
3730           s_loop = scalar_stmt                  # (scalar) STMT
3731         loop_exit:
3732           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3733           use <s_out0>
3734           use <s_out0>
3735
3736      The above is transformed by this function into:
3737
3738         loop:
3739           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3740           VECT_DEF = vector_stmt                # vectorized form of STMT
3741           s_loop = scalar_stmt                  # (scalar) STMT
3742         loop_exit:
3743           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3744           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3745           v_out2 = reduce <v_out1>
3746           s_out3 = extract_field <v_out2, 0>
3747           s_out4 = adjust_result <s_out3>
3748           use <s_out4>
3749           use <s_out4>
3750 */
3751
3752 static void
3753 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3754                                   int ncopies, enum tree_code reduc_code,
3755                                   vec<gimple> reduction_phis,
3756                                   int reduc_index, bool double_reduc,
3757                                   slp_tree slp_node)
3758 {
3759   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3760   stmt_vec_info prev_phi_info;
3761   tree vectype;
3762   enum machine_mode mode;
3763   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3764   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3765   basic_block exit_bb;
3766   tree scalar_dest;
3767   tree scalar_type;
3768   gimple new_phi = NULL, phi;
3769   gimple_stmt_iterator exit_gsi;
3770   tree vec_dest;
3771   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3772   gimple epilog_stmt = NULL;
3773   enum tree_code code = gimple_assign_rhs_code (stmt);
3774   gimple exit_phi;
3775   tree bitsize, bitpos;
3776   tree adjustment_def = NULL;
3777   tree vec_initial_def = NULL;
3778   tree reduction_op, expr, def;
3779   tree orig_name, scalar_result;
3780   imm_use_iterator imm_iter, phi_imm_iter;
3781   use_operand_p use_p, phi_use_p;
3782   bool extract_scalar_result = false;
3783   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3784   bool nested_in_vect_loop = false;
3785   vec<gimple> new_phis = vNULL;
3786   vec<gimple> inner_phis = vNULL;
3787   enum vect_def_type dt = vect_unknown_def_type;
3788   int j, i;
3789   vec<tree> scalar_results = vNULL;
3790   unsigned int group_size = 1, k, ratio;
3791   vec<tree> vec_initial_defs = vNULL;
3792   vec<gimple> phis;
3793   bool slp_reduc = false;
3794   tree new_phi_result;
3795   gimple inner_phi = NULL;
3796
3797   if (slp_node)
3798     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3799
3800   if (nested_in_vect_loop_p (loop, stmt))
3801     {
3802       outer_loop = loop;
3803       loop = loop->inner;
3804       nested_in_vect_loop = true;
3805       gcc_assert (!slp_node);
3806     }
3807
3808   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3809     {
3810     case GIMPLE_SINGLE_RHS:
3811       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3812                   == ternary_op);
3813       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3814       break;
3815     case GIMPLE_UNARY_RHS:
3816       reduction_op = gimple_assign_rhs1 (stmt);
3817       break;
3818     case GIMPLE_BINARY_RHS:
3819       reduction_op = reduc_index ?
3820                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3821       break;
3822     case GIMPLE_TERNARY_RHS:
3823       reduction_op = gimple_op (stmt, reduc_index + 1);
3824       break;
3825     default:
3826       gcc_unreachable ();
3827     }
3828
3829   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3830   gcc_assert (vectype);
3831   mode = TYPE_MODE (vectype);
3832
3833   /* 1. Create the reduction def-use cycle:
3834      Set the arguments of REDUCTION_PHIS, i.e., transform
3835
3836         loop:
3837           vec_def = phi <null, null>            # REDUCTION_PHI
3838           VECT_DEF = vector_stmt                # vectorized form of STMT
3839           ...
3840
3841      into:
3842
3843         loop:
3844           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3845           VECT_DEF = vector_stmt                # vectorized form of STMT
3846           ...
3847
3848      (in case of SLP, do it for all the phis). */
3849
3850   /* Get the loop-entry arguments.  */
3851   if (slp_node)
3852     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3853                        NULL, slp_node, reduc_index);
3854   else
3855     {
3856       vec_initial_defs.create (1);
3857      /* For the case of reduction, vect_get_vec_def_for_operand returns
3858         the scalar def before the loop, that defines the initial value
3859         of the reduction variable.  */
3860       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3861                                                       &adjustment_def);
3862       vec_initial_defs.quick_push (vec_initial_def);
3863     }
3864
3865   /* Set phi nodes arguments.  */
3866   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3867     {
3868       tree vec_init_def = vec_initial_defs[i];
3869       tree def = vect_defs[i];
3870       for (j = 0; j < ncopies; j++)
3871         {
3872           /* Set the loop-entry arg of the reduction-phi.  */
3873           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3874                        UNKNOWN_LOCATION);
3875
3876           /* Set the loop-latch arg for the reduction-phi.  */
3877           if (j > 0)
3878             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3879
3880           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3881
3882           if (dump_enabled_p ())
3883             {
3884               dump_printf_loc (MSG_NOTE, vect_location,
3885                                "transform reduction: created def-use cycle: ");
3886               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3887               dump_printf (MSG_NOTE, "\n");
3888               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3889               dump_printf (MSG_NOTE, "\n");
3890             }
3891
3892           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3893         }
3894     }
3895
3896   vec_initial_defs.release ();
3897
3898   /* 2. Create epilog code.
3899         The reduction epilog code operates across the elements of the vector
3900         of partial results computed by the vectorized loop.
3901         The reduction epilog code consists of:
3902
3903         step 1: compute the scalar result in a vector (v_out2)
3904         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3905         step 3: adjust the scalar result (s_out3) if needed.
3906
3907         Step 1 can be accomplished using one the following three schemes:
3908           (scheme 1) using reduc_code, if available.
3909           (scheme 2) using whole-vector shifts, if available.
3910           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3911                      combined.
3912
3913           The overall epilog code looks like this:
3914
3915           s_out0 = phi <s_loop>         # original EXIT_PHI
3916           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3917           v_out2 = reduce <v_out1>              # step 1
3918           s_out3 = extract_field <v_out2, 0>    # step 2
3919           s_out4 = adjust_result <s_out3>       # step 3
3920
3921           (step 3 is optional, and steps 1 and 2 may be combined).
3922           Lastly, the uses of s_out0 are replaced by s_out4.  */
3923
3924
3925   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3926          v_out1 = phi <VECT_DEF>
3927          Store them in NEW_PHIS.  */
3928
3929   exit_bb = single_exit (loop)->dest;
3930   prev_phi_info = NULL;
3931   new_phis.create (vect_defs.length ());
3932   FOR_EACH_VEC_ELT (vect_defs, i, def)
3933     {
3934       for (j = 0; j < ncopies; j++)
3935         {
3936           tree new_def = copy_ssa_name (def, NULL);
3937           phi = create_phi_node (new_def, exit_bb);
3938           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3939           if (j == 0)
3940             new_phis.quick_push (phi);
3941           else
3942             {
3943               def = vect_get_vec_def_for_stmt_copy (dt, def);
3944               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3945             }
3946
3947           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3948           prev_phi_info = vinfo_for_stmt (phi);
3949         }
3950     }
3951
3952   /* The epilogue is created for the outer-loop, i.e., for the loop being
3953      vectorized.  Create exit phis for the outer loop.  */
3954   if (double_reduc)
3955     {
3956       loop = outer_loop;
3957       exit_bb = single_exit (loop)->dest;
3958       inner_phis.create (vect_defs.length ());
3959       FOR_EACH_VEC_ELT (new_phis, i, phi)
3960         {
3961           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3962           gimple outer_phi = create_phi_node (new_result, exit_bb);
3963           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3964                            PHI_RESULT (phi));
3965           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3966                                                             loop_vinfo, NULL));
3967           inner_phis.quick_push (phi);
3968           new_phis[i] = outer_phi;
3969           prev_phi_info = vinfo_for_stmt (outer_phi);
3970           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3971             {
3972               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3973               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3974               outer_phi = create_phi_node (new_result, exit_bb);
3975               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3976                                PHI_RESULT (phi));
3977               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3978                                                         loop_vinfo, NULL));
3979               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3980               prev_phi_info = vinfo_for_stmt (outer_phi);
3981             }
3982         }
3983     }
3984
3985   exit_gsi = gsi_after_labels (exit_bb);
3986
3987   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3988          (i.e. when reduc_code is not available) and in the final adjustment
3989          code (if needed).  Also get the original scalar reduction variable as
3990          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3991          represents a reduction pattern), the tree-code and scalar-def are
3992          taken from the original stmt that the pattern-stmt (STMT) replaces.
3993          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3994          are taken from STMT.  */
3995
3996   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3997   if (!orig_stmt)
3998     {
3999       /* Regular reduction  */
4000       orig_stmt = stmt;
4001     }
4002   else
4003     {
4004       /* Reduction pattern  */
4005       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4006       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4007       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4008     }
4009
4010   code = gimple_assign_rhs_code (orig_stmt);
4011   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4012      partial results are added and not subtracted.  */
4013   if (code == MINUS_EXPR)
4014     code = PLUS_EXPR;
4015
4016   scalar_dest = gimple_assign_lhs (orig_stmt);
4017   scalar_type = TREE_TYPE (scalar_dest);
4018   scalar_results.create (group_size);
4019   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4020   bitsize = TYPE_SIZE (scalar_type);
4021
4022   /* In case this is a reduction in an inner-loop while vectorizing an outer
4023      loop - we don't need to extract a single scalar result at the end of the
4024      inner-loop (unless it is double reduction, i.e., the use of reduction is
4025      outside the outer-loop).  The final vector of partial results will be used
4026      in the vectorized outer-loop, or reduced to a scalar result at the end of
4027      the outer-loop.  */
4028   if (nested_in_vect_loop && !double_reduc)
4029     goto vect_finalize_reduction;
4030
4031   /* SLP reduction without reduction chain, e.g.,
4032      # a1 = phi <a2, a0>
4033      # b1 = phi <b2, b0>
4034      a2 = operation (a1)
4035      b2 = operation (b1)  */
4036   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4037
4038   /* In case of reduction chain, e.g.,
4039      # a1 = phi <a3, a0>
4040      a2 = operation (a1)
4041      a3 = operation (a2),
4042
4043      we may end up with more than one vector result.  Here we reduce them to
4044      one vector.  */
4045   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4046     {
4047       tree first_vect = PHI_RESULT (new_phis[0]);
4048       tree tmp;
4049       gimple new_vec_stmt = NULL;
4050
4051       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4052       for (k = 1; k < new_phis.length (); k++)
4053         {
4054           gimple next_phi = new_phis[k];
4055           tree second_vect = PHI_RESULT (next_phi);
4056
4057           tmp = build2 (code, vectype,  first_vect, second_vect);
4058           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4059           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4060           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4061           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4062         }
4063
4064       new_phi_result = first_vect;
4065       if (new_vec_stmt)
4066         {
4067           new_phis.truncate (0);
4068           new_phis.safe_push (new_vec_stmt);
4069         }
4070     }
4071   else
4072     new_phi_result = PHI_RESULT (new_phis[0]);
4073
4074   /* 2.3 Create the reduction code, using one of the three schemes described
4075          above. In SLP we simply need to extract all the elements from the
4076          vector (without reducing them), so we use scalar shifts.  */
4077   if (reduc_code != ERROR_MARK && !slp_reduc)
4078     {
4079       tree tmp;
4080
4081       /*** Case 1:  Create:
4082            v_out2 = reduc_expr <v_out1>  */
4083
4084       if (dump_enabled_p ())
4085         dump_printf_loc (MSG_NOTE, vect_location,
4086                          "Reduce using direct vector reduction.\n");
4087
4088       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4089       tmp = build1 (reduc_code, vectype, new_phi_result);
4090       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4091       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4092       gimple_assign_set_lhs (epilog_stmt, new_temp);
4093       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4094
4095       extract_scalar_result = true;
4096     }
4097   else
4098     {
4099       enum tree_code shift_code = ERROR_MARK;
4100       bool have_whole_vector_shift = true;
4101       int bit_offset;
4102       int element_bitsize = tree_low_cst (bitsize, 1);
4103       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4104       tree vec_temp;
4105
4106       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4107         shift_code = VEC_RSHIFT_EXPR;
4108       else
4109         have_whole_vector_shift = false;
4110
4111       /* Regardless of whether we have a whole vector shift, if we're
4112          emulating the operation via tree-vect-generic, we don't want
4113          to use it.  Only the first round of the reduction is likely
4114          to still be profitable via emulation.  */
4115       /* ??? It might be better to emit a reduction tree code here, so that
4116          tree-vect-generic can expand the first round via bit tricks.  */
4117       if (!VECTOR_MODE_P (mode))
4118         have_whole_vector_shift = false;
4119       else
4120         {
4121           optab optab = optab_for_tree_code (code, vectype, optab_default);
4122           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4123             have_whole_vector_shift = false;
4124         }
4125
4126       if (have_whole_vector_shift && !slp_reduc)
4127         {
4128           /*** Case 2: Create:
4129              for (offset = VS/2; offset >= element_size; offset/=2)
4130                 {
4131                   Create:  va' = vec_shift <va, offset>
4132                   Create:  va = vop <va, va'>
4133                 }  */
4134
4135           if (dump_enabled_p ())
4136             dump_printf_loc (MSG_NOTE, vect_location,
4137                              "Reduce using vector shifts\n");
4138
4139           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4140           new_temp = new_phi_result;
4141           for (bit_offset = vec_size_in_bits/2;
4142                bit_offset >= element_bitsize;
4143                bit_offset /= 2)
4144             {
4145               tree bitpos = size_int (bit_offset);
4146
4147               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4148                                                vec_dest, new_temp, bitpos);
4149               new_name = make_ssa_name (vec_dest, epilog_stmt);
4150               gimple_assign_set_lhs (epilog_stmt, new_name);
4151               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4152
4153               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4154                                                           new_name, new_temp);
4155               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4156               gimple_assign_set_lhs (epilog_stmt, new_temp);
4157               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4158             }
4159
4160           extract_scalar_result = true;
4161         }
4162       else
4163         {
4164           tree rhs;
4165
4166           /*** Case 3: Create:
4167              s = extract_field <v_out2, 0>
4168              for (offset = element_size;
4169                   offset < vector_size;
4170                   offset += element_size;)
4171                {
4172                  Create:  s' = extract_field <v_out2, offset>
4173                  Create:  s = op <s, s'>  // For non SLP cases
4174                }  */
4175
4176           if (dump_enabled_p ())
4177             dump_printf_loc (MSG_NOTE, vect_location,
4178                              "Reduce using scalar code.\n");
4179
4180           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4181           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4182             {
4183               if (gimple_code (new_phi) == GIMPLE_PHI)
4184                 vec_temp = PHI_RESULT (new_phi);
4185               else
4186                 vec_temp = gimple_assign_lhs (new_phi);
4187               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4188                             bitsize_zero_node);
4189               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4190               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4191               gimple_assign_set_lhs (epilog_stmt, new_temp);
4192               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4193
4194               /* In SLP we don't need to apply reduction operation, so we just
4195                  collect s' values in SCALAR_RESULTS.  */
4196               if (slp_reduc)
4197                 scalar_results.safe_push (new_temp);
4198
4199               for (bit_offset = element_bitsize;
4200                    bit_offset < vec_size_in_bits;
4201                    bit_offset += element_bitsize)
4202                 {
4203                   tree bitpos = bitsize_int (bit_offset);
4204                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4205                                      bitsize, bitpos);
4206
4207                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4208                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4209                   gimple_assign_set_lhs (epilog_stmt, new_name);
4210                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4211
4212                   if (slp_reduc)
4213                     {
4214                       /* In SLP we don't need to apply reduction operation, so
4215                          we just collect s' values in SCALAR_RESULTS.  */
4216                       new_temp = new_name;
4217                       scalar_results.safe_push (new_name);
4218                     }
4219                   else
4220                     {
4221                       epilog_stmt = gimple_build_assign_with_ops (code,
4222                                           new_scalar_dest, new_name, new_temp);
4223                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4224                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4225                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4226                     }
4227                 }
4228             }
4229
4230           /* The only case where we need to reduce scalar results in SLP, is
4231              unrolling.  If the size of SCALAR_RESULTS is greater than
4232              GROUP_SIZE, we reduce them combining elements modulo
4233              GROUP_SIZE.  */
4234           if (slp_reduc)
4235             {
4236               tree res, first_res, new_res;
4237               gimple new_stmt;
4238
4239               /* Reduce multiple scalar results in case of SLP unrolling.  */
4240               for (j = group_size; scalar_results.iterate (j, &res);
4241                    j++)
4242                 {
4243                   first_res = scalar_results[j % group_size];
4244                   new_stmt = gimple_build_assign_with_ops (code,
4245                                               new_scalar_dest, first_res, res);
4246                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4247                   gimple_assign_set_lhs (new_stmt, new_res);
4248                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4249                   scalar_results[j % group_size] = new_res;
4250                 }
4251             }
4252           else
4253             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4254             scalar_results.safe_push (new_temp);
4255
4256           extract_scalar_result = false;
4257         }
4258     }
4259
4260   /* 2.4  Extract the final scalar result.  Create:
4261           s_out3 = extract_field <v_out2, bitpos>  */
4262
4263   if (extract_scalar_result)
4264     {
4265       tree rhs;
4266
4267       if (dump_enabled_p ())
4268         dump_printf_loc (MSG_NOTE, vect_location,
4269                          "extract scalar result\n");
4270
4271       if (BYTES_BIG_ENDIAN)
4272         bitpos = size_binop (MULT_EXPR,
4273                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4274                              TYPE_SIZE (scalar_type));
4275       else
4276         bitpos = bitsize_zero_node;
4277
4278       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4279       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4280       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4281       gimple_assign_set_lhs (epilog_stmt, new_temp);
4282       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4283       scalar_results.safe_push (new_temp);
4284     }
4285
4286 vect_finalize_reduction:
4287
4288   if (double_reduc)
4289     loop = loop->inner;
4290
4291   /* 2.5 Adjust the final result by the initial value of the reduction
4292          variable. (When such adjustment is not needed, then
4293          'adjustment_def' is zero).  For example, if code is PLUS we create:
4294          new_temp = loop_exit_def + adjustment_def  */
4295
4296   if (adjustment_def)
4297     {
4298       gcc_assert (!slp_reduc);
4299       if (nested_in_vect_loop)
4300         {
4301           new_phi = new_phis[0];
4302           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4303           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4304           new_dest = vect_create_destination_var (scalar_dest, vectype);
4305         }
4306       else
4307         {
4308           new_temp = scalar_results[0];
4309           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4310           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4311           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4312         }
4313
4314       epilog_stmt = gimple_build_assign (new_dest, expr);
4315       new_temp = make_ssa_name (new_dest, epilog_stmt);
4316       gimple_assign_set_lhs (epilog_stmt, new_temp);
4317       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4318       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4319       if (nested_in_vect_loop)
4320         {
4321           set_vinfo_for_stmt (epilog_stmt,
4322                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4323                                                  NULL));
4324           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4325                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4326
4327           if (!double_reduc)
4328             scalar_results.quick_push (new_temp);
4329           else
4330             scalar_results[0] = new_temp;
4331         }
4332       else
4333         scalar_results[0] = new_temp;
4334
4335       new_phis[0] = epilog_stmt;
4336     }
4337
4338   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4339           phis with new adjusted scalar results, i.e., replace use <s_out0>
4340           with use <s_out4>.
4341
4342      Transform:
4343         loop_exit:
4344           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4345           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4346           v_out2 = reduce <v_out1>
4347           s_out3 = extract_field <v_out2, 0>
4348           s_out4 = adjust_result <s_out3>
4349           use <s_out0>
4350           use <s_out0>
4351
4352      into:
4353
4354         loop_exit:
4355           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4356           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4357           v_out2 = reduce <v_out1>
4358           s_out3 = extract_field <v_out2, 0>
4359           s_out4 = adjust_result <s_out3>
4360           use <s_out4>
4361           use <s_out4> */
4362
4363
4364   /* In SLP reduction chain we reduce vector results into one vector if
4365      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4366      the last stmt in the reduction chain, since we are looking for the loop
4367      exit phi node.  */
4368   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4369     {
4370       scalar_dest = gimple_assign_lhs (
4371                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4372       group_size = 1;
4373     }
4374
4375   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4376      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4377      need to match SCALAR_RESULTS with corresponding statements.  The first
4378      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4379      the first vector stmt, etc.
4380      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4381   if (group_size > new_phis.length ())
4382     {
4383       ratio = group_size / new_phis.length ();
4384       gcc_assert (!(group_size % new_phis.length ()));
4385     }
4386   else
4387     ratio = 1;
4388
4389   for (k = 0; k < group_size; k++)
4390     {
4391       if (k % ratio == 0)
4392         {
4393           epilog_stmt = new_phis[k / ratio];
4394           reduction_phi = reduction_phis[k / ratio];
4395           if (double_reduc)
4396             inner_phi = inner_phis[k / ratio];
4397         }
4398
4399       if (slp_reduc)
4400         {
4401           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4402
4403           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4404           /* SLP statements can't participate in patterns.  */
4405           gcc_assert (!orig_stmt);
4406           scalar_dest = gimple_assign_lhs (current_stmt);
4407         }
4408
4409       phis.create (3);
4410       /* Find the loop-closed-use at the loop exit of the original scalar
4411          result.  (The reduction result is expected to have two immediate uses -
4412          one at the latch block, and one at the loop exit).  */
4413       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4414         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4415           phis.safe_push (USE_STMT (use_p));
4416
4417       /* While we expect to have found an exit_phi because of loop-closed-ssa
4418          form we can end up without one if the scalar cycle is dead.  */
4419
4420       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4421         {
4422           if (outer_loop)
4423             {
4424               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4425               gimple vect_phi;
4426
4427               /* FORNOW. Currently not supporting the case that an inner-loop
4428                  reduction is not used in the outer-loop (but only outside the
4429                  outer-loop), unless it is double reduction.  */
4430               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4431                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4432                           || double_reduc);
4433
4434               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4435               if (!double_reduc
4436                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4437                       != vect_double_reduction_def)
4438                 continue;
4439
4440               /* Handle double reduction:
4441
4442                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4443                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4444                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4445                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4446
4447                  At that point the regular reduction (stmt2 and stmt3) is
4448                  already vectorized, as well as the exit phi node, stmt4.
4449                  Here we vectorize the phi node of double reduction, stmt1, and
4450                  update all relevant statements.  */
4451
4452               /* Go through all the uses of s2 to find double reduction phi
4453                  node, i.e., stmt1 above.  */
4454               orig_name = PHI_RESULT (exit_phi);
4455               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4456                 {
4457                   stmt_vec_info use_stmt_vinfo;
4458                   stmt_vec_info new_phi_vinfo;
4459                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4460                   basic_block bb = gimple_bb (use_stmt);
4461                   gimple use;
4462
4463                   /* Check that USE_STMT is really double reduction phi
4464                      node.  */
4465                   if (gimple_code (use_stmt) != GIMPLE_PHI
4466                       || gimple_phi_num_args (use_stmt) != 2
4467                       || bb->loop_father != outer_loop)
4468                     continue;
4469                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4470                   if (!use_stmt_vinfo
4471                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4472                           != vect_double_reduction_def)
4473                     continue;
4474
4475                   /* Create vector phi node for double reduction:
4476                      vs1 = phi <vs0, vs2>
4477                      vs1 was created previously in this function by a call to
4478                        vect_get_vec_def_for_operand and is stored in
4479                        vec_initial_def;
4480                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4481                      vs0 is created here.  */
4482
4483                   /* Create vector phi node.  */
4484                   vect_phi = create_phi_node (vec_initial_def, bb);
4485                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4486                                     loop_vec_info_for_loop (outer_loop), NULL);
4487                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4488
4489                   /* Create vs0 - initial def of the double reduction phi.  */
4490                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4491                                              loop_preheader_edge (outer_loop));
4492                   init_def = get_initial_def_for_reduction (stmt,
4493                                                           preheader_arg, NULL);
4494                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4495                                                     vectype, NULL);
4496
4497                   /* Update phi node arguments with vs0 and vs2.  */
4498                   add_phi_arg (vect_phi, vect_phi_init,
4499                                loop_preheader_edge (outer_loop),
4500                                UNKNOWN_LOCATION);
4501                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4502                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4503                   if (dump_enabled_p ())
4504                     {
4505                       dump_printf_loc (MSG_NOTE, vect_location,
4506                                        "created double reduction phi node: ");
4507                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4508                       dump_printf (MSG_NOTE, "\n");
4509                     }
4510
4511                   vect_phi_res = PHI_RESULT (vect_phi);
4512
4513                   /* Replace the use, i.e., set the correct vs1 in the regular
4514                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4515                      loop is redundant.  */
4516                   use = reduction_phi;
4517                   for (j = 0; j < ncopies; j++)
4518                     {
4519                       edge pr_edge = loop_preheader_edge (loop);
4520                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4521                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4522                     }
4523                 }
4524             }
4525         }
4526
4527       phis.release ();
4528       if (nested_in_vect_loop)
4529         {
4530           if (double_reduc)
4531             loop = outer_loop;
4532           else
4533             continue;
4534         }
4535
4536       phis.create (3);
4537       /* Find the loop-closed-use at the loop exit of the original scalar
4538          result.  (The reduction result is expected to have two immediate uses,
4539          one at the latch block, and one at the loop exit).  For double
4540          reductions we are looking for exit phis of the outer loop.  */
4541       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4542         {
4543           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4544             phis.safe_push (USE_STMT (use_p));
4545           else
4546             {
4547               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4548                 {
4549                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4550
4551                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4552                     {
4553                       if (!flow_bb_inside_loop_p (loop,
4554                                              gimple_bb (USE_STMT (phi_use_p))))
4555                         phis.safe_push (USE_STMT (phi_use_p));
4556                     }
4557                 }
4558             }
4559         }
4560
4561       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4562         {
4563           /* Replace the uses:  */
4564           orig_name = PHI_RESULT (exit_phi);
4565           scalar_result = scalar_results[k];
4566           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4567             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4568               SET_USE (use_p, scalar_result);
4569         }
4570
4571       phis.release ();
4572     }
4573
4574   scalar_results.release ();
4575   inner_phis.release ();
4576   new_phis.release ();
4577 }
4578
4579
4580 /* Function vectorizable_reduction.
4581
4582    Check if STMT performs a reduction operation that can be vectorized.
4583    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4584    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4585    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4586
4587    This function also handles reduction idioms (patterns) that have been
4588    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4589    of this form:
4590      X = pattern_expr (arg0, arg1, ..., X)
4591    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4592    sequence that had been detected and replaced by the pattern-stmt (STMT).
4593
4594    In some cases of reduction patterns, the type of the reduction variable X is
4595    different than the type of the other arguments of STMT.
4596    In such cases, the vectype that is used when transforming STMT into a vector
4597    stmt is different than the vectype that is used to determine the
4598    vectorization factor, because it consists of a different number of elements
4599    than the actual number of elements that are being operated upon in parallel.
4600
4601    For example, consider an accumulation of shorts into an int accumulator.
4602    On some targets it's possible to vectorize this pattern operating on 8
4603    shorts at a time (hence, the vectype for purposes of determining the
4604    vectorization factor should be V8HI); on the other hand, the vectype that
4605    is used to create the vector form is actually V4SI (the type of the result).
4606
4607    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4608    indicates what is the actual level of parallelism (V8HI in the example), so
4609    that the right vectorization factor would be derived.  This vectype
4610    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4611    be used to create the vectorized stmt.  The right vectype for the vectorized
4612    stmt is obtained from the type of the result X:
4613         get_vectype_for_scalar_type (TREE_TYPE (X))
4614
4615    This means that, contrary to "regular" reductions (or "regular" stmts in
4616    general), the following equation:
4617       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4618    does *NOT* necessarily hold for reduction patterns.  */
4619
4620 bool
4621 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4622                         gimple *vec_stmt, slp_tree slp_node)
4623 {
4624   tree vec_dest;
4625   tree scalar_dest;
4626   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4627   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4628   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4629   tree vectype_in = NULL_TREE;
4630   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4631   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4632   enum tree_code code, orig_code, epilog_reduc_code;
4633   enum machine_mode vec_mode;
4634   int op_type;
4635   optab optab, reduc_optab;
4636   tree new_temp = NULL_TREE;
4637   tree def;
4638   gimple def_stmt;
4639   enum vect_def_type dt;
4640   gimple new_phi = NULL;
4641   tree scalar_type;
4642   bool is_simple_use;
4643   gimple orig_stmt;
4644   stmt_vec_info orig_stmt_info;
4645   tree expr = NULL_TREE;
4646   int i;
4647   int ncopies;
4648   int epilog_copies;
4649   stmt_vec_info prev_stmt_info, prev_phi_info;
4650   bool single_defuse_cycle = false;
4651   tree reduc_def = NULL_TREE;
4652   gimple new_stmt = NULL;
4653   int j;
4654   tree ops[3];
4655   bool nested_cycle = false, found_nested_cycle_def = false;
4656   gimple reduc_def_stmt = NULL;
4657   /* The default is that the reduction variable is the last in statement.  */
4658   int reduc_index = 2;
4659   bool double_reduc = false, dummy;
4660   basic_block def_bb;
4661   struct loop * def_stmt_loop, *outer_loop = NULL;
4662   tree def_arg;
4663   gimple def_arg_stmt;
4664   vec<tree> vec_oprnds0 = vNULL;
4665   vec<tree> vec_oprnds1 = vNULL;
4666   vec<tree> vect_defs = vNULL;
4667   vec<gimple> phis = vNULL;
4668   int vec_num;
4669   tree def0, def1, tem, op0, op1 = NULL_TREE;
4670
4671   /* In case of reduction chain we switch to the first stmt in the chain, but
4672      we don't update STMT_INFO, since only the last stmt is marked as reduction
4673      and has reduction properties.  */
4674   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4675     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4676
4677   if (nested_in_vect_loop_p (loop, stmt))
4678     {
4679       outer_loop = loop;
4680       loop = loop->inner;
4681       nested_cycle = true;
4682     }
4683
4684   /* 1. Is vectorizable reduction?  */
4685   /* Not supportable if the reduction variable is used in the loop, unless
4686      it's a reduction chain.  */
4687   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4688       && !GROUP_FIRST_ELEMENT (stmt_info))
4689     return false;
4690
4691   /* Reductions that are not used even in an enclosing outer-loop,
4692      are expected to be "live" (used out of the loop).  */
4693   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4694       && !STMT_VINFO_LIVE_P (stmt_info))
4695     return false;
4696
4697   /* Make sure it was already recognized as a reduction computation.  */
4698   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4699       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4700     return false;
4701
4702   /* 2. Has this been recognized as a reduction pattern?
4703
4704      Check if STMT represents a pattern that has been recognized
4705      in earlier analysis stages.  For stmts that represent a pattern,
4706      the STMT_VINFO_RELATED_STMT field records the last stmt in
4707      the original sequence that constitutes the pattern.  */
4708
4709   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4710   if (orig_stmt)
4711     {
4712       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4713       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4714       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4715     }
4716
4717   /* 3. Check the operands of the operation.  The first operands are defined
4718         inside the loop body. The last operand is the reduction variable,
4719         which is defined by the loop-header-phi.  */
4720
4721   gcc_assert (is_gimple_assign (stmt));
4722
4723   /* Flatten RHS.  */
4724   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4725     {
4726     case GIMPLE_SINGLE_RHS:
4727       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4728       if (op_type == ternary_op)
4729         {
4730           tree rhs = gimple_assign_rhs1 (stmt);
4731           ops[0] = TREE_OPERAND (rhs, 0);
4732           ops[1] = TREE_OPERAND (rhs, 1);
4733           ops[2] = TREE_OPERAND (rhs, 2);
4734           code = TREE_CODE (rhs);
4735         }
4736       else
4737         return false;
4738       break;
4739
4740     case GIMPLE_BINARY_RHS:
4741       code = gimple_assign_rhs_code (stmt);
4742       op_type = TREE_CODE_LENGTH (code);
4743       gcc_assert (op_type == binary_op);
4744       ops[0] = gimple_assign_rhs1 (stmt);
4745       ops[1] = gimple_assign_rhs2 (stmt);
4746       break;
4747
4748     case GIMPLE_TERNARY_RHS:
4749       code = gimple_assign_rhs_code (stmt);
4750       op_type = TREE_CODE_LENGTH (code);
4751       gcc_assert (op_type == ternary_op);
4752       ops[0] = gimple_assign_rhs1 (stmt);
4753       ops[1] = gimple_assign_rhs2 (stmt);
4754       ops[2] = gimple_assign_rhs3 (stmt);
4755       break;
4756
4757     case GIMPLE_UNARY_RHS:
4758       return false;
4759
4760     default:
4761       gcc_unreachable ();
4762     }
4763
4764   if (code == COND_EXPR && slp_node)
4765     return false;
4766
4767   scalar_dest = gimple_assign_lhs (stmt);
4768   scalar_type = TREE_TYPE (scalar_dest);
4769   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4770       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4771     return false;
4772
4773   /* Do not try to vectorize bit-precision reductions.  */
4774   if ((TYPE_PRECISION (scalar_type)
4775        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4776     return false;
4777
4778   /* All uses but the last are expected to be defined in the loop.
4779      The last use is the reduction variable.  In case of nested cycle this
4780      assumption is not true: we use reduc_index to record the index of the
4781      reduction variable.  */
4782   for (i = 0; i < op_type - 1; i++)
4783     {
4784       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4785       if (i == 0 && code == COND_EXPR)
4786         continue;
4787
4788       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4789                                             &def_stmt, &def, &dt, &tem);
4790       if (!vectype_in)
4791         vectype_in = tem;
4792       gcc_assert (is_simple_use);
4793
4794       if (dt != vect_internal_def
4795           && dt != vect_external_def
4796           && dt != vect_constant_def
4797           && dt != vect_induction_def
4798           && !(dt == vect_nested_cycle && nested_cycle))
4799         return false;
4800
4801       if (dt == vect_nested_cycle)
4802         {
4803           found_nested_cycle_def = true;
4804           reduc_def_stmt = def_stmt;
4805           reduc_index = i;
4806         }
4807     }
4808
4809   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4810                                         &def_stmt, &def, &dt, &tem);
4811   if (!vectype_in)
4812     vectype_in = tem;
4813   gcc_assert (is_simple_use);
4814   if (!(dt == vect_reduction_def
4815         || dt == vect_nested_cycle
4816         || ((dt == vect_internal_def || dt == vect_external_def
4817              || dt == vect_constant_def || dt == vect_induction_def)
4818             && nested_cycle && found_nested_cycle_def)))
4819     {
4820       /* For pattern recognized stmts, orig_stmt might be a reduction,
4821          but some helper statements for the pattern might not, or
4822          might be COND_EXPRs with reduction uses in the condition.  */
4823       gcc_assert (orig_stmt);
4824       return false;
4825     }
4826   if (!found_nested_cycle_def)
4827     reduc_def_stmt = def_stmt;
4828
4829   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4830   if (orig_stmt)
4831     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4832                                                        reduc_def_stmt,
4833                                                        !nested_cycle,
4834                                                        &dummy));
4835   else
4836     {
4837       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4838                                              !nested_cycle, &dummy);
4839       /* We changed STMT to be the first stmt in reduction chain, hence we
4840          check that in this case the first element in the chain is STMT.  */
4841       gcc_assert (stmt == tmp
4842                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4843     }
4844
4845   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4846     return false;
4847
4848   if (slp_node || PURE_SLP_STMT (stmt_info))
4849     ncopies = 1;
4850   else
4851     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4852                / TYPE_VECTOR_SUBPARTS (vectype_in));
4853
4854   gcc_assert (ncopies >= 1);
4855
4856   vec_mode = TYPE_MODE (vectype_in);
4857
4858   if (code == COND_EXPR)
4859     {
4860       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4861         {
4862           if (dump_enabled_p ())
4863             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4864                              "unsupported condition in reduction\n");
4865
4866             return false;
4867         }
4868     }
4869   else
4870     {
4871       /* 4. Supportable by target?  */
4872
4873       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4874           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4875         {
4876           /* Shifts and rotates are only supported by vectorizable_shifts,
4877              not vectorizable_reduction.  */
4878           if (dump_enabled_p ())
4879             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4880                              "unsupported shift or rotation.\n");
4881           return false;
4882         }
4883
4884       /* 4.1. check support for the operation in the loop  */
4885       optab = optab_for_tree_code (code, vectype_in, optab_default);
4886       if (!optab)
4887         {
4888           if (dump_enabled_p ())
4889             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4890                              "no optab.\n");
4891
4892           return false;
4893         }
4894
4895       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4896         {
4897           if (dump_enabled_p ())
4898             dump_printf (MSG_NOTE, "op not supported by target.\n");
4899
4900           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4901               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4902                   < vect_min_worthwhile_factor (code))
4903             return false;
4904
4905           if (dump_enabled_p ())
4906             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4907         }
4908
4909       /* Worthwhile without SIMD support?  */
4910       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4911           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4912              < vect_min_worthwhile_factor (code))
4913         {
4914           if (dump_enabled_p ())
4915             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4916                              "not worthwhile without SIMD support.\n");
4917
4918           return false;
4919         }
4920     }
4921
4922   /* 4.2. Check support for the epilog operation.
4923
4924           If STMT represents a reduction pattern, then the type of the
4925           reduction variable may be different than the type of the rest
4926           of the arguments.  For example, consider the case of accumulation
4927           of shorts into an int accumulator; The original code:
4928                         S1: int_a = (int) short_a;
4929           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4930
4931           was replaced with:
4932                         STMT: int_acc = widen_sum <short_a, int_acc>
4933
4934           This means that:
4935           1. The tree-code that is used to create the vector operation in the
4936              epilog code (that reduces the partial results) is not the
4937              tree-code of STMT, but is rather the tree-code of the original
4938              stmt from the pattern that STMT is replacing.  I.e, in the example
4939              above we want to use 'widen_sum' in the loop, but 'plus' in the
4940              epilog.
4941           2. The type (mode) we use to check available target support
4942              for the vector operation to be created in the *epilog*, is
4943              determined by the type of the reduction variable (in the example
4944              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4945              However the type (mode) we use to check available target support
4946              for the vector operation to be created *inside the loop*, is
4947              determined by the type of the other arguments to STMT (in the
4948              example we'd check this: optab_handler (widen_sum_optab,
4949              vect_short_mode)).
4950
4951           This is contrary to "regular" reductions, in which the types of all
4952           the arguments are the same as the type of the reduction variable.
4953           For "regular" reductions we can therefore use the same vector type
4954           (and also the same tree-code) when generating the epilog code and
4955           when generating the code inside the loop.  */
4956
4957   if (orig_stmt)
4958     {
4959       /* This is a reduction pattern: get the vectype from the type of the
4960          reduction variable, and get the tree-code from orig_stmt.  */
4961       orig_code = gimple_assign_rhs_code (orig_stmt);
4962       gcc_assert (vectype_out);
4963       vec_mode = TYPE_MODE (vectype_out);
4964     }
4965   else
4966     {
4967       /* Regular reduction: use the same vectype and tree-code as used for
4968          the vector code inside the loop can be used for the epilog code. */
4969       orig_code = code;
4970     }
4971
4972   if (nested_cycle)
4973     {
4974       def_bb = gimple_bb (reduc_def_stmt);
4975       def_stmt_loop = def_bb->loop_father;
4976       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4977                                        loop_preheader_edge (def_stmt_loop));
4978       if (TREE_CODE (def_arg) == SSA_NAME
4979           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4980           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4981           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4982           && vinfo_for_stmt (def_arg_stmt)
4983           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4984               == vect_double_reduction_def)
4985         double_reduc = true;
4986     }
4987
4988   epilog_reduc_code = ERROR_MARK;
4989   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4990     {
4991       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4992                                          optab_default);
4993       if (!reduc_optab)
4994         {
4995           if (dump_enabled_p ())
4996             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4997                              "no optab for reduction.\n");
4998
4999           epilog_reduc_code = ERROR_MARK;
5000         }
5001
5002       if (reduc_optab
5003           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5004         {
5005           if (dump_enabled_p ())
5006             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5007                              "reduc op not supported by target.\n");
5008
5009           epilog_reduc_code = ERROR_MARK;
5010         }
5011     }
5012   else
5013     {
5014       if (!nested_cycle || double_reduc)
5015         {
5016           if (dump_enabled_p ())
5017             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018                              "no reduc code for scalar code.\n");
5019
5020           return false;
5021         }
5022     }
5023
5024   if (double_reduc && ncopies > 1)
5025     {
5026       if (dump_enabled_p ())
5027         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5028                          "multiple types in double reduction\n");
5029
5030       return false;
5031     }
5032
5033   /* In case of widenning multiplication by a constant, we update the type
5034      of the constant to be the type of the other operand.  We check that the
5035      constant fits the type in the pattern recognition pass.  */
5036   if (code == DOT_PROD_EXPR
5037       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5038     {
5039       if (TREE_CODE (ops[0]) == INTEGER_CST)
5040         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5041       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5042         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5043       else
5044         {
5045           if (dump_enabled_p ())
5046             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5047                              "invalid types in dot-prod\n");
5048
5049           return false;
5050         }
5051     }
5052
5053   if (!vec_stmt) /* transformation not required.  */
5054     {
5055       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5056         return false;
5057       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5058       return true;
5059     }
5060
5061   /** Transform.  **/
5062
5063   if (dump_enabled_p ())
5064     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5065
5066   /* FORNOW: Multiple types are not supported for condition.  */
5067   if (code == COND_EXPR)
5068     gcc_assert (ncopies == 1);
5069
5070   /* Create the destination vector  */
5071   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5072
5073   /* In case the vectorization factor (VF) is bigger than the number
5074      of elements that we can fit in a vectype (nunits), we have to generate
5075      more than one vector stmt - i.e - we need to "unroll" the
5076      vector stmt by a factor VF/nunits.  For more details see documentation
5077      in vectorizable_operation.  */
5078
5079   /* If the reduction is used in an outer loop we need to generate
5080      VF intermediate results, like so (e.g. for ncopies=2):
5081         r0 = phi (init, r0)
5082         r1 = phi (init, r1)
5083         r0 = x0 + r0;
5084         r1 = x1 + r1;
5085     (i.e. we generate VF results in 2 registers).
5086     In this case we have a separate def-use cycle for each copy, and therefore
5087     for each copy we get the vector def for the reduction variable from the
5088     respective phi node created for this copy.
5089
5090     Otherwise (the reduction is unused in the loop nest), we can combine
5091     together intermediate results, like so (e.g. for ncopies=2):
5092         r = phi (init, r)
5093         r = x0 + r;
5094         r = x1 + r;
5095    (i.e. we generate VF/2 results in a single register).
5096    In this case for each copy we get the vector def for the reduction variable
5097    from the vectorized reduction operation generated in the previous iteration.
5098   */
5099
5100   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5101     {
5102       single_defuse_cycle = true;
5103       epilog_copies = 1;
5104     }
5105   else
5106     epilog_copies = ncopies;
5107
5108   prev_stmt_info = NULL;
5109   prev_phi_info = NULL;
5110   if (slp_node)
5111     {
5112       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5113       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5114                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5115     }
5116   else
5117     {
5118       vec_num = 1;
5119       vec_oprnds0.create (1);
5120       if (op_type == ternary_op)
5121         vec_oprnds1.create (1);
5122     }
5123
5124   phis.create (vec_num);
5125   vect_defs.create (vec_num);
5126   if (!slp_node)
5127     vect_defs.quick_push (NULL_TREE);
5128
5129   for (j = 0; j < ncopies; j++)
5130     {
5131       if (j == 0 || !single_defuse_cycle)
5132         {
5133           for (i = 0; i < vec_num; i++)
5134             {
5135               /* Create the reduction-phi that defines the reduction
5136                  operand.  */
5137               new_phi = create_phi_node (vec_dest, loop->header);
5138               set_vinfo_for_stmt (new_phi,
5139                                   new_stmt_vec_info (new_phi, loop_vinfo,
5140                                                      NULL));
5141                if (j == 0 || slp_node)
5142                  phis.quick_push (new_phi);
5143             }
5144         }
5145
5146       if (code == COND_EXPR)
5147         {
5148           gcc_assert (!slp_node);
5149           vectorizable_condition (stmt, gsi, vec_stmt,
5150                                   PHI_RESULT (phis[0]),
5151                                   reduc_index, NULL);
5152           /* Multiple types are not supported for condition.  */
5153           break;
5154         }
5155
5156       /* Handle uses.  */
5157       if (j == 0)
5158         {
5159           op0 = ops[!reduc_index];
5160           if (op_type == ternary_op)
5161             {
5162               if (reduc_index == 0)
5163                 op1 = ops[2];
5164               else
5165                 op1 = ops[1];
5166             }
5167
5168           if (slp_node)
5169             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5170                                slp_node, -1);
5171           else
5172             {
5173               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5174                                                             stmt, NULL);
5175               vec_oprnds0.quick_push (loop_vec_def0);
5176               if (op_type == ternary_op)
5177                {
5178                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5179                                                                NULL);
5180                  vec_oprnds1.quick_push (loop_vec_def1);
5181                }
5182             }
5183         }
5184       else
5185         {
5186           if (!slp_node)
5187             {
5188               enum vect_def_type dt;
5189               gimple dummy_stmt;
5190               tree dummy;
5191
5192               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5193                                   &dummy_stmt, &dummy, &dt);
5194               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5195                                                               loop_vec_def0);
5196               vec_oprnds0[0] = loop_vec_def0;
5197               if (op_type == ternary_op)
5198                 {
5199                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5200                                       &dummy, &dt);
5201                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5202                                                                 loop_vec_def1);
5203                   vec_oprnds1[0] = loop_vec_def1;
5204                 }
5205             }
5206
5207           if (single_defuse_cycle)
5208             reduc_def = gimple_assign_lhs (new_stmt);
5209
5210           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5211         }
5212
5213       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5214         {
5215           if (slp_node)
5216             reduc_def = PHI_RESULT (phis[i]);
5217           else
5218             {
5219               if (!single_defuse_cycle || j == 0)
5220                 reduc_def = PHI_RESULT (new_phi);
5221             }
5222
5223           def1 = ((op_type == ternary_op)
5224                   ? vec_oprnds1[i] : NULL);
5225           if (op_type == binary_op)
5226             {
5227               if (reduc_index == 0)
5228                 expr = build2 (code, vectype_out, reduc_def, def0);
5229               else
5230                 expr = build2 (code, vectype_out, def0, reduc_def);
5231             }
5232           else
5233             {
5234               if (reduc_index == 0)
5235                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5236               else
5237                 {
5238                   if (reduc_index == 1)
5239                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5240                   else
5241                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5242                 }
5243             }
5244
5245           new_stmt = gimple_build_assign (vec_dest, expr);
5246           new_temp = make_ssa_name (vec_dest, new_stmt);
5247           gimple_assign_set_lhs (new_stmt, new_temp);
5248           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5249
5250           if (slp_node)
5251             {
5252               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5253               vect_defs.quick_push (new_temp);
5254             }
5255           else
5256             vect_defs[0] = new_temp;
5257         }
5258
5259       if (slp_node)
5260         continue;
5261
5262       if (j == 0)
5263         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5264       else
5265         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5266
5267       prev_stmt_info = vinfo_for_stmt (new_stmt);
5268       prev_phi_info = vinfo_for_stmt (new_phi);
5269     }
5270
5271   /* Finalize the reduction-phi (set its arguments) and create the
5272      epilog reduction code.  */
5273   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5274     {
5275       new_temp = gimple_assign_lhs (*vec_stmt);
5276       vect_defs[0] = new_temp;
5277     }
5278
5279   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5280                                     epilog_reduc_code, phis, reduc_index,
5281                                     double_reduc, slp_node);
5282
5283   phis.release ();
5284   vect_defs.release ();
5285   vec_oprnds0.release ();
5286   vec_oprnds1.release ();
5287
5288   return true;
5289 }
5290
5291 /* Function vect_min_worthwhile_factor.
5292
5293    For a loop where we could vectorize the operation indicated by CODE,
5294    return the minimum vectorization factor that makes it worthwhile
5295    to use generic vectors.  */
5296 int
5297 vect_min_worthwhile_factor (enum tree_code code)
5298 {
5299   switch (code)
5300     {
5301     case PLUS_EXPR:
5302     case MINUS_EXPR:
5303     case NEGATE_EXPR:
5304       return 4;
5305
5306     case BIT_AND_EXPR:
5307     case BIT_IOR_EXPR:
5308     case BIT_XOR_EXPR:
5309     case BIT_NOT_EXPR:
5310       return 2;
5311
5312     default:
5313       return INT_MAX;
5314     }
5315 }
5316
5317
5318 /* Function vectorizable_induction
5319
5320    Check if PHI performs an induction computation that can be vectorized.
5321    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5322    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5323    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5324
5325 bool
5326 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5327                         gimple *vec_stmt)
5328 {
5329   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5330   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5331   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5332   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5333   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5334   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5335   tree vec_def;
5336
5337   gcc_assert (ncopies >= 1);
5338   /* FORNOW. These restrictions should be relaxed.  */
5339   if (nested_in_vect_loop_p (loop, phi))
5340     {
5341       imm_use_iterator imm_iter;
5342       use_operand_p use_p;
5343       gimple exit_phi;
5344       edge latch_e;
5345       tree loop_arg;
5346
5347       if (ncopies > 1)
5348         {
5349           if (dump_enabled_p ())
5350             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5351                              "multiple types in nested loop.\n");
5352           return false;
5353         }
5354
5355       exit_phi = NULL;
5356       latch_e = loop_latch_edge (loop->inner);
5357       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5358       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5359         {
5360           if (!flow_bb_inside_loop_p (loop->inner,
5361                                       gimple_bb (USE_STMT (use_p))))
5362             {
5363               exit_phi = USE_STMT (use_p);
5364               break;
5365             }
5366         }
5367       if (exit_phi)
5368         {
5369           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5370           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5371                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5372             {
5373               if (dump_enabled_p ())
5374                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5375                                  "inner-loop induction only used outside "
5376                                  "of the outer vectorized loop.\n");
5377               return false;
5378             }
5379         }
5380     }
5381
5382   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5383     return false;
5384
5385   /* FORNOW: SLP not supported.  */
5386   if (STMT_SLP_TYPE (stmt_info))
5387     return false;
5388
5389   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5390
5391   if (gimple_code (phi) != GIMPLE_PHI)
5392     return false;
5393
5394   if (!vec_stmt) /* transformation not required.  */
5395     {
5396       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5397       if (dump_enabled_p ())
5398         dump_printf_loc (MSG_NOTE, vect_location,
5399                          "=== vectorizable_induction ===\n");
5400       vect_model_induction_cost (stmt_info, ncopies);
5401       return true;
5402     }
5403
5404   /** Transform.  **/
5405
5406   if (dump_enabled_p ())
5407     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5408
5409   vec_def = get_initial_def_for_induction (phi);
5410   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5411   return true;
5412 }
5413
5414 /* Function vectorizable_live_operation.
5415
5416    STMT computes a value that is used outside the loop.  Check if
5417    it can be supported.  */
5418
5419 bool
5420 vectorizable_live_operation (gimple stmt,
5421                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5422                              gimple *vec_stmt)
5423 {
5424   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5425   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5426   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5427   int i;
5428   int op_type;
5429   tree op;
5430   tree def;
5431   gimple def_stmt;
5432   enum vect_def_type dt;
5433   enum tree_code code;
5434   enum gimple_rhs_class rhs_class;
5435
5436   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5437
5438   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5439     return false;
5440
5441   if (!is_gimple_assign (stmt))
5442     {
5443       if (gimple_call_internal_p (stmt)
5444           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5445           && gimple_call_lhs (stmt)
5446           && loop->simduid
5447           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5448           && loop->simduid
5449              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5450         {
5451           edge e = single_exit (loop);
5452           basic_block merge_bb = e->dest;
5453           imm_use_iterator imm_iter;
5454           use_operand_p use_p;
5455           tree lhs = gimple_call_lhs (stmt);
5456
5457           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5458             {
5459               gimple use_stmt = USE_STMT (use_p);
5460               if (gimple_code (use_stmt) == GIMPLE_PHI
5461                   || gimple_bb (use_stmt) == merge_bb)
5462                 {
5463                   if (vec_stmt)
5464                     {
5465                       tree vfm1
5466                         = build_int_cst (unsigned_type_node,
5467                                          loop_vinfo->vectorization_factor - 1);
5468                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5469                     }
5470                   return true;
5471                 }
5472             }
5473         }
5474
5475       return false;
5476     }
5477
5478   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5479     return false;
5480
5481   /* FORNOW. CHECKME. */
5482   if (nested_in_vect_loop_p (loop, stmt))
5483     return false;
5484
5485   code = gimple_assign_rhs_code (stmt);
5486   op_type = TREE_CODE_LENGTH (code);
5487   rhs_class = get_gimple_rhs_class (code);
5488   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5489   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5490
5491   /* FORNOW: support only if all uses are invariant.  This means
5492      that the scalar operations can remain in place, unvectorized.
5493      The original last scalar value that they compute will be used.  */
5494
5495   for (i = 0; i < op_type; i++)
5496     {
5497       if (rhs_class == GIMPLE_SINGLE_RHS)
5498         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5499       else
5500         op = gimple_op (stmt, i + 1);
5501       if (op
5502           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5503                                   &dt))
5504         {
5505           if (dump_enabled_p ())
5506             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5507                              "use not simple.\n");
5508           return false;
5509         }
5510
5511       if (dt != vect_external_def && dt != vect_constant_def)
5512         return false;
5513     }
5514
5515   /* No transformation is required for the cases we currently support.  */
5516   return true;
5517 }
5518
5519 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5520
5521 static void
5522 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5523 {
5524   ssa_op_iter op_iter;
5525   imm_use_iterator imm_iter;
5526   def_operand_p def_p;
5527   gimple ustmt;
5528
5529   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5530     {
5531       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5532         {
5533           basic_block bb;
5534
5535           if (!is_gimple_debug (ustmt))
5536             continue;
5537
5538           bb = gimple_bb (ustmt);
5539
5540           if (!flow_bb_inside_loop_p (loop, bb))
5541             {
5542               if (gimple_debug_bind_p (ustmt))
5543                 {
5544                   if (dump_enabled_p ())
5545                     dump_printf_loc (MSG_NOTE, vect_location,
5546                                      "killing debug use\n");
5547
5548                   gimple_debug_bind_reset_value (ustmt);
5549                   update_stmt (ustmt);
5550                 }
5551               else
5552                 gcc_unreachable ();
5553             }
5554         }
5555     }
5556 }
5557
5558 /* Function vect_transform_loop.
5559
5560    The analysis phase has determined that the loop is vectorizable.
5561    Vectorize the loop - created vectorized stmts to replace the scalar
5562    stmts in the loop, and update the loop exit condition.  */
5563
5564 void
5565 vect_transform_loop (loop_vec_info loop_vinfo)
5566 {
5567   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5568   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5569   int nbbs = loop->num_nodes;
5570   gimple_stmt_iterator si;
5571   int i;
5572   tree ratio = NULL;
5573   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5574   bool grouped_store;
5575   bool slp_scheduled = false;
5576   unsigned int nunits;
5577   gimple stmt, pattern_stmt;
5578   gimple_seq pattern_def_seq = NULL;
5579   gimple_stmt_iterator pattern_def_si = gsi_none ();
5580   bool transform_pattern_stmt = false;
5581   bool check_profitability = false;
5582   int th;
5583   /* Record number of iterations before we started tampering with the profile. */
5584   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5585
5586   if (dump_enabled_p ())
5587     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5588
5589   /* If profile is inprecise, we have chance to fix it up.  */
5590   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5591     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5592
5593   /* Use the more conservative vectorization threshold.  If the number
5594      of iterations is constant assume the cost check has been performed
5595      by our caller.  If the threshold makes all loops profitable that
5596      run at least the vectorization factor number of times checking
5597      is pointless, too.  */
5598   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5599          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5600   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5601   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5602       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5603     {
5604       if (dump_enabled_p ())
5605         dump_printf_loc (MSG_NOTE, vect_location,
5606                          "Profitability threshold is %d loop iterations.\n",
5607                          th);
5608       check_profitability = true;
5609     }
5610
5611   /* Version the loop first, if required, so the profitability check
5612      comes first.  */
5613
5614   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5615       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5616     {
5617       vect_loop_versioning (loop_vinfo, th, check_profitability);
5618       check_profitability = false;
5619     }
5620
5621   /* Peel the loop if there are data refs with unknown alignment.
5622      Only one data ref with unknown store is allowed.  */
5623
5624   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5625     {
5626       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5627       check_profitability = false;
5628     }
5629
5630   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5631      compile time constant), or it is a constant that doesn't divide by the
5632      vectorization factor, then an epilog loop needs to be created.
5633      We therefore duplicate the loop: the original loop will be vectorized,
5634      and will compute the first (n/VF) iterations.  The second copy of the loop
5635      will remain scalar and will compute the remaining (n%VF) iterations.
5636      (VF is the vectorization factor).  */
5637
5638   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5639        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5640            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5641        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5642     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5643                                     th, check_profitability);
5644   else
5645     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5646                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5647
5648   /* 1) Make sure the loop header has exactly two entries
5649      2) Make sure we have a preheader basic block.  */
5650
5651   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5652
5653   split_edge (loop_preheader_edge (loop));
5654
5655   /* FORNOW: the vectorizer supports only loops which body consist
5656      of one basic block (header + empty latch). When the vectorizer will
5657      support more involved loop forms, the order by which the BBs are
5658      traversed need to be reconsidered.  */
5659
5660   for (i = 0; i < nbbs; i++)
5661     {
5662       basic_block bb = bbs[i];
5663       stmt_vec_info stmt_info;
5664       gimple phi;
5665
5666       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5667         {
5668           phi = gsi_stmt (si);
5669           if (dump_enabled_p ())
5670             {
5671               dump_printf_loc (MSG_NOTE, vect_location,
5672                                "------>vectorizing phi: ");
5673               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5674               dump_printf (MSG_NOTE, "\n");
5675             }
5676           stmt_info = vinfo_for_stmt (phi);
5677           if (!stmt_info)
5678             continue;
5679
5680           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5681             vect_loop_kill_debug_uses (loop, phi);
5682
5683           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5684               && !STMT_VINFO_LIVE_P (stmt_info))
5685             continue;
5686
5687           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5688                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5689               && dump_enabled_p ())
5690             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5691
5692           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5693             {
5694               if (dump_enabled_p ())
5695                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5696               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5697             }
5698         }
5699
5700       pattern_stmt = NULL;
5701       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5702         {
5703           bool is_store;
5704
5705           if (transform_pattern_stmt)
5706             stmt = pattern_stmt;
5707           else
5708             {
5709               stmt = gsi_stmt (si);
5710               /* During vectorization remove existing clobber stmts.  */
5711               if (gimple_clobber_p (stmt))
5712                 {
5713                   unlink_stmt_vdef (stmt);
5714                   gsi_remove (&si, true);
5715                   release_defs (stmt);
5716                   continue;
5717                 }
5718             }
5719
5720           if (dump_enabled_p ())
5721             {
5722               dump_printf_loc (MSG_NOTE, vect_location,
5723                                "------>vectorizing statement: ");
5724               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5725               dump_printf (MSG_NOTE, "\n");
5726             }
5727
5728           stmt_info = vinfo_for_stmt (stmt);
5729
5730           /* vector stmts created in the outer-loop during vectorization of
5731              stmts in an inner-loop may not have a stmt_info, and do not
5732              need to be vectorized.  */
5733           if (!stmt_info)
5734             {
5735               gsi_next (&si);
5736               continue;
5737             }
5738
5739           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5740             vect_loop_kill_debug_uses (loop, stmt);
5741
5742           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5743               && !STMT_VINFO_LIVE_P (stmt_info))
5744             {
5745               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5746                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5747                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5748                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5749                 {
5750                   stmt = pattern_stmt;
5751                   stmt_info = vinfo_for_stmt (stmt);
5752                 }
5753               else
5754                 {
5755                   gsi_next (&si);
5756                   continue;
5757                 }
5758             }
5759           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5760                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5761                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5762                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5763             transform_pattern_stmt = true;
5764
5765           /* If pattern statement has def stmts, vectorize them too.  */
5766           if (is_pattern_stmt_p (stmt_info))
5767             {
5768               if (pattern_def_seq == NULL)
5769                 {
5770                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5771                   pattern_def_si = gsi_start (pattern_def_seq);
5772                 }
5773               else if (!gsi_end_p (pattern_def_si))
5774                 gsi_next (&pattern_def_si);
5775               if (pattern_def_seq != NULL)
5776                 {
5777                   gimple pattern_def_stmt = NULL;
5778                   stmt_vec_info pattern_def_stmt_info = NULL;
5779
5780                   while (!gsi_end_p (pattern_def_si))
5781                     {
5782                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5783                       pattern_def_stmt_info
5784                         = vinfo_for_stmt (pattern_def_stmt);
5785                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5786                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5787                         break;
5788                       gsi_next (&pattern_def_si);
5789                     }
5790
5791                   if (!gsi_end_p (pattern_def_si))
5792                     {
5793                       if (dump_enabled_p ())
5794                         {
5795                           dump_printf_loc (MSG_NOTE, vect_location,
5796                                            "==> vectorizing pattern def "
5797                                            "stmt: ");
5798                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5799                                             pattern_def_stmt, 0);
5800                           dump_printf (MSG_NOTE, "\n");
5801                         }
5802
5803                       stmt = pattern_def_stmt;
5804                       stmt_info = pattern_def_stmt_info;
5805                     }
5806                   else
5807                     {
5808                       pattern_def_si = gsi_none ();
5809                       transform_pattern_stmt = false;
5810                     }
5811                 }
5812               else
5813                 transform_pattern_stmt = false;
5814             }
5815
5816           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5817           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5818                                                STMT_VINFO_VECTYPE (stmt_info));
5819           if (!STMT_SLP_TYPE (stmt_info)
5820               && nunits != (unsigned int) vectorization_factor
5821               && dump_enabled_p ())
5822             /* For SLP VF is set according to unrolling factor, and not to
5823                vector size, hence for SLP this print is not valid.  */
5824             dump_printf_loc (MSG_NOTE, vect_location,
5825                              "multiple-types.\n");
5826
5827           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5828              reached.  */
5829           if (STMT_SLP_TYPE (stmt_info))
5830             {
5831               if (!slp_scheduled)
5832                 {
5833                   slp_scheduled = true;
5834
5835                   if (dump_enabled_p ())
5836                     dump_printf_loc (MSG_NOTE, vect_location,
5837                                      "=== scheduling SLP instances ===\n");
5838
5839                   vect_schedule_slp (loop_vinfo, NULL);
5840                 }
5841
5842               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5843               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5844                 {
5845                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5846                     {
5847                       pattern_def_seq = NULL;
5848                       gsi_next (&si);
5849                     }
5850                   continue;
5851                 }
5852             }
5853
5854           /* -------- vectorize statement ------------ */
5855           if (dump_enabled_p ())
5856             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
5857
5858           grouped_store = false;
5859           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5860           if (is_store)
5861             {
5862               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5863                 {
5864                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5865                      interleaving chain was completed - free all the stores in
5866                      the chain.  */
5867                   gsi_next (&si);
5868                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5869                   continue;
5870                 }
5871               else
5872                 {
5873                   /* Free the attached stmt_vec_info and remove the stmt.  */
5874                   gimple store = gsi_stmt (si);
5875                   free_stmt_vec_info (store);
5876                   unlink_stmt_vdef (store);
5877                   gsi_remove (&si, true);
5878                   release_defs (store);
5879                   continue;
5880                 }
5881             }
5882
5883           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5884             {
5885               pattern_def_seq = NULL;
5886               gsi_next (&si);
5887             }
5888         }                       /* stmts in BB */
5889     }                           /* BBs in loop */
5890
5891   slpeel_make_loop_iterate_ntimes (loop, ratio);
5892
5893   /* Reduce loop iterations by the vectorization factor.  */
5894   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5895                       expected_iterations / vectorization_factor);
5896   loop->nb_iterations_upper_bound
5897     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5898                                             FLOOR_DIV_EXPR);
5899   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5900       && loop->nb_iterations_upper_bound != double_int_zero)
5901     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5902   if (loop->any_estimate)
5903     {
5904       loop->nb_iterations_estimate
5905         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5906                                              FLOOR_DIV_EXPR);
5907        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5908            && loop->nb_iterations_estimate != double_int_zero)
5909          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5910     }
5911
5912   if (dump_enabled_p ())
5913     {
5914       dump_printf_loc (MSG_NOTE, vect_location,
5915                        "LOOP VECTORIZED\n");
5916       if (loop->inner)
5917         dump_printf_loc (MSG_NOTE, vect_location,
5918                          "OUTER LOOP VECTORIZED\n");
5919       dump_printf (MSG_NOTE, "\n");
5920     }
5921 }