[official-gcc.git] / gcc / tree-vect-loop.c
blob 908caed0b57137b36a12447401e73d92e1ee535b
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "dumpfile.h"
27 #include "tm.h"
28 #include "ggc.h"
29 #include "tree.h"
30 #include "basic-block.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-pass.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "recog.h"
37 #include "optabs.h"
38 #include "params.h"
39 #include "diagnostic-core.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "target.h"
45 /* Loop Vectorization Pass.
47 This pass tries to vectorize loops.
49 For example, the vectorizer transforms the following simple loop:
51 short a[N]; short b[N]; short c[N]; int i;
53 for (i=0; i<N; i++){
54 a[i] = b[i] + c[i];
57 as if it were manually vectorized by rewriting the source code into:
59 typedef int __attribute__((mode(V8HI))) v8hi;
60 short a[N]; short b[N]; short c[N]; int i;
61 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
62 v8hi va, vb, vc;
64 for (i=0; i<N/8; i++){
65 vb = pb[i];
66 vc = pc[i];
67 va = vb + vc;
68 pa[i] = va;
71 The main entry to this pass is vectorize_loops(), in which
72 the vectorizer applies a set of analyses on a given set of loops,
73 followed by the actual vectorization transformation for the loops that
74 had successfully passed the analysis phase.
75 Throughout this pass we make a distinction between two types of
76 data: scalars (which are represented by SSA_NAMES), and memory references
77 ("data-refs"). These two types of data require different handling both
78 during analysis and transformation. The types of data-refs that the
79 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
80 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
81 accesses are required to have a simple (consecutive) access pattern.
83 Analysis phase:
84 ===============
85 The driver for the analysis phase is vect_analyze_loop().
86 It applies a set of analyses, some of which rely on the scalar evolution
87 analyzer (scev) developed by Sebastian Pop.
89 During the analysis phase the vectorizer records some information
90 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
91 loop, as well as general information about the loop as a whole, which is
92 recorded in a "loop_vec_info" struct attached to each loop.
94 Transformation phase:
95 =====================
96 The loop transformation phase scans all the stmts in the loop, and
97 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
98 the loop that needs to be vectorized. It inserts the vector code sequence
99 just before the scalar stmt S, and records a pointer to the vector code
100 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
101 attached to S). This pointer will be used for the vectorization of following
102 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
103 otherwise, we rely on dead code elimination for removing it.
105 For example, say stmt S1 was vectorized into stmt VS1:
107 VS1: vb = px[i];
108 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
109 S2: a = b;
111 To vectorize stmt S2, the vectorizer first finds the stmt that defines
112 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
113 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
114 resulting sequence would be:
116 VS1: vb = px[i];
117 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 VS2: va = vb;
119 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121 Operands that are not SSA_NAMEs are data-refs that appear in
122 load/store operations (like 'x[i]' in S1), and are handled differently.
124 Target modeling:
125 =================
126 Currently the only target specific information that is used is the
127 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
128 Targets that can support different vector sizes will, for now, need
129 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
130 flexibility will be added in the future.
132 Since we only vectorize operations whose vector form can be
133 expressed using existing tree codes, to verify that an operation is
134 supported, the vectorizer checks the relevant optab at the relevant
135 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
136 the value found is CODE_FOR_nothing, then there's no target support, and
137 we can't vectorize the stmt.
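   As an illustration only (a minimal sketch under the assumptions above,
   not a function defined in this file), such a support check could look
   like:

     static bool
     example_target_supports_v8hi_add (void)
     {
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }

   where CODE_FOR_nothing indicates that the target provides no instruction
   implementing the optab for the requested mode.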
139 For additional information on this project see:
140 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
143 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
145 /* Function vect_determine_vectorization_factor
147 Determine the vectorization factor (VF). VF is the number of data elements
148 that are operated upon in parallel in a single iteration of the vectorized
149 loop. For example, when vectorizing a loop that operates on 4-byte elements,
150 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
151 elements can fit in a single vector register.
153 We currently support vectorization of loops in which all types operated upon
154 are of the same size. Therefore this function currently sets VF according to
155 the size of the types operated upon, and fails if there are multiple sizes
156 in the loop.
158 VF is also the factor by which the loop iterations are strip-mined, e.g.:
159 original loop:
160 for (i=0; i<N; i++){
161 a[i] = b[i] + c[i];
164 vectorized loop:
165 for (i=0; i<N; i+=VF){
166 a[i:VF] = b[i:VF] + c[i:VF];
170 static bool
171 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
175 int nbbs = loop->num_nodes;
176 gimple_stmt_iterator si;
177 unsigned int vectorization_factor = 0;
178 tree scalar_type;
179 gimple phi;
180 tree vectype;
181 unsigned int nunits;
182 stmt_vec_info stmt_info;
183 int i;
184 HOST_WIDE_INT dummy;
185 gimple stmt, pattern_stmt = NULL;
186 gimple_seq pattern_def_seq = NULL;
187 gimple_stmt_iterator pattern_def_si = gsi_none ();
188 bool analyze_pattern_stmt = false;
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_NOTE, vect_location,
192 "=== vect_determine_vectorization_factor ===");
194 for (i = 0; i < nbbs; i++)
196 basic_block bb = bbs[i];
198 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
200 phi = gsi_stmt (si);
201 stmt_info = vinfo_for_stmt (phi);
202 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
205 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
208 gcc_assert (stmt_info);
210 if (STMT_VINFO_RELEVANT_P (stmt_info))
212 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
213 scalar_type = TREE_TYPE (PHI_RESULT (phi));
215 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location,
218 "get vectype for scalar type: ");
219 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
222 vectype = get_vectype_for_scalar_type (scalar_type);
223 if (!vectype)
225 if (dump_enabled_p ())
227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228 "not vectorized: unsupported "
229 "data-type ");
230 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
231 scalar_type);
233 return false;
235 STMT_VINFO_VECTYPE (stmt_info) = vectype;
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
240 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
243 nunits = TYPE_VECTOR_SUBPARTS (vectype);
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
247 if (!vectorization_factor
248 || (nunits > vectorization_factor))
249 vectorization_factor = nunits;
253 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
255 tree vf_vectype;
257 if (analyze_pattern_stmt)
258 stmt = pattern_stmt;
259 else
260 stmt = gsi_stmt (si);
262 stmt_info = vinfo_for_stmt (stmt);
264 if (dump_enabled_p ())
266 dump_printf_loc (MSG_NOTE, vect_location,
267 "==> examining statement: ");
268 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
271 gcc_assert (stmt_info);
273 /* Skip stmts which do not need to be vectorized. */
274 if (!STMT_VINFO_RELEVANT_P (stmt_info)
275 && !STMT_VINFO_LIVE_P (stmt_info))
277 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
278 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
279 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
280 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
282 stmt = pattern_stmt;
283 stmt_info = vinfo_for_stmt (pattern_stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining pattern statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 else
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
295 gsi_next (&si);
296 continue;
299 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
300 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
301 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
302 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 analyze_pattern_stmt = true;
305 /* If a pattern statement has def stmts, analyze them too. */
306 if (is_pattern_stmt_p (stmt_info))
308 if (pattern_def_seq == NULL)
310 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
311 pattern_def_si = gsi_start (pattern_def_seq);
313 else if (!gsi_end_p (pattern_def_si))
314 gsi_next (&pattern_def_si);
315 if (pattern_def_seq != NULL)
317 gimple pattern_def_stmt = NULL;
318 stmt_vec_info pattern_def_stmt_info = NULL;
320 while (!gsi_end_p (pattern_def_si))
322 pattern_def_stmt = gsi_stmt (pattern_def_si);
323 pattern_def_stmt_info
324 = vinfo_for_stmt (pattern_def_stmt);
325 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
326 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
327 break;
328 gsi_next (&pattern_def_si);
331 if (!gsi_end_p (pattern_def_si))
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location,
336 "==> examining pattern def stmt: ");
337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
338 pattern_def_stmt, 0);
341 stmt = pattern_def_stmt;
342 stmt_info = pattern_def_stmt_info;
344 else
346 pattern_def_si = gsi_none ();
347 analyze_pattern_stmt = false;
350 else
351 analyze_pattern_stmt = false;
354 if (gimple_get_lhs (stmt) == NULL_TREE)
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
359 "not vectorized: irregular stmt.");
360 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
363 return false;
366 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
368 if (dump_enabled_p ())
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "not vectorized: vector stmt in loop:");
372 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
374 return false;
377 if (STMT_VINFO_VECTYPE (stmt_info))
379 /* The only case in which a vectype has already been set is for stmts
380 that contain a dataref, or for "pattern-stmts" (stmts
381 generated by the vectorizer to represent/replace a certain
382 idiom). */
383 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
384 || is_pattern_stmt_p (stmt_info)
385 || !gsi_end_p (pattern_def_si));
386 vectype = STMT_VINFO_VECTYPE (stmt_info);
388 else
390 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
391 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_NOTE, vect_location,
395 "get vectype for scalar type: ");
396 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
398 vectype = get_vectype_for_scalar_type (scalar_type);
399 if (!vectype)
401 if (dump_enabled_p ())
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "not vectorized: unsupported "
405 "data-type ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 scalar_type);
409 return false;
412 STMT_VINFO_VECTYPE (stmt_info) = vectype;
415 /* The vectorization factor is determined by the smallest
416 scalar type (or the largest vector size, but we only
417 support one vector size per loop). */
418 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
419 &dummy);
420 if (dump_enabled_p ())
422 dump_printf_loc (MSG_NOTE, vect_location,
423 "get vectype for scalar type: ");
424 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
426 vf_vectype = get_vectype_for_scalar_type (scalar_type);
427 if (!vf_vectype)
429 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "not vectorized: unsupported data-type ");
433 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
434 scalar_type);
436 return false;
439 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
440 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
442 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
445 "not vectorized: different sized vector "
446 "types in statement, ");
447 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
448 vectype);
449 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
450 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
451 vf_vectype);
453 return false;
456 if (dump_enabled_p ())
458 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
459 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
462 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
463 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
465 if (!vectorization_factor
466 || (nunits > vectorization_factor))
467 vectorization_factor = nunits;
469 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
471 pattern_def_seq = NULL;
472 gsi_next (&si);
477 /* TODO: Analyze cost. Decide if worth while to vectorize. */
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
480 vectorization_factor);
481 if (vectorization_factor <= 1)
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 "not vectorized: unsupported data-type");
486 return false;
488 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
490 return true;
494 /* Function vect_is_simple_iv_evolution.
496 FORNOW: A simple evolution of an induction variable in the loop is
497 considered a polynomial evolution with constant step. */
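/* For illustration (an assumed example, not taken from this file): in a
   loop such as "for (i = 0; i < n; i++) ...", the scalar evolution of I is
   the polynomial chrec {0, +, 1}_1, i.e. initial value 0 with a constant
   step of 1 in loop number 1, which is "simple" in the sense used here.
   An evolution whose step is itself a chrec (degree >= 2) is rejected.  */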
499 static bool
500 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
501 tree * step)
503 tree init_expr;
504 tree step_expr;
505 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
507 /* When there is no evolution in this loop, the evolution function
508 is not "simple". */
509 if (evolution_part == NULL_TREE)
510 return false;
512 /* When the evolution is a polynomial of degree >= 2
513 the evolution function is not "simple". */
514 if (tree_is_chrec (evolution_part))
515 return false;
517 step_expr = evolution_part;
518 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
523 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
524 dump_printf (MSG_NOTE, ", init: ");
525 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
528 *init = init_expr;
529 *step = step_expr;
531 if (TREE_CODE (step_expr) != INTEGER_CST)
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "step unknown.");
536 return false;
539 return true;
542 /* Function vect_analyze_scalar_cycles_1.
544 Examine the cross iteration def-use cycles of scalar variables
545 in LOOP. LOOP_VINFO represents the loop that is now being
546 considered for vectorization (can be LOOP, or an outer-loop
547 enclosing LOOP). */
549 static void
550 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
552 basic_block bb = loop->header;
553 tree dumy;
554 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
555 gimple_stmt_iterator gsi;
556 bool double_reduc;
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "=== vect_analyze_scalar_cycles ===");
562 /* First - identify all inductions. Reduction detection assumes that all the
563 inductions have been identified, therefore, this order must not be
564 changed. */
565 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
567 gimple phi = gsi_stmt (gsi);
568 tree access_fn = NULL;
569 tree def = PHI_RESULT (phi);
570 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
572 if (dump_enabled_p ())
574 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
575 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
578 /* Skip virtual phi's. The data dependences that are associated with
579 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
580 if (virtual_operand_p (def))
581 continue;
583 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
585 /* Analyze the evolution function. */
586 access_fn = analyze_scalar_evolution (loop, def);
587 if (access_fn)
589 STRIP_NOPS (access_fn);
590 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Access function of PHI: ");
594 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
596 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
597 = evolution_part_in_loop_num (access_fn, loop->num);
600 if (!access_fn
601 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
603 VEC_safe_push (gimple, heap, worklist, phi);
604 continue;
607 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
611 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
615 /* Second - identify all reductions and nested cycles. */
616 while (VEC_length (gimple, worklist) > 0)
618 gimple phi = VEC_pop (gimple, worklist);
619 tree def = PHI_RESULT (phi);
620 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
621 gimple reduc_stmt;
622 bool nested_cycle;
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
627 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
630 gcc_assert (!virtual_operand_p (def)
631 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
633 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
634 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
635 &double_reduc);
636 if (reduc_stmt)
638 if (double_reduc)
640 if (dump_enabled_p ())
641 dump_printf_loc (MSG_NOTE, vect_location,
642 "Detected double reduction.");
644 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
645 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
646 vect_double_reduction_def;
648 else
650 if (nested_cycle)
652 if (dump_enabled_p ())
653 dump_printf_loc (MSG_NOTE, vect_location,
654 "Detected vectorizable nested cycle.");
656 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
657 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
658 vect_nested_cycle;
660 else
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_NOTE, vect_location,
664 "Detected reduction.");
666 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
667 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
668 vect_reduction_def;
669 /* Store the reduction cycles for possible vectorization in
670 loop-aware SLP. */
671 VEC_safe_push (gimple, heap,
672 LOOP_VINFO_REDUCTIONS (loop_vinfo),
673 reduc_stmt);
677 else
678 if (dump_enabled_p ())
679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
680 "Unknown def-use cycle pattern.");
683 VEC_free (gimple, heap, worklist);
687 /* Function vect_analyze_scalar_cycles.
689 Examine the cross iteration def-use cycles of scalar variables, by
690 analyzing the loop-header PHIs of scalar variables. Classify each
691 cycle as one of the following: invariant, induction, reduction, unknown.
692 We do that for the loop represented by LOOP_VINFO, and also for its
693 inner-loop, if it exists.
694 Examples for scalar cycles:
696 Example1: reduction:
698 loop1:
699 for (i=0; i<N; i++)
700 sum += a[i];
702 Example2: induction:
704 loop2:
705 for (i=0; i<N; i++)
706 a[i] = i; */
708 static void
709 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
711 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
713 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
715 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
716 Reductions in such an inner-loop therefore have different properties than
717 the reductions in the nest that gets vectorized:
718 1. When vectorized, they are executed in the same order as in the original
719 scalar loop, so we can't change the order of computation when
720 vectorizing them.
721 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
722 current checks are too strict. */
724 if (loop->inner)
725 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
728 /* Function vect_get_loop_niters.
730 Determine how many iterations the loop executes.
731 If an expression that represents the number of iterations
732 can be constructed, place it in NUMBER_OF_ITERATIONS.
733 Return the loop exit condition. */
735 static gimple
736 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
738 tree niters;
740 if (dump_enabled_p ())
741 dump_printf_loc (MSG_NOTE, vect_location,
742 "=== get_loop_niters ===");
743 niters = number_of_exit_cond_executions (loop);
745 if (niters != NULL_TREE
746 && niters != chrec_dont_know)
748 *number_of_iterations = niters;
750 if (dump_enabled_p ())
752 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
753 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
757 return get_loop_exit_condition (loop);
761 /* Function bb_in_loop_p
763 Used as predicate for dfs order traversal of the loop bbs. */
765 static bool
766 bb_in_loop_p (const_basic_block bb, const void *data)
768 const struct loop *const loop = (const struct loop *)data;
769 if (flow_bb_inside_loop_p (loop, bb))
770 return true;
771 return false;
775 /* Function new_loop_vec_info.
777 Create and initialize a new loop_vec_info struct for LOOP, as well as
778 stmt_vec_info structs for all the stmts in LOOP. */
780 static loop_vec_info
781 new_loop_vec_info (struct loop *loop)
783 loop_vec_info res;
784 basic_block *bbs;
785 gimple_stmt_iterator si;
786 unsigned int i, nbbs;
788 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
789 LOOP_VINFO_LOOP (res) = loop;
791 bbs = get_loop_body (loop);
793 /* Create/Update stmt_info for all stmts in the loop. */
794 for (i = 0; i < loop->num_nodes; i++)
796 basic_block bb = bbs[i];
798 /* BBs in a nested inner-loop will have been already processed (because
799 we will have called vect_analyze_loop_form for any nested inner-loop).
800 Therefore, for stmts in an inner-loop we just want to update the
801 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
802 loop_info of the outer-loop we are currently considering to vectorize
803 (instead of the loop_info of the inner-loop).
804 For stmts in other BBs we need to create a stmt_info from scratch. */
805 if (bb->loop_father != loop)
807 /* Inner-loop bb. */
808 gcc_assert (loop->inner && bb->loop_father == loop->inner);
809 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
811 gimple phi = gsi_stmt (si);
812 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
813 loop_vec_info inner_loop_vinfo =
814 STMT_VINFO_LOOP_VINFO (stmt_info);
815 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
816 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
818 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
820 gimple stmt = gsi_stmt (si);
821 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
822 loop_vec_info inner_loop_vinfo =
823 STMT_VINFO_LOOP_VINFO (stmt_info);
824 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
825 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
828 else
830 /* bb in current nest. */
831 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
833 gimple phi = gsi_stmt (si);
834 gimple_set_uid (phi, 0);
835 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
838 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
840 gimple stmt = gsi_stmt (si);
841 gimple_set_uid (stmt, 0);
842 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
847 /* CHECKME: We want to visit all BBs before their successors (except for
848 latch blocks, for which this assertion wouldn't hold). In the simple
849 case of the loop forms we allow, a dfs order of the BBs would be the same
850 as reversed postorder traversal, so we are safe. */
852 free (bbs);
853 bbs = XCNEWVEC (basic_block, loop->num_nodes);
854 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
855 bbs, loop->num_nodes, loop);
856 gcc_assert (nbbs == loop->num_nodes);
858 LOOP_VINFO_BBS (res) = bbs;
859 LOOP_VINFO_NITERS (res) = NULL;
860 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
861 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
862 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
863 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
864 LOOP_VINFO_VECT_FACTOR (res) = 0;
865 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
866 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
867 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
868 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
869 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
870 VEC_alloc (gimple, heap,
871 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
872 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
873 VEC_alloc (ddr_p, heap,
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
875 LOOP_VINFO_GROUPED_STORES (res) = VEC_alloc (gimple, heap, 10);
876 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
877 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
878 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
879 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
880 LOOP_VINFO_PEELING_HTAB (res) = NULL;
881 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
882 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
883 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
885 return res;
889 /* Function destroy_loop_vec_info.
891 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
892 stmts in the loop. */
894 void
895 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
897 struct loop *loop;
898 basic_block *bbs;
899 int nbbs;
900 gimple_stmt_iterator si;
901 int j;
902 VEC (slp_instance, heap) *slp_instances;
903 slp_instance instance;
904 bool swapped;
906 if (!loop_vinfo)
907 return;
909 loop = LOOP_VINFO_LOOP (loop_vinfo);
911 bbs = LOOP_VINFO_BBS (loop_vinfo);
912 nbbs = loop->num_nodes;
913 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
915 if (!clean_stmts)
917 free (LOOP_VINFO_BBS (loop_vinfo));
918 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
919 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
920 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
921 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
922 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
924 free (loop_vinfo);
925 loop->aux = NULL;
926 return;
929 for (j = 0; j < nbbs; j++)
931 basic_block bb = bbs[j];
932 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
933 free_stmt_vec_info (gsi_stmt (si));
935 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
937 gimple stmt = gsi_stmt (si);
939 /* We may have broken canonical form by moving a constant
940 into RHS1 of a commutative op. Fix such occurrences. */
941 if (swapped && is_gimple_assign (stmt))
943 enum tree_code code = gimple_assign_rhs_code (stmt);
945 if ((code == PLUS_EXPR
946 || code == POINTER_PLUS_EXPR
947 || code == MULT_EXPR)
948 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
949 swap_tree_operands (stmt,
950 gimple_assign_rhs1_ptr (stmt),
951 gimple_assign_rhs2_ptr (stmt));
954 /* Free stmt_vec_info. */
955 free_stmt_vec_info (stmt);
956 gsi_next (&si);
960 free (LOOP_VINFO_BBS (loop_vinfo));
961 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
962 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
963 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
964 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
965 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
966 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
967 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
968 vect_free_slp_instance (instance);
970 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
971 VEC_free (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo));
972 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
973 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
975 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
976 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
978 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
980 free (loop_vinfo);
981 loop->aux = NULL;
985 /* Function vect_analyze_loop_1.
987 Apply a set of analyses on LOOP, and create a loop_vec_info struct
988 for it. The different analyses will record information in the
989 loop_vec_info struct. This is a subset of the analyses applied in
990 vect_analyze_loop, to be applied on an inner-loop nested in the loop
991 that is now considered for (outer-loop) vectorization. */
993 static loop_vec_info
994 vect_analyze_loop_1 (struct loop *loop)
996 loop_vec_info loop_vinfo;
998 if (dump_enabled_p ())
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "===== analyze_loop_nest_1 =====");
1002 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1004 loop_vinfo = vect_analyze_loop_form (loop);
1005 if (!loop_vinfo)
1007 if (dump_enabled_p ())
1008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1009 "bad inner-loop form.");
1010 return NULL;
1013 return loop_vinfo;
1017 /* Function vect_analyze_loop_form.
1019 Verify that certain CFG restrictions hold, including:
1020 - the loop has a pre-header
1021 - the loop has a single entry and exit
1022 - the loop exit condition is simple enough, and the number of iterations
1023 can be analyzed (a countable loop). */
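/* As an illustrative sketch (assumed source, not from this file): a loop
   like "for (i = 0; i < n; i++) a[i] = b[i];" meets these restrictions -
   it has a single entry and exit and a computable iteration count -
   whereas a search loop such as "while (*p && *p != c) p++;" exits on a
   data-dependent condition, so its iteration count cannot be analyzed and
   it is rejected.  */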
1025 loop_vec_info
1026 vect_analyze_loop_form (struct loop *loop)
1028 loop_vec_info loop_vinfo;
1029 gimple loop_cond;
1030 tree number_of_iterations = NULL;
1031 loop_vec_info inner_loop_vinfo = NULL;
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "=== vect_analyze_loop_form ===");
1037 /* Different restrictions apply when we are considering an inner-most loop,
1038 vs. an outer (nested) loop.
1039 (FORNOW. May want to relax some of these restrictions in the future). */
1041 if (!loop->inner)
1043 /* Inner-most loop. We currently require that the number of BBs is
1044 exactly 2 (the header and latch). Vectorizable inner-most loops
1045 look like this:
1047 (pre-header)
1049 header <--------+
1050 | | |
1051 | +--> latch --+
1053 (exit-bb) */
1055 if (loop->num_nodes != 2)
1057 if (dump_enabled_p ())
1058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1059 "not vectorized: control flow in loop.");
1060 return NULL;
1063 if (empty_block_p (loop->header))
1065 if (dump_enabled_p ())
1066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1067 "not vectorized: empty loop.");
1068 return NULL;
1071 else
1073 struct loop *innerloop = loop->inner;
1074 edge entryedge;
1076 /* Nested loop. We currently require that the loop is doubly-nested,
1077 contains a single inner loop, and the number of BBs is exactly 5.
1078 Vectorizable outer-loops look like this:
1080 (pre-header)
1082 header <---+
1084 inner-loop |
1086 tail ------+
1088 (exit-bb)
1090 The inner-loop has the properties expected of inner-most loops
1091 as described above. */
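/* For instance (an illustrative example, not from this file), an
   outer-loop of the accepted shape might come from source like:

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         a[i][j] = b[i][j] + c[i][j];

   where the outer loop consists of a header, the single inner loop, and a
   tail block, and the inner loop's iteration count M is invariant in the
   outer loop.  */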
1093 if ((loop->inner)->inner || (loop->inner)->next)
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "not vectorized: multiple nested loops.");
1098 return NULL;
1101 /* Analyze the inner-loop. */
1102 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1103 if (!inner_loop_vinfo)
1105 if (dump_enabled_p ())
1106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107 "not vectorized: Bad inner loop.");
1108 return NULL;
1111 if (!expr_invariant_in_loop_p (loop,
1112 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1114 if (dump_enabled_p ())
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116 "not vectorized: inner-loop count not invariant.");
1117 destroy_loop_vec_info (inner_loop_vinfo, true);
1118 return NULL;
1121 if (loop->num_nodes != 5)
1123 if (dump_enabled_p ())
1124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1125 "not vectorized: control flow in loop.");
1126 destroy_loop_vec_info (inner_loop_vinfo, true);
1127 return NULL;
1130 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1131 entryedge = EDGE_PRED (innerloop->header, 0);
1132 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1133 entryedge = EDGE_PRED (innerloop->header, 1);
1135 if (entryedge->src != loop->header
1136 || !single_exit (innerloop)
1137 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "not vectorized: unsupported outerloop form.");
1142 destroy_loop_vec_info (inner_loop_vinfo, true);
1143 return NULL;
1146 if (dump_enabled_p ())
1147 dump_printf_loc (MSG_NOTE, vect_location,
1148 "Considering outer-loop vectorization.");
1151 if (!single_exit (loop)
1152 || EDGE_COUNT (loop->header->preds) != 2)
1154 if (dump_enabled_p ())
1156 if (!single_exit (loop))
1157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1158 "not vectorized: multiple exits.");
1159 else if (EDGE_COUNT (loop->header->preds) != 2)
1160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161 "not vectorized: too many incoming edges.");
1163 if (inner_loop_vinfo)
1164 destroy_loop_vec_info (inner_loop_vinfo, true);
1165 return NULL;
1168 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1169 that the loop is represented as a do-while (with a proper if-guard
1170 before the loop if needed), where the loop header contains all the
1171 executable statements, and the latch is empty. */
1172 if (!empty_block_p (loop->latch)
1173 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "not vectorized: unexpected loop form.");
1178 if (inner_loop_vinfo)
1179 destroy_loop_vec_info (inner_loop_vinfo, true);
1180 return NULL;
1183 /* Make sure there exists a single-predecessor exit bb: */
1184 if (!single_pred_p (single_exit (loop)->dest))
1186 edge e = single_exit (loop);
1187 if (!(e->flags & EDGE_ABNORMAL))
1189 split_loop_exit_edge (e);
1190 if (dump_enabled_p ())
1191 dump_printf (MSG_NOTE, "split exit edge.");
1193 else
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "not vectorized: abnormal loop exit edge.");
1198 if (inner_loop_vinfo)
1199 destroy_loop_vec_info (inner_loop_vinfo, true);
1200 return NULL;
1204 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1205 if (!loop_cond)
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "not vectorized: complicated exit condition.");
1210 if (inner_loop_vinfo)
1211 destroy_loop_vec_info (inner_loop_vinfo, true);
1212 return NULL;
1215 if (!number_of_iterations)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "not vectorized: number of iterations cannot be "
1220 "computed.");
1221 if (inner_loop_vinfo)
1222 destroy_loop_vec_info (inner_loop_vinfo, true);
1223 return NULL;
1226 if (chrec_contains_undetermined (number_of_iterations))
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "Infinite number of iterations.");
1231 if (inner_loop_vinfo)
1232 destroy_loop_vec_info (inner_loop_vinfo, true);
1233 return NULL;
1236 if (!NITERS_KNOWN_P (number_of_iterations))
1238 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Symbolic number of iterations is ");
1242 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1245 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: number of iterations = 0.");
1250 if (inner_loop_vinfo)
1251 destroy_loop_vec_info (inner_loop_vinfo, false);
1252 return NULL;
1255 loop_vinfo = new_loop_vec_info (loop);
1256 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1257 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1259 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1261 /* CHECKME: May want to keep it around in the future. */
1262 if (inner_loop_vinfo)
1263 destroy_loop_vec_info (inner_loop_vinfo, false);
1265 gcc_assert (!loop->aux);
1266 loop->aux = loop_vinfo;
1267 return loop_vinfo;
1271 /* Function vect_analyze_loop_operations.
1273 Scan the loop stmts and make sure they are all vectorizable. */
1275 static bool
1276 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1279 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1280 int nbbs = loop->num_nodes;
1281 gimple_stmt_iterator si;
1282 unsigned int vectorization_factor = 0;
1283 int i;
1284 gimple phi;
1285 stmt_vec_info stmt_info;
1286 bool need_to_vectorize = false;
1287 int min_profitable_iters;
1288 int min_scalar_loop_bound;
1289 unsigned int th;
1290 bool only_slp_in_loop = true, ok;
1291 HOST_WIDE_INT max_niter;
1292 HOST_WIDE_INT estimated_niter;
1293 int min_profitable_estimate;
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "=== vect_analyze_loop_operations ===");
1299 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1300 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1301 if (slp)
1303 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1304 the vectorization factor of the loop is the unrolling factor required by
1305 the SLP instances. If that unrolling factor is 1, we say that we
1306 perform pure SLP on the loop - cross-iteration parallelism is not
1307 exploited. */
1308 for (i = 0; i < nbbs; i++)
1310 basic_block bb = bbs[i];
1311 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1313 gimple stmt = gsi_stmt (si);
1314 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1315 gcc_assert (stmt_info);
1316 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1317 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1318 && !PURE_SLP_STMT (stmt_info))
1319 /* STMT needs both SLP and loop-based vectorization. */
1320 only_slp_in_loop = false;
1324 if (only_slp_in_loop)
1325 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1326 else
1327 vectorization_factor = least_common_multiple (vectorization_factor,
1328 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1330 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1331 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "Updating vectorization factor to %d ",
1334 vectorization_factor);
1337 for (i = 0; i < nbbs; i++)
1339 basic_block bb = bbs[i];
1341 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1343 phi = gsi_stmt (si);
1344 ok = true;
1346 stmt_info = vinfo_for_stmt (phi);
1347 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1350 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1353 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1354 (i.e., a phi in the tail of the outer-loop). */
1355 if (! is_loop_header_bb_p (bb))
1357 /* FORNOW: we currently don't support the case that these phis
1358 are not used in the outer-loop (unless it is a double reduction,
1359 i.e., this phi is vect_reduction_def), because this case
1360 requires us to actually do something here.
1361 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1362 || STMT_VINFO_LIVE_P (stmt_info))
1363 && STMT_VINFO_DEF_TYPE (stmt_info)
1364 != vect_double_reduction_def)
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 "Unsupported loop-closed phi in "
1369 "outer-loop.");
1370 return false;
1373 /* If PHI is used in the outer loop, we check that its operand
1374 is defined in the inner loop. */
1375 if (STMT_VINFO_RELEVANT_P (stmt_info))
1377 tree phi_op;
1378 gimple op_def_stmt;
1380 if (gimple_phi_num_args (phi) != 1)
1381 return false;
1383 phi_op = PHI_ARG_DEF (phi, 0);
1384 if (TREE_CODE (phi_op) != SSA_NAME)
1385 return false;
1387 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1388 if (!op_def_stmt
1389 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1390 || !vinfo_for_stmt (op_def_stmt))
1391 return false;
1393 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1394 != vect_used_in_outer
1395 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1396 != vect_used_in_outer_by_reduction)
1397 return false;
1400 continue;
1403 gcc_assert (stmt_info);
1405 if (STMT_VINFO_LIVE_P (stmt_info))
1407 /* FORNOW: not yet supported. */
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "not vectorized: value used after loop.");
1411 return false;
1414 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1415 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1417 /* A scalar-dependence cycle that we don't support. */
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: scalar dependence cycle.");
1421 return false;
1424 if (STMT_VINFO_RELEVANT_P (stmt_info))
1426 need_to_vectorize = true;
1427 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1428 ok = vectorizable_induction (phi, NULL, NULL);
1431 if (!ok)
1433 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: relevant phi not "
1437 "supported: ");
1438 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1440 return false;
1444 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1446 gimple stmt = gsi_stmt (si);
1447 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1448 return false;
1450 } /* bbs */
1452 /* All operations in the loop are either irrelevant (they deal with loop
1453 control, or are dead), or only used outside the loop and can be moved
1454 out of the loop (e.g. invariants, inductions). The loop can be
1455 optimized away by scalar optimizations. We're better off not
1456 touching this loop. */
1457 if (!need_to_vectorize)
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "All the computation can be taken out of the loop.");
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: redundant loop. no profit to "
1465 "vectorize.");
1466 return false;
1469 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1470 dump_printf_loc (MSG_NOTE, vect_location,
1471 "vectorization_factor = %d, niters = "
1472 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1473 LOOP_VINFO_INT_NITERS (loop_vinfo));
1475 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1476 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1477 || ((max_niter = max_stmt_executions_int (loop)) != -1
1478 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1480 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1482 "not vectorized: iteration count too small.");
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: iteration count smaller than "
1486 "vectorization factor.");
1487 return false;
1490 /* Analyze cost. Decide if worth while to vectorize. */
1492 /* Once VF is set, SLP costs should be updated since the number of created
1493 vector stmts depends on VF. */
1494 vect_update_slp_costs_according_to_vf (loop_vinfo);
1496 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1497 &min_profitable_estimate);
1498 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1500 if (min_profitable_iters < 0)
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: vectorization not profitable.");
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: vector version will never be "
1508 "profitable.");
1509 return false;
1512 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1513 * vectorization_factor) - 1);
1516 /* Use the cost model only if it is more conservative than the user-specified
1517 threshold. */
1519 th = (unsigned) min_scalar_loop_bound;
1520 if (min_profitable_iters
1521 && (!min_scalar_loop_bound
1522 || min_profitable_iters > min_scalar_loop_bound))
1523 th = (unsigned) min_profitable_iters;
1525 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1526 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: vectorization not profitable.");
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_NOTE, vect_location,
1533 "not vectorized: iteration count smaller than user "
1534 "specified loop bound parameter or minimum profitable "
1535 "iterations (whichever is more conservative).");
1536 return false;
1539 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1540 && ((unsigned HOST_WIDE_INT) estimated_niter
1541 <= MAX (th, (unsigned)min_profitable_estimate)))
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: estimated iteration count too "
1546 "small.");
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location,
1549 "not vectorized: estimated iteration count smaller "
1550 "than specified loop bound parameter or minimum "
1551 "profitable iterations (whichever is more "
1552 "conservative).");
1553 return false;
1556 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1557 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1558 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1562 if (!vect_can_advance_ivs_p (loop_vinfo))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "not vectorized: can't create epilog loop 1.");
1567 return false;
1569 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573 "not vectorized: can't create epilog loop 2.");
1574 return false;
1578 return true;
1582 /* Function vect_analyze_loop_2.
1584 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1585 for it. The different analyses will record information in the
1586 loop_vec_info struct. */
1587 static bool
1588 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1590 bool ok, slp = false;
1591 int max_vf = MAX_VECTORIZATION_FACTOR;
1592 int min_vf = 2;
1594 /* Find all data references in the loop (which correspond to vdefs/vuses)
1595 and analyze their evolution in the loop. Also adjust the minimal
1596 vectorization factor according to the loads and stores.
1598 FORNOW: Handle only simple array references, whose
1599 alignment can be forced, and aligned pointer-references. */
1601 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1602 if (!ok)
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data references.");
1607 return false;
1610 /* Classify all cross-iteration scalar data-flow cycles.
1611 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1613 vect_analyze_scalar_cycles (loop_vinfo);
1615 vect_pattern_recog (loop_vinfo, NULL);
1617 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1619 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1620 if (!ok)
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "unexpected pattern.");
1625 return false;
1628 /* Analyze data dependences between the data-refs in the loop
1629 and adjust the maximum vectorization factor according to
1630 the dependences.
1631 FORNOW: fail at the first data dependence that we encounter. */
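/* For example (illustrative only): in "for (i = 0; i < n; i++)
   a[i + 4] = a[i] * 2;" the value stored in iteration I is read again in
   iteration I + 4, giving a dependence distance of 4, which would cap
   MAX_VF at 4 for this loop.  */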
1633 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1634 if (!ok
1635 || max_vf < min_vf)
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "bad data dependence.");
1640 return false;
1643 ok = vect_determine_vectorization_factor (loop_vinfo);
1644 if (!ok)
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648 "can't determine vectorization factor.");
1649 return false;
1651 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "bad data dependence.");
1656 return false;
1659 /* Analyze the alignment of the data-refs in the loop.
1660 Fail if a data reference is found that cannot be vectorized. */
1662 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1663 if (!ok)
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data alignment.");
1668 return false;
1671 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1672 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1674 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1675 if (!ok)
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "bad data access.");
1680 return false;
1683 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1684 It is important to call pruning after vect_analyze_data_ref_accesses,
1685 since we use grouping information gathered by interleaving analysis. */
1686 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1687 if (!ok)
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "too long list of versioning for alias "
1692 "run-time tests.");
1693 return false;
1696 /* This pass will decide on using loop versioning and/or loop peeling in
1697 order to enhance the alignment of data references in the loop. */
1699 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1700 if (!ok)
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "bad data alignment.");
1705 return false;
1708 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1709 ok = vect_analyze_slp (loop_vinfo, NULL);
1710 if (ok)
1712 /* Decide which possible SLP instances to SLP. */
1713 slp = vect_make_slp_decision (loop_vinfo);
1715 /* Find stmts that need to be both vectorized and SLPed. */
1716 vect_detect_hybrid_slp (loop_vinfo);
1718 else
1719 return false;
1721 /* Scan all the operations in the loop and make sure they are
1722 vectorizable. */
1724 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1725 if (!ok)
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "bad operation or unsupported loop bound.");
1730 return false;
1733 return true;
1736 /* Function vect_analyze_loop.
1738 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1739 for it. The different analyses will record information in the
1740 loop_vec_info struct. */
1741 loop_vec_info
1742 vect_analyze_loop (struct loop *loop)
1744 loop_vec_info loop_vinfo;
1745 unsigned int vector_sizes;
1747 /* Autodetect first vector size we try. */
1748 current_vector_size = 0;
1749 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "===== analyze_loop_nest =====");
1755 if (loop_outer (loop)
1756 && loop_vec_info_for_loop (loop_outer (loop))
1757 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE, vect_location,
1761 "outer-loop already vectorized.");
1762 return NULL;
1765 while (1)
1767 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1768 loop_vinfo = vect_analyze_loop_form (loop);
1769 if (!loop_vinfo)
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "bad loop form.");
1774 return NULL;
1777 if (vect_analyze_loop_2 (loop_vinfo))
1779 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1781 return loop_vinfo;
1784 destroy_loop_vec_info (loop_vinfo, true);
1786 vector_sizes &= ~current_vector_size;
1787 if (vector_sizes == 0
1788 || current_vector_size == 0)
1789 return NULL;
1791 /* Try the next biggest vector size. */
1792 current_vector_size = 1 << floor_log2 (vector_sizes);
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "***** Re-trying analysis with "
1796 "vector size %d\n", current_vector_size);
1801 /* Function reduction_code_for_scalar_code
1803 Input:
1804 CODE - tree_code of a reduction operation.
1806 Output:
1807 REDUC_CODE - the corresponding tree-code to be used to reduce the
1808 vector of partial results into a single scalar result (which
1809 will also reside in a vector) or ERROR_MARK if the operation is
1810 a supported reduction operation, but does not have such a tree-code.
1812 Return FALSE if CODE currently cannot be vectorized as reduction. */
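/* For instance, a sum reduction built with PLUS_EXPR maps to
   REDUC_PLUS_EXPR, which sums the elements of the vector of partial
   results in the loop epilogue; MINUS_EXPR is accepted (the function
   returns true) but yields ERROR_MARK, so the epilogue must reduce the
   partial results without a dedicated tree code.  */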
1814 static bool
1815 reduction_code_for_scalar_code (enum tree_code code,
1816 enum tree_code *reduc_code)
1818 switch (code)
1820 case MAX_EXPR:
1821 *reduc_code = REDUC_MAX_EXPR;
1822 return true;
1824 case MIN_EXPR:
1825 *reduc_code = REDUC_MIN_EXPR;
1826 return true;
1828 case PLUS_EXPR:
1829 *reduc_code = REDUC_PLUS_EXPR;
1830 return true;
1832 case MULT_EXPR:
1833 case MINUS_EXPR:
1834 case BIT_IOR_EXPR:
1835 case BIT_XOR_EXPR:
1836 case BIT_AND_EXPR:
1837 *reduc_code = ERROR_MARK;
1838 return true;
1840 default:
1841 return false;
1846 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1847 STMT is printed with a message MSG. */
1849 static void
1850 report_vect_op (int msg_type, gimple stmt, const char *msg)
1852 dump_printf_loc (msg_type, vect_location, "%s", msg);
1853 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1857 /* Detect SLP reduction of the form:
1859 #a1 = phi <a5, a0>
1860 a2 = operation (a1)
1861 a3 = operation (a2)
1862 a4 = operation (a3)
1863 a5 = operation (a4)
1865 #a = phi <a5>
1867 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1868 FIRST_STMT is the first reduction stmt in the chain
1869 (a2 = operation (a1)).
1871 Return TRUE if a reduction chain was detected. */
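/* As a hedged illustration (assumed source, not from this file), such a
   chain typically arises from a manually unrolled accumulation:

     for (i = 0; i < n; i++)
       {
         sum += a[4 * i];
         sum += a[4 * i + 1];
         sum += a[4 * i + 2];
         sum += a[4 * i + 3];
       }

   where each addition feeds the next and only the last feeds the
   loop-closing phi.  */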
1873 static bool
1874 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1876 struct loop *loop = (gimple_bb (phi))->loop_father;
1877 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1878 enum tree_code code;
1879 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1880 stmt_vec_info use_stmt_info, current_stmt_info;
1881 tree lhs;
1882 imm_use_iterator imm_iter;
1883 use_operand_p use_p;
1884 int nloop_uses, size = 0, n_out_of_loop_uses;
1885 bool found = false;
1887 if (loop != vect_loop)
1888 return false;
1890 lhs = PHI_RESULT (phi);
1891 code = gimple_assign_rhs_code (first_stmt);
1892 while (1)
1894 nloop_uses = 0;
1895 n_out_of_loop_uses = 0;
1896 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1898 gimple use_stmt = USE_STMT (use_p);
1899 if (is_gimple_debug (use_stmt))
1900 continue;
1902 use_stmt = USE_STMT (use_p);
1904 /* Check if we got back to the reduction phi. */
1905 if (use_stmt == phi)
1907 loop_use_stmt = use_stmt;
1908 found = true;
1909 break;
1912 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1914 if (vinfo_for_stmt (use_stmt)
1915 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1917 loop_use_stmt = use_stmt;
1918 nloop_uses++;
1921 else
1922 n_out_of_loop_uses++;
1924 /* There can be either a single use in the loop or two uses in
1925 phi nodes. */
1926 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1927 return false;
1930 if (found)
1931 break;
1933 /* We reached a statement with no loop uses. */
1934 if (nloop_uses == 0)
1935 return false;
1937 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1938 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1939 return false;
1941 if (!is_gimple_assign (loop_use_stmt)
1942 || code != gimple_assign_rhs_code (loop_use_stmt)
1943 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1944 return false;
1946 /* Insert USE_STMT into reduction chain. */
1947 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1948 if (current_stmt)
1950 current_stmt_info = vinfo_for_stmt (current_stmt);
1951 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1952 GROUP_FIRST_ELEMENT (use_stmt_info)
1953 = GROUP_FIRST_ELEMENT (current_stmt_info);
1955 else
1956 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1958 lhs = gimple_assign_lhs (loop_use_stmt);
1959 current_stmt = loop_use_stmt;
1960 size++;
1963 if (!found || loop_use_stmt != phi || size < 2)
1964 return false;
1966 /* Swap the operands, if needed, to make the reduction operand be the second
1967 operand. */
1968 lhs = PHI_RESULT (phi);
1969 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1970 while (next_stmt)
1972 if (gimple_assign_rhs2 (next_stmt) == lhs)
1974 tree op = gimple_assign_rhs1 (next_stmt);
1975 gimple def_stmt = NULL;
1977 if (TREE_CODE (op) == SSA_NAME)
1978 def_stmt = SSA_NAME_DEF_STMT (op);
1980 /* Check that the other def is either defined in the loop
1981 ("vect_internal_def"), or it's an induction (defined by a
1982 loop-header phi-node). */
1983 if (def_stmt
1984 && gimple_bb (def_stmt)
1985 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1986 && (is_gimple_assign (def_stmt)
1987 || is_gimple_call (def_stmt)
1988 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1989 == vect_induction_def
1990 || (gimple_code (def_stmt) == GIMPLE_PHI
1991 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1992 == vect_internal_def
1993 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1995 lhs = gimple_assign_lhs (next_stmt);
1996 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1997 continue;
2000 return false;
2002 else
2004 tree op = gimple_assign_rhs2 (next_stmt);
2005 gimple def_stmt = NULL;
2007 if (TREE_CODE (op) == SSA_NAME)
2008 def_stmt = SSA_NAME_DEF_STMT (op);
2010 /* Check that the other def is either defined in the loop
2011 ("vect_internal_def"), or it's an induction (defined by a
2012 loop-header phi-node). */
2013 if (def_stmt
2014 && gimple_bb (def_stmt)
2015 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2016 && (is_gimple_assign (def_stmt)
2017 || is_gimple_call (def_stmt)
2018 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2019 == vect_induction_def
2020 || (gimple_code (def_stmt) == GIMPLE_PHI
2021 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2022 == vect_internal_def
2023 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2025 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2028 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2031 swap_tree_operands (next_stmt,
2032 gimple_assign_rhs1_ptr (next_stmt),
2033 gimple_assign_rhs2_ptr (next_stmt));
2034 update_stmt (next_stmt);
2036 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2037 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2039 else
2040 return false;
2043 lhs = gimple_assign_lhs (next_stmt);
2044 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2047 /* Save the chain for further analysis in SLP detection. */
2048 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2049 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
2050 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2052 return true;
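/* Illustrative sketch (an assumed source-level shape, not taken from the
   sources above): the chain detected here corresponds to code such as

     int s = 0;
     for (i = 0; i < n; i++)
       {
         s += a[4 * i];
         s += a[4 * i + 1];
         s += a[4 * i + 2];
         s += a[4 * i + 3];
       }

   Each statement uses the result of the previous one exactly once inside the
   loop, so GROUP_FIRST_ELEMENT/GROUP_NEXT_ELEMENT link the four statements
   into a reduction chain with GROUP_SIZE 4 for later SLP analysis.  */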
2056 /* Function vect_is_simple_reduction_1
2058 (1) Detect a cross-iteration def-use cycle that represents a simple
2059 reduction computation. We look for the following pattern:
2061 loop_header:
2062 a1 = phi < a0, a2 >
2063 a3 = ...
2064 a2 = operation (a3, a1)
2066 such that:
2067 1. operation is commutative and associative and it is safe to
2068 change the order of the computation (if CHECK_REDUCTION is true)
2069 2. no uses for a2 in the loop (a2 is used out of the loop)
2070 3. no uses of a1 in the loop besides the reduction operation
2071 4. no uses of a1 outside the loop.
2073 Conditions 1,4 are tested here.
2074 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2076 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2077 nested cycles, if CHECK_REDUCTION is false.
2079 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2080 reductions:
2082 a1 = phi < a0, a2 >
2083 inner loop (def of a3)
2084 a2 = phi < a3 >
2086 If MODIFY is true it tries also to rework the code in-place to enable
2087 detection of more reduction patterns. For the time being we rewrite
2088 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2091 static gimple
2092 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2093 bool check_reduction, bool *double_reduc,
2094 bool modify)
2096 struct loop *loop = (gimple_bb (phi))->loop_father;
2097 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2098 edge latch_e = loop_latch_edge (loop);
2099 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2100 gimple def_stmt, def1 = NULL, def2 = NULL;
2101 enum tree_code orig_code, code;
2102 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2103 tree type;
2104 int nloop_uses;
2105 tree name;
2106 imm_use_iterator imm_iter;
2107 use_operand_p use_p;
2108 bool phi_def;
2110 *double_reduc = false;
2112 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2113 otherwise, we assume outer loop vectorization. */
2114 gcc_assert ((check_reduction && loop == vect_loop)
2115 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2117 name = PHI_RESULT (phi);
2118 nloop_uses = 0;
2119 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2121 gimple use_stmt = USE_STMT (use_p);
2122 if (is_gimple_debug (use_stmt))
2123 continue;
2125 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "intermediate value used outside loop.");
2131 return NULL;
2134 if (vinfo_for_stmt (use_stmt)
2135 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2136 nloop_uses++;
2137 if (nloop_uses > 1)
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "reduction used in loop.");
2142 return NULL;
2146 if (TREE_CODE (loop_arg) != SSA_NAME)
2148 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "reduction: not ssa_name: ");
2152 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2154 return NULL;
2157 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2158 if (!def_stmt)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "reduction: no def_stmt.");
2163 return NULL;
2166 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2168 if (dump_enabled_p ())
2169 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2170 return NULL;
2173 if (is_gimple_assign (def_stmt))
2175 name = gimple_assign_lhs (def_stmt);
2176 phi_def = false;
2178 else
2180 name = PHI_RESULT (def_stmt);
2181 phi_def = true;
2184 nloop_uses = 0;
2185 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2187 gimple use_stmt = USE_STMT (use_p);
2188 if (is_gimple_debug (use_stmt))
2189 continue;
2190 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2191 && vinfo_for_stmt (use_stmt)
2192 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2193 nloop_uses++;
2194 if (nloop_uses > 1)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "reduction used in loop.");
2199 return NULL;
2203 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2204 defined in the inner loop. */
2205 if (phi_def)
2207 op1 = PHI_ARG_DEF (def_stmt, 0);
2209 if (gimple_phi_num_args (def_stmt) != 1
2210 || TREE_CODE (op1) != SSA_NAME)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "unsupported phi node definition.");
2216 return NULL;
2219 def1 = SSA_NAME_DEF_STMT (op1);
2220 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2221 && loop->inner
2222 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2223 && is_gimple_assign (def1))
2225 if (dump_enabled_p ())
2226 report_vect_op (MSG_NOTE, def_stmt,
2227 "detected double reduction: ");
2229 *double_reduc = true;
2230 return def_stmt;
2233 return NULL;
2236 code = orig_code = gimple_assign_rhs_code (def_stmt);
2238 /* We can handle "res -= x[i]", which is non-associative by
2239 simply rewriting this into "res += -x[i]". Avoid changing
2240 gimple instruction for the first simple tests and only do this
2241 if we're allowed to change code at all. */
2242 if (code == MINUS_EXPR
2243 && modify
2244 && (op1 = gimple_assign_rhs1 (def_stmt))
2245 && TREE_CODE (op1) == SSA_NAME
2246 && SSA_NAME_DEF_STMT (op1) == phi)
2247 code = PLUS_EXPR;
2249 if (check_reduction
2250 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2252 if (dump_enabled_p ())
2253 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2254 "reduction: not commutative/associative: ");
2255 return NULL;
2258 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2260 if (code != COND_EXPR)
2262 if (dump_enabled_p ())
2263 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2264 "reduction: not binary operation: ");
2266 return NULL;
2269 op3 = gimple_assign_rhs1 (def_stmt);
2270 if (COMPARISON_CLASS_P (op3))
2272 op4 = TREE_OPERAND (op3, 1);
2273 op3 = TREE_OPERAND (op3, 0);
2276 op1 = gimple_assign_rhs2 (def_stmt);
2277 op2 = gimple_assign_rhs3 (def_stmt);
2279 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2281 if (dump_enabled_p ())
2282 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2283 "reduction: uses not ssa_names: ");
2285 return NULL;
2288 else
2290 op1 = gimple_assign_rhs1 (def_stmt);
2291 op2 = gimple_assign_rhs2 (def_stmt);
2293 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2295 if (dump_enabled_p ())
2296 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2297 "reduction: uses not ssa_names: ");
2299 return NULL;
2303 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2304 if ((TREE_CODE (op1) == SSA_NAME
2305 && !types_compatible_p (type,TREE_TYPE (op1)))
2306 || (TREE_CODE (op2) == SSA_NAME
2307 && !types_compatible_p (type, TREE_TYPE (op2)))
2308 || (op3 && TREE_CODE (op3) == SSA_NAME
2309 && !types_compatible_p (type, TREE_TYPE (op3)))
2310 || (op4 && TREE_CODE (op4) == SSA_NAME
2311 && !types_compatible_p (type, TREE_TYPE (op4))))
2313 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "reduction: multiple types: operation type: ");
2317 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2318 dump_printf (MSG_NOTE, ", operands types: ");
2319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2320 TREE_TYPE (op1));
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op2));
2324 if (op3)
2326 dump_printf (MSG_NOTE, ",");
2327 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2328 TREE_TYPE (op3));
2331 if (op4)
2333 dump_printf (MSG_NOTE, ",");
2334 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2335 TREE_TYPE (op4));
2339 return NULL;
2342 /* Check that it's ok to change the order of the computation.
2343 Generally, when vectorizing a reduction we change the order of the
2344 computation. This may change the behavior of the program in some
2345 cases, so we need to check that this is ok. One exception is when
2346 vectorizing an outer-loop: the inner-loop is executed sequentially,
2347 and therefore vectorizing reductions in the inner-loop during
2348 outer-loop vectorization is safe. */
2350 /* CHECKME: check for !flag_finite_math_only too? */
2351 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2352 && check_reduction)
2354 /* Changing the order of operations changes the semantics. */
2355 if (dump_enabled_p ())
2356 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2357 "reduction: unsafe fp math optimization: ");
2358 return NULL;
2360 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2361 && check_reduction)
2363 /* Changing the order of operations changes the semantics. */
2364 if (dump_enabled_p ())
2365 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2366 "reduction: unsafe int math optimization: ");
2367 return NULL;
2369 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2371 /* Changing the order of operations changes the semantics. */
2372 if (dump_enabled_p ())
2373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374 "reduction: unsafe fixed-point math optimization: ");
2375 return NULL;
2378 /* If we detected "res -= x[i]" earlier, rewrite it into
2379 "res += -x[i]" now. If this turns out to be useless reassoc
2380 will clean it up again. */
2381 if (orig_code == MINUS_EXPR)
2383 tree rhs = gimple_assign_rhs2 (def_stmt);
2384 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2385 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2386 rhs, NULL);
2387 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2388 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2389 loop_info, NULL));
2390 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2391 gimple_assign_set_rhs2 (def_stmt, negrhs);
2392 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2393 update_stmt (def_stmt);
2396 /* Reduction is safe. We're dealing with one of the following:
2397 1) integer arithmetic and no trapv
2398 2) floating point arithmetic, and special flags permit this optimization
2399 3) nested cycle (i.e., outer loop vectorization). */
2400 if (TREE_CODE (op1) == SSA_NAME)
2401 def1 = SSA_NAME_DEF_STMT (op1);
2403 if (TREE_CODE (op2) == SSA_NAME)
2404 def2 = SSA_NAME_DEF_STMT (op2);
2406 if (code != COND_EXPR
2407 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2409 if (dump_enabled_p ())
2410 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2411 return NULL;
2414 /* Check that one def is the reduction def, defined by PHI,
2415 the other def is either defined in the loop ("vect_internal_def"),
2416 or it's an induction (defined by a loop-header phi-node). */
2418 if (def2 && def2 == phi
2419 && (code == COND_EXPR
2420 || !def1 || gimple_nop_p (def1)
2421 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2422 && (is_gimple_assign (def1)
2423 || is_gimple_call (def1)
2424 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2425 == vect_induction_def
2426 || (gimple_code (def1) == GIMPLE_PHI
2427 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2428 == vect_internal_def
2429 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2433 return def_stmt;
2436 if (def1 && def1 == phi
2437 && (code == COND_EXPR
2438 || !def2 || gimple_nop_p (def2)
2439 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2440 && (is_gimple_assign (def2)
2441 || is_gimple_call (def2)
2442 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2443 == vect_induction_def
2444 || (gimple_code (def2) == GIMPLE_PHI
2445 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2446 == vect_internal_def
2447 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2449 if (check_reduction)
2451 /* Swap operands (just for simplicity - so that the rest of the code
2452 can assume that the reduction variable is always the last (second)
2453 argument). */
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt,
2456 "detected reduction: need to swap operands: ");
2458 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2459 gimple_assign_rhs2_ptr (def_stmt));
2461 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2462 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2464 else
2466 if (dump_enabled_p ())
2467 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2470 return def_stmt;
2473 /* Try to find SLP reduction chain. */
2474 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2476 if (dump_enabled_p ())
2477 report_vect_op (MSG_NOTE, def_stmt,
2478 "reduction: detected reduction chain: ");
2480 return def_stmt;
2483 if (dump_enabled_p ())
2484 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2485 "reduction: unknown pattern: ");
2487 return NULL;
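/* Illustrative sketch (a hypothetical input, following the MODIFY path of the
   function above): for

     s = init;
     for (i = 0; i < n; i++)
       s -= a[i];

   the latch definition is "s_2 = s_1 - a_i" with s_1 defined by PHI.  With
   MODIFY true, MINUS_EXPR is treated as PLUS_EXPR for the associativity
   checks, and the statement is then rewritten to "s_2 = s_1 + (-a_i)" by
   inserting a NEGATE_EXPR statement, so the loop is still recognized as a
   plain sum reduction.  */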
2490 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2491 in-place. Arguments as there. */
2493 static gimple
2494 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, false);
2501 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2502 in-place if it enables detection of more reductions. Arguments
2503 as there. */
2505 gimple
2506 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2507 bool check_reduction, bool *double_reduc)
2509 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2510 double_reduc, true);
2513 /* Calculate the cost of one scalar iteration of the loop. */
2515 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2519 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2520 int innerloop_iters, i, stmt_cost;
2522 /* Count statements in scalar loop. Using this as scalar cost for a single
2523 iteration for now.
2525 TODO: Add outer loop support.
2527 TODO: Consider assigning different costs to different scalar
2528 statements. */
2530 /* FORNOW. */
2531 innerloop_iters = 1;
2532 if (loop->inner)
2533 innerloop_iters = 50; /* FIXME */
2535 for (i = 0; i < nbbs; i++)
2537 gimple_stmt_iterator si;
2538 basic_block bb = bbs[i];
2540 if (bb->loop_father == loop->inner)
2541 factor = innerloop_iters;
2542 else
2543 factor = 1;
2545 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2547 gimple stmt = gsi_stmt (si);
2548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2550 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2551 continue;
2553 /* Skip stmts that are not vectorized inside the loop. */
2554 if (stmt_info
2555 && !STMT_VINFO_RELEVANT_P (stmt_info)
2556 && (!STMT_VINFO_LIVE_P (stmt_info)
2557 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2558 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2559 continue;
2561 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2563 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2564 stmt_cost = vect_get_stmt_cost (scalar_load);
2565 else
2566 stmt_cost = vect_get_stmt_cost (scalar_store);
2568 else
2569 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2571 scalar_single_iter_cost += stmt_cost * factor;
2574 return scalar_single_iter_cost;
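/* Worked example (hypothetical numbers, assuming the default target cost
   hooks return 1 for scalar_load, scalar_stmt and scalar_store): a body of
   two loads, one addition and one store costs 2*1 + 1 + 1 = 4 per scalar
   iteration.  Statements in a nested inner loop would each be weighted by
   the FIXME factor of 50 above.  */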
2577 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2579 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2580 int *peel_iters_epilogue,
2581 int scalar_single_iter_cost,
2582 stmt_vector_for_cost *prologue_cost_vec,
2583 stmt_vector_for_cost *epilogue_cost_vec)
2585 int retval = 0;
2586 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2588 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2590 *peel_iters_epilogue = vf/2;
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "cost model: epilogue peel iters set to vf/2 "
2594 "because loop iterations are unknown .");
2596 /* If peeled iterations are known but number of scalar loop
2597 iterations are unknown, count a taken branch per peeled loop. */
2598 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2599 NULL, 0, vect_prologue);
2601 else
2603 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2604 peel_iters_prologue = niters < peel_iters_prologue ?
2605 niters : peel_iters_prologue;
2606 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2607 /* If we need to peel for gaps, but no peeling is required, we have to
2608 peel VF iterations. */
2609 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2610 *peel_iters_epilogue = vf;
2613 if (peel_iters_prologue)
2614 retval += record_stmt_cost (prologue_cost_vec,
2615 peel_iters_prologue * scalar_single_iter_cost,
2616 scalar_stmt, NULL, 0, vect_prologue);
2617 if (*peel_iters_epilogue)
2618 retval += record_stmt_cost (epilogue_cost_vec,
2619 *peel_iters_epilogue * scalar_single_iter_cost,
2620 scalar_stmt, NULL, 0, vect_epilogue);
2621 return retval;
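/* Worked example (hypothetical numbers): with LOOP_VINFO_INT_NITERS == 100,
   vf == 4 and PEEL_ITERS_PROLOGUE == 3, the code above computes
   *PEEL_ITERS_EPILOGUE = (100 - 3) % 4 = 1 and records 3 and 1 scalar
   iterations' worth of statement cost in PROLOGUE_COST_VEC and
   EPILOGUE_COST_VEC respectively.  */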
2624 /* Function vect_estimate_min_profitable_iters
2626 Return the number of iterations required for the vector version of the
2627 loop to be profitable relative to the cost of the scalar version of the
2628 loop. */
2630 static void
2631 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2632 int *ret_min_profitable_niters,
2633 int *ret_min_profitable_estimate)
2635 int min_profitable_iters;
2636 int min_profitable_estimate;
2637 int peel_iters_prologue;
2638 int peel_iters_epilogue;
2639 unsigned vec_inside_cost = 0;
2640 int vec_outside_cost = 0;
2641 unsigned vec_prologue_cost = 0;
2642 unsigned vec_epilogue_cost = 0;
2643 int scalar_single_iter_cost = 0;
2644 int scalar_outside_cost = 0;
2645 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2646 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2649 /* Cost model disabled. */
2650 if (!flag_vect_cost_model)
2652 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2653 *ret_min_profitable_niters = 0;
2654 *ret_min_profitable_estimate = 0;
2655 return;
2658 /* Requires loop versioning tests to handle misalignment. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = VEC_length (gimple,
2663 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2664 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2665 vect_prologue);
2666 dump_printf (MSG_NOTE,
2667 "cost model: Adding cost of checks for loop "
2668 "versioning to treat misalignment.\n");
2671 /* Requires loop versioning with alias checks. */
2672 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2674 /* FIXME: Make cost depend on complexity of individual check. */
2675 unsigned len = VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2676 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2677 vect_prologue);
2678 dump_printf (MSG_NOTE,
2679 "cost model: Adding cost of checks for loop "
2680 "versioning aliasing.\n");
2683 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2684 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2685 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2686 vect_prologue);
2688 /* Count statements in scalar loop. Using this as scalar cost for a single
2689 iteration for now.
2691 TODO: Add outer loop support.
2693 TODO: Consider assigning different costs to different scalar
2694 statements. */
2696 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2698 /* Add additional cost for the peeled instructions in prologue and epilogue
2699 loop.
2701 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2702 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2704 TODO: Build an expression that represents peel_iters for prologue and
2705 epilogue to be used in a run-time test. */
2707 if (npeel < 0)
2709 peel_iters_prologue = vf/2;
2710 dump_printf (MSG_NOTE, "cost model: "
2711 "prologue peel iters set to vf/2.");
2713 /* If peeling for alignment is unknown, the loop bound of the main loop
2714 becomes unknown. */
2715 peel_iters_epilogue = vf/2;
2716 dump_printf (MSG_NOTE, "cost model: "
2717 "epilogue peel iters set to vf/2 because "
2718 "peeling for alignment is unknown.");
2720 /* If peeled iterations are unknown, count a taken branch and a not taken
2721 branch per peeled loop. Even if scalar loop iterations are known,
2722 vector iterations are not known since peeled prologue iterations are
2723 not known. Hence guards remain the same. */
2724 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2725 NULL, 0, vect_prologue);
2726 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2727 NULL, 0, vect_prologue);
2728 /* FORNOW: Don't attempt to pass individual scalar instructions to
2729 the model; just assume linear cost for scalar iterations. */
2730 (void) add_stmt_cost (target_cost_data,
2731 peel_iters_prologue * scalar_single_iter_cost,
2732 scalar_stmt, NULL, 0, vect_prologue);
2733 (void) add_stmt_cost (target_cost_data,
2734 peel_iters_epilogue * scalar_single_iter_cost,
2735 scalar_stmt, NULL, 0, vect_epilogue);
2737 else
2739 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2740 stmt_info_for_cost *si;
2741 int j;
2742 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2744 prologue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
2745 epilogue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
2746 peel_iters_prologue = npeel;
2748 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2749 &peel_iters_epilogue,
2750 scalar_single_iter_cost,
2751 &prologue_cost_vec,
2752 &epilogue_cost_vec);
2754 FOR_EACH_VEC_ELT (stmt_info_for_cost, prologue_cost_vec, j, si)
2756 struct _stmt_vec_info *stmt_info
2757 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2758 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2759 si->misalign, vect_prologue);
2762 FOR_EACH_VEC_ELT (stmt_info_for_cost, epilogue_cost_vec, j, si)
2764 struct _stmt_vec_info *stmt_info
2765 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2766 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2767 si->misalign, vect_epilogue);
2770 VEC_free (stmt_info_for_cost, heap, prologue_cost_vec);
2771 VEC_free (stmt_info_for_cost, heap, epilogue_cost_vec);
2774 /* FORNOW: The scalar outside cost is incremented in one of the
2775 following ways:
2777 1. The vectorizer checks for alignment and aliasing and generates
2778 a condition that allows dynamic vectorization. A cost model
2779 check is ANDED with the versioning condition. Hence scalar code
2780 path now has the added cost of the versioning check.
2782 if (cost > th & versioning_check)
2783 jmp to vector code
2785 Hence run-time scalar is incremented by not-taken branch cost.
2787 2. The vectorizer then checks if a prologue is required. If the
2788 cost model check was not done before during versioning, it has to
2789 be done before the prologue check.
2791 if (cost <= th)
2792 prologue = scalar_iters
2793 if (prologue == 0)
2794 jmp to vector code
2795 else
2796 execute prologue
2797 if (prologue == num_iters)
2798 go to exit
2800 Hence the run-time scalar cost is incremented by a taken branch,
2801 plus a not-taken branch, plus a taken branch cost.
2803 3. The vectorizer then checks if an epilogue is required. If the
2804 cost model check was not done before during prologue check, it
2805 has to be done with the epilogue check.
2807 if (prologue == 0)
2808 jmp to vector code
2809 else
2810 execute prologue
2811 if (prologue == num_iters)
2812 go to exit
2813 vector code:
2814 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2815 jmp to epilogue
2817 Hence the run-time scalar cost should be incremented by 2 taken
2818 branches.
2820 TODO: The back end may reorder the BBS's differently and reverse
2821 conditions/branch directions. Change the estimates below to
2822 something more reasonable. */
2824 /* If the number of iterations is known and we do not do versioning, we can
2825 decide whether to vectorize at compile time. Hence the scalar version
2826 does not carry cost model guard costs.
2827 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2828 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2829 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2831 /* Cost model check occurs at versioning. */
2832 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2833 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2834 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2835 else
2837 /* Cost model check occurs at prologue generation. */
2838 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2839 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2840 + vect_get_stmt_cost (cond_branch_not_taken);
2841 /* Cost model check occurs at epilogue generation. */
2842 else
2843 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2847 /* Complete the target-specific cost calculations. */
2848 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2849 &vec_inside_cost, &vec_epilogue_cost);
2851 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2853 /* Calculate number of iterations required to make the vector version
2854 profitable, relative to the loop bodies only. The following condition
2855 must hold true:
2856 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2857 where
2858 SIC = scalar iteration cost, VIC = vector iteration cost,
2859 VOC = vector outside cost, VF = vectorization factor,
2860 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2861 SOC = scalar outside cost for run time cost model check. */
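/* Worked example (purely hypothetical numbers): with SIC = 4, VIC = 6,
   VOC = 20, SOC = 0, VF = 4 and one prologue and one epilogue peel
   iteration, the formula below gives
   (20 * 4 - 6 * 1 - 6 * 1) / (4 * 4 - 6) = 68 / 10 = 6, which the follow-up
   check bumps to 7 because 4 * 4 * 6 <= 6 * 6 + 20 * 4.  */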
2863 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2865 if (vec_outside_cost <= 0)
2866 min_profitable_iters = 1;
2867 else
2869 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2870 - vec_inside_cost * peel_iters_prologue
2871 - vec_inside_cost * peel_iters_epilogue)
2872 / ((scalar_single_iter_cost * vf)
2873 - vec_inside_cost);
2875 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2876 <= (((int) vec_inside_cost * min_profitable_iters)
2877 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2878 min_profitable_iters++;
2881 /* vector version will never be profitable. */
2882 else
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2886 "cost model: the vector iteration cost = %d "
2887 "divided by the scalar iteration cost = %d "
2888 "is greater or equal to the vectorization factor = %d.",
2889 vec_inside_cost, scalar_single_iter_cost, vf);
2890 *ret_min_profitable_niters = -1;
2891 *ret_min_profitable_estimate = -1;
2892 return;
2895 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2898 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2899 vec_inside_cost);
2900 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2901 vec_prologue_cost);
2902 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2903 vec_epilogue_cost);
2904 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2905 scalar_single_iter_cost);
2906 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2907 scalar_outside_cost);
2908 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2909 vec_outside_cost);
2910 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2911 peel_iters_prologue);
2912 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2913 peel_iters_epilogue);
2914 dump_printf (MSG_NOTE,
2915 " Calculated minimum iters for profitability: %d\n",
2916 min_profitable_iters);
2919 min_profitable_iters =
2920 min_profitable_iters < vf ? vf : min_profitable_iters;
2922 /* Because the condition we create is:
2923 if (niters <= min_profitable_iters)
2924 then skip the vectorized loop. */
2925 min_profitable_iters--;
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 " Runtime profitability threshold = %d\n", min_profitable_iters);
2931 *ret_min_profitable_niters = min_profitable_iters;
2933 /* Calculate number of iterations required to make the vector version
2934 profitable, relative to the loop bodies only.
2936 The non-vectorized variant costs SIC * niters and must win over the vector
2937 variant on the expected loop trip count. The following condition must hold true:
2938 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2940 if (vec_outside_cost <= 0)
2941 min_profitable_estimate = 1;
2942 else
2944 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2945 - vec_inside_cost * peel_iters_prologue
2946 - vec_inside_cost * peel_iters_epilogue)
2947 / ((scalar_single_iter_cost * vf)
2948 - vec_inside_cost);
2950 min_profitable_estimate --;
2951 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2952 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_NOTE, vect_location,
2954 " Static estimate profitability threshold = %d\n",
2955 min_profitable_estimate);
2957 *ret_min_profitable_estimate = min_profitable_estimate;
2961 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2962 functions. Design better to avoid maintenance issues. */
2964 /* Function vect_model_reduction_cost.
2966 Models cost for a reduction operation, including the vector ops
2967 generated within the strip-mine loop, the initial definition before
2968 the loop, and the epilogue code that must be generated. */
2970 static bool
2971 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2972 int ncopies)
2974 int prologue_cost = 0, epilogue_cost = 0;
2975 enum tree_code code;
2976 optab optab;
2977 tree vectype;
2978 gimple stmt, orig_stmt;
2979 tree reduction_op;
2980 enum machine_mode mode;
2981 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2983 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2985 /* Cost of reduction op inside loop. */
2986 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2987 stmt_info, 0, vect_body);
2988 stmt = STMT_VINFO_STMT (stmt_info);
2990 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2992 case GIMPLE_SINGLE_RHS:
2993 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2994 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2995 break;
2996 case GIMPLE_UNARY_RHS:
2997 reduction_op = gimple_assign_rhs1 (stmt);
2998 break;
2999 case GIMPLE_BINARY_RHS:
3000 reduction_op = gimple_assign_rhs2 (stmt);
3001 break;
3002 case GIMPLE_TERNARY_RHS:
3003 reduction_op = gimple_assign_rhs3 (stmt);
3004 break;
3005 default:
3006 gcc_unreachable ();
3009 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3010 if (!vectype)
3012 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015 "unsupported data-type ");
3016 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3017 TREE_TYPE (reduction_op));
3019 return false;
3022 mode = TYPE_MODE (vectype);
3023 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3025 if (!orig_stmt)
3026 orig_stmt = STMT_VINFO_STMT (stmt_info);
3028 code = gimple_assign_rhs_code (orig_stmt);
3030 /* Add in cost for initial definition. */
3031 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3032 stmt_info, 0, vect_prologue);
3034 /* Determine cost of epilogue code.
3036 We have a reduction operator that will reduce the vector in one statement.
3037 Also requires scalar extract. */
3039 if (!nested_in_vect_loop_p (loop, orig_stmt))
3041 if (reduc_code != ERROR_MARK)
3043 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3044 stmt_info, 0, vect_epilogue);
3045 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3046 stmt_info, 0, vect_epilogue);
3048 else
3050 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3051 tree bitsize =
3052 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3053 int element_bitsize = tree_low_cst (bitsize, 1);
3054 int nelements = vec_size_in_bits / element_bitsize;
3056 optab = optab_for_tree_code (code, vectype, optab_default);
3058 /* We have a whole vector shift available. */
3059 if (VECTOR_MODE_P (mode)
3060 && optab_handler (optab, mode) != CODE_FOR_nothing
3061 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3063 /* Final reduction via vector shifts and the reduction operator.
3064 Also requires scalar extract. */
3065 epilogue_cost += add_stmt_cost (target_cost_data,
3066 exact_log2 (nelements) * 2,
3067 vector_stmt, stmt_info, 0,
3068 vect_epilogue);
3069 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3070 vec_to_scalar, stmt_info, 0,
3071 vect_epilogue);
3073 else
3074 /* Use extracts and reduction op for final reduction. For N
3075 elements, we have N extracts and N-1 reduction ops. */
3076 epilogue_cost += add_stmt_cost (target_cost_data,
3077 nelements + nelements - 1,
3078 vector_stmt, stmt_info, 0,
3079 vect_epilogue);
3083 if (dump_enabled_p ())
3084 dump_printf (MSG_NOTE,
3085 "vect_model_reduction_cost: inside_cost = %d, "
3086 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3087 prologue_cost, epilogue_cost);
3089 return true;
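/* Illustrative sketch (hypothetical configuration): for a V4SI sum reduction
   with NCOPIES == 1, not nested in an outer loop, and with REDUC_CODE !=
   ERROR_MARK, the cost recorded above is one vector_stmt in the loop body,
   one scalar_to_vec in the prologue for the initial definition, and one
   vector_stmt plus one vec_to_scalar in the epilogue.  If the target lacked
   the reduction optab but had a whole-vector shift, the epilogue would
   instead cost 2 * log2(4) vector statements plus the final extract.  */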
3093 /* Function vect_model_induction_cost.
3095 Models cost for induction operations. */
3097 static void
3098 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3100 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3101 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3102 unsigned inside_cost, prologue_cost;
3104 /* loop cost for vec_loop. */
3105 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3106 stmt_info, 0, vect_body);
3108 /* prologue cost for vec_init and vec_step. */
3109 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3110 stmt_info, 0, vect_prologue);
3112 if (dump_enabled_p ())
3113 dump_printf_loc (MSG_NOTE, vect_location,
3114 "vect_model_induction_cost: inside_cost = %d, "
3115 "prologue_cost = %d .", inside_cost, prologue_cost);
3119 /* Function get_initial_def_for_induction
3121 Input:
3122 STMT - a stmt that performs an induction operation in the loop.
3123 IV_PHI - the initial value of the induction variable
3125 Output:
3126 Return a vector variable, initialized with the first VF values of
3127 the induction variable. E.g., for an iv with IV_PHI='X' and
3128 evolution S, for a vector of 4 units, we want to return:
3129 [X, X + S, X + 2*S, X + 3*S]. */
3131 static tree
3132 get_initial_def_for_induction (gimple iv_phi)
3134 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3135 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3136 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3137 tree scalar_type;
3138 tree vectype;
3139 int nunits;
3140 edge pe = loop_preheader_edge (loop);
3141 struct loop *iv_loop;
3142 basic_block new_bb;
3143 tree vec, vec_init, vec_step, t;
3144 tree access_fn;
3145 tree new_var;
3146 tree new_name;
3147 gimple init_stmt, induction_phi, new_stmt;
3148 tree induc_def, vec_def, vec_dest;
3149 tree init_expr, step_expr;
3150 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3151 int i;
3152 bool ok;
3153 int ncopies;
3154 tree expr;
3155 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3156 bool nested_in_vect_loop = false;
3157 gimple_seq stmts = NULL;
3158 imm_use_iterator imm_iter;
3159 use_operand_p use_p;
3160 gimple exit_phi;
3161 edge latch_e;
3162 tree loop_arg;
3163 gimple_stmt_iterator si;
3164 basic_block bb = gimple_bb (iv_phi);
3165 tree stepvectype;
3166 tree resvectype;
3168 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3169 if (nested_in_vect_loop_p (loop, iv_phi))
3171 nested_in_vect_loop = true;
3172 iv_loop = loop->inner;
3174 else
3175 iv_loop = loop;
3176 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3178 latch_e = loop_latch_edge (iv_loop);
3179 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3181 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3182 gcc_assert (access_fn);
3183 STRIP_NOPS (access_fn);
3184 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3185 &init_expr, &step_expr);
3186 gcc_assert (ok);
3187 pe = loop_preheader_edge (iv_loop);
3189 scalar_type = TREE_TYPE (init_expr);
3190 vectype = get_vectype_for_scalar_type (scalar_type);
3191 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3192 gcc_assert (vectype);
3193 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3194 ncopies = vf / nunits;
3196 gcc_assert (phi_info);
3197 gcc_assert (ncopies >= 1);
3199 /* Find the first insertion point in the BB. */
3200 si = gsi_after_labels (bb);
3202 /* Create the vector that holds the initial_value of the induction. */
3203 if (nested_in_vect_loop)
3205 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3206 been created during vectorization of previous stmts. We obtain it
3207 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3208 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3209 loop_preheader_edge (iv_loop));
3210 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3212 else
3214 VEC(constructor_elt,gc) *v;
3216 /* iv_loop is the loop to be vectorized. Create:
3217 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3218 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3219 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3220 if (stmts)
3222 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3223 gcc_assert (!new_bb);
3226 v = VEC_alloc (constructor_elt, gc, nunits);
3227 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3228 for (i = 1; i < nunits; i++)
3230 /* Create: new_name_i = new_name + step_expr */
3231 enum tree_code code = POINTER_TYPE_P (scalar_type)
3232 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3233 init_stmt = gimple_build_assign_with_ops (code, new_var,
3234 new_name, step_expr);
3235 new_name = make_ssa_name (new_var, init_stmt);
3236 gimple_assign_set_lhs (init_stmt, new_name);
3238 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3239 gcc_assert (!new_bb);
3241 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE, vect_location,
3244 "created new init_stmt: ");
3245 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3247 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3249 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3250 vec = build_constructor (vectype, v);
3251 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3255 /* Create the vector that holds the step of the induction. */
3256 if (nested_in_vect_loop)
3257 /* iv_loop is nested in the loop to be vectorized. Generate:
3258 vec_step = [S, S, S, S] */
3259 new_name = step_expr;
3260 else
3262 /* iv_loop is the loop to be vectorized. Generate:
3263 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3264 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3265 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3266 expr, step_expr);
3269 t = unshare_expr (new_name);
3270 gcc_assert (CONSTANT_CLASS_P (new_name));
3271 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3272 gcc_assert (stepvectype);
3273 vec = build_vector_from_val (stepvectype, t);
3274 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3277 /* Create the following def-use cycle:
3278 loop prolog:
3279 vec_init = ...
3280 vec_step = ...
3281 loop:
3282 vec_iv = PHI <vec_init, vec_loop>
3284 STMT
3286 vec_loop = vec_iv + vec_step; */
3288 /* Create the induction-phi that defines the induction-operand. */
3289 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3290 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3291 set_vinfo_for_stmt (induction_phi,
3292 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3293 induc_def = PHI_RESULT (induction_phi);
3295 /* Create the iv update inside the loop */
3296 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3297 induc_def, vec_step);
3298 vec_def = make_ssa_name (vec_dest, new_stmt);
3299 gimple_assign_set_lhs (new_stmt, vec_def);
3300 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3301 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3302 NULL));
3304 /* Set the arguments of the phi node: */
3305 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3306 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3307 UNKNOWN_LOCATION);
3310 /* In case that vectorization factor (VF) is bigger than the number
3311 of elements that we can fit in a vectype (nunits), we have to generate
3312 more than one vector stmt - i.e - we need to "unroll" the
3313 vector stmt by a factor VF/nunits. For more details see documentation
3314 in vectorizable_operation. */
3316 if (ncopies > 1)
3318 stmt_vec_info prev_stmt_vinfo;
3319 /* FORNOW. This restriction should be relaxed. */
3320 gcc_assert (!nested_in_vect_loop);
3322 /* Create the vector that holds the step of the induction. */
3323 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3324 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3325 expr, step_expr);
3326 t = unshare_expr (new_name);
3327 gcc_assert (CONSTANT_CLASS_P (new_name));
3328 vec = build_vector_from_val (stepvectype, t);
3329 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3331 vec_def = induc_def;
3332 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3333 for (i = 1; i < ncopies; i++)
3335 /* vec_i = vec_prev + vec_step */
3336 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3337 vec_def, vec_step);
3338 vec_def = make_ssa_name (vec_dest, new_stmt);
3339 gimple_assign_set_lhs (new_stmt, vec_def);
3341 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3342 if (!useless_type_conversion_p (resvectype, vectype))
3344 new_stmt = gimple_build_assign_with_ops
3345 (VIEW_CONVERT_EXPR,
3346 vect_get_new_vect_var (resvectype, vect_simple_var,
3347 "vec_iv_"),
3348 build1 (VIEW_CONVERT_EXPR, resvectype,
3349 gimple_assign_lhs (new_stmt)), NULL_TREE);
3350 gimple_assign_set_lhs (new_stmt,
3351 make_ssa_name
3352 (gimple_assign_lhs (new_stmt), new_stmt));
3353 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3355 set_vinfo_for_stmt (new_stmt,
3356 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3357 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3358 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3362 if (nested_in_vect_loop)
3364 /* Find the loop-closed exit-phi of the induction, and record
3365 the final vector of induction results: */
3366 exit_phi = NULL;
3367 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3369 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3371 exit_phi = USE_STMT (use_p);
3372 break;
3375 if (exit_phi)
3377 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3378 /* FORNOW. Currently not supporting the case that an inner-loop induction
3379 is not used in the outer-loop (i.e. only outside the outer-loop). */
3380 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3381 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3383 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3384 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE, vect_location,
3387 "vector of inductions after inner-loop:");
3388 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3394 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location,
3397 "transform induction: created def-use cycle: ");
3398 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3399 dump_printf (MSG_NOTE, "\n");
3400 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3401 SSA_NAME_DEF_STMT (vec_def), 0);
3404 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3405 if (!useless_type_conversion_p (resvectype, vectype))
3407 new_stmt = gimple_build_assign_with_ops
3408 (VIEW_CONVERT_EXPR,
3409 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3410 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3411 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3412 gimple_assign_set_lhs (new_stmt, induc_def);
3413 si = gsi_start_bb (bb);
3414 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3415 set_vinfo_for_stmt (new_stmt,
3416 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3417 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3418 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3421 return induc_def;
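/* Illustrative sketch (a hypothetical induction): for "i = i + 2" starting at
   3, with a vectorization factor of 4 and nunits == 4 (so NCOPIES == 1), the
   code above builds vec_init = {3, 5, 7, 9} in the preheader,
   vec_step = {8, 8, 8, 8} (VF * S = 4 * 2), and inside the loop

     vec_iv = PHI <vec_init, vec_loop>
     vec_loop = vec_iv + vec_step;  */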
3425 /* Function get_initial_def_for_reduction
3427 Input:
3428 STMT - a stmt that performs a reduction operation in the loop.
3429 INIT_VAL - the initial value of the reduction variable
3431 Output:
3432 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3433 of the reduction (used for adjusting the epilog - see below).
3434 Return a vector variable, initialized according to the operation that STMT
3435 performs. This vector will be used as the initial value of the
3436 vector of partial results.
3438 Option1 (adjust in epilog): Initialize the vector as follows:
3439 add/bit or/xor: [0,0,...,0,0]
3440 mult/bit and: [1,1,...,1,1]
3441 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3442 and when necessary (e.g. add/mult case) let the caller know
3443 that it needs to adjust the result by init_val.
3445 Option2: Initialize the vector as follows:
3446 add/bit or/xor: [init_val,0,0,...,0]
3447 mult/bit and: [init_val,1,1,...,1]
3448 min/max/cond_expr: [init_val,init_val,...,init_val]
3449 and no adjustments are needed.
3451 For example, for the following code:
3453 s = init_val;
3454 for (i=0;i<n;i++)
3455 s = s + a[i];
3457 STMT is 's = s + a[i]', and the reduction variable is 's'.
3458 For a vector of 4 units, we want to return either [0,0,0,init_val],
3459 or [0,0,0,0] and let the caller know that it needs to adjust
3460 the result at the end by 'init_val'.
3462 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3463 is not NULL, because the initialization vector is then simpler (the same
3464 element in all entries); otherwise we use Option2.
3466 A cost model should help decide between these two schemes. */
3468 tree
3469 get_initial_def_for_reduction (gimple stmt, tree init_val,
3470 tree *adjustment_def)
3472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3475 tree scalar_type = TREE_TYPE (init_val);
3476 tree vectype = get_vectype_for_scalar_type (scalar_type);
3477 int nunits;
3478 enum tree_code code = gimple_assign_rhs_code (stmt);
3479 tree def_for_init;
3480 tree init_def;
3481 tree *elts;
3482 int i;
3483 bool nested_in_vect_loop = false;
3484 tree init_value;
3485 REAL_VALUE_TYPE real_init_val = dconst0;
3486 int int_init_val = 0;
3487 gimple def_stmt = NULL;
3489 gcc_assert (vectype);
3490 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3492 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3493 || SCALAR_FLOAT_TYPE_P (scalar_type));
3495 if (nested_in_vect_loop_p (loop, stmt))
3496 nested_in_vect_loop = true;
3497 else
3498 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3500 /* In case of double reduction we only create a vector variable to be put
3501 in the reduction phi node. The actual statement creation is done in
3502 vect_create_epilog_for_reduction. */
3503 if (adjustment_def && nested_in_vect_loop
3504 && TREE_CODE (init_val) == SSA_NAME
3505 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3506 && gimple_code (def_stmt) == GIMPLE_PHI
3507 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3508 && vinfo_for_stmt (def_stmt)
3509 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3510 == vect_double_reduction_def)
3512 *adjustment_def = NULL;
3513 return vect_create_destination_var (init_val, vectype);
3516 if (TREE_CONSTANT (init_val))
3518 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3519 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3520 else
3521 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3523 else
3524 init_value = init_val;
3526 switch (code)
3528 case WIDEN_SUM_EXPR:
3529 case DOT_PROD_EXPR:
3530 case PLUS_EXPR:
3531 case MINUS_EXPR:
3532 case BIT_IOR_EXPR:
3533 case BIT_XOR_EXPR:
3534 case MULT_EXPR:
3535 case BIT_AND_EXPR:
3536 /* ADJUSTMENT_DEF is NULL when called from
3537 vect_create_epilog_for_reduction to vectorize double reduction. */
3538 if (adjustment_def)
3540 if (nested_in_vect_loop)
3541 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3542 NULL);
3543 else
3544 *adjustment_def = init_val;
3547 if (code == MULT_EXPR)
3549 real_init_val = dconst1;
3550 int_init_val = 1;
3553 if (code == BIT_AND_EXPR)
3554 int_init_val = -1;
3556 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3557 def_for_init = build_real (scalar_type, real_init_val);
3558 else
3559 def_for_init = build_int_cst (scalar_type, int_init_val);
3561 /* Create a vector of '0' or '1' except the first element. */
3562 elts = XALLOCAVEC (tree, nunits);
3563 for (i = nunits - 2; i >= 0; --i)
3564 elts[i + 1] = def_for_init;
3566 /* Option1: the first element is '0' or '1' as well. */
3567 if (adjustment_def)
3569 elts[0] = def_for_init;
3570 init_def = build_vector (vectype, elts);
3571 break;
3574 /* Option2: the first element is INIT_VAL. */
3575 elts[0] = init_val;
3576 if (TREE_CONSTANT (init_val))
3577 init_def = build_vector (vectype, elts);
3578 else
3580 VEC(constructor_elt,gc) *v;
3581 v = VEC_alloc (constructor_elt, gc, nunits);
3582 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3583 for (i = 1; i < nunits; ++i)
3584 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3585 init_def = build_constructor (vectype, v);
3588 break;
3590 case MIN_EXPR:
3591 case MAX_EXPR:
3592 case COND_EXPR:
3593 if (adjustment_def)
3595 *adjustment_def = NULL_TREE;
3596 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3597 break;
3600 init_def = build_vector_from_val (vectype, init_value);
3601 break;
3603 default:
3604 gcc_unreachable ();
3607 return init_def;
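/* Illustrative sketch (hypothetical values): for "s = s + a[i]" with
   init_val == 5 and a 4-element vector type, Option1 (ADJUSTMENT_DEF
   non-NULL) returns {0, 0, 0, 0} and asks the caller to add 5 to the final
   scalar result, while Option2 returns {5, 0, 0, 0} and needs no adjustment.
   For a MULT_EXPR reduction the neutral element 1 is used instead of 0, and
   for MIN/MAX/COND_EXPR the whole vector is filled with init_val.  */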
3611 /* Function vect_create_epilog_for_reduction
3613 Create code at the loop-epilog to finalize the result of a reduction
3614 computation.
3616 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3617 reduction statements.
3618 STMT is the scalar reduction stmt that is being vectorized.
3619 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3620 number of elements that we can fit in a vectype (nunits). In this case
3621 we have to generate more than one vector stmt - i.e - we need to "unroll"
3622 the vector stmt by a factor VF/nunits. For more details see documentation
3623 in vectorizable_operation.
3624 REDUC_CODE is the tree-code for the epilog reduction.
3625 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3626 computation.
3627 REDUC_INDEX is the index of the operand in the right hand side of the
3628 statement that is defined by REDUCTION_PHI.
3629 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3630 SLP_NODE is an SLP node containing a group of reduction statements. The
3631 first one in this group is STMT.
3633 This function:
3634 1. Creates the reduction def-use cycles: sets the arguments for
3635 REDUCTION_PHIS:
3636 The loop-entry argument is the vectorized initial-value of the reduction.
3637 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3638 sums.
3639 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3640 by applying the operation specified by REDUC_CODE if available, or by
3641 other means (whole-vector shifts or a scalar loop).
3642 The function also creates a new phi node at the loop exit to preserve
3643 loop-closed form, as illustrated below.
3645 The flow at the entry to this function:
3647 loop:
3648 vec_def = phi <null, null> # REDUCTION_PHI
3649 VECT_DEF = vector_stmt # vectorized form of STMT
3650 s_loop = scalar_stmt # (scalar) STMT
3651 loop_exit:
3652 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3653 use <s_out0>
3654 use <s_out0>
3656 The above is transformed by this function into:
3658 loop:
3659 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3660 VECT_DEF = vector_stmt # vectorized form of STMT
3661 s_loop = scalar_stmt # (scalar) STMT
3662 loop_exit:
3663 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3664 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3665 v_out2 = reduce <v_out1>
3666 s_out3 = extract_field <v_out2, 0>
3667 s_out4 = adjust_result <s_out3>
3668 use <s_out4>
3669 use <s_out4>
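/* For illustration only (example names, assuming a 4-element vector type):
   for the scalar reduction

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   the vectorized loop leaves four partial sums in the lanes of VECT_DEF.
   The epilog built below combines them into a single scalar, conceptually:

     int partial[4];    // the lanes of v_out1 at the loop exit
     int s = partial[0] + partial[1] + partial[2] + partial[3];
                        // v_out2 = reduce, s_out3 = extract_field

   possibly followed by an adjustment with the scalar initial value
   (s_out4 = adjust_result).  */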
3672 static void
3673 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3674 int ncopies, enum tree_code reduc_code,
3675 VEC (gimple, heap) *reduction_phis,
3676 int reduc_index, bool double_reduc,
3677 slp_tree slp_node)
3679 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3680 stmt_vec_info prev_phi_info;
3681 tree vectype;
3682 enum machine_mode mode;
3683 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3684 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3685 basic_block exit_bb;
3686 tree scalar_dest;
3687 tree scalar_type;
3688 gimple new_phi = NULL, phi;
3689 gimple_stmt_iterator exit_gsi;
3690 tree vec_dest;
3691 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3692 gimple epilog_stmt = NULL;
3693 enum tree_code code = gimple_assign_rhs_code (stmt);
3694 gimple exit_phi;
3695 tree bitsize, bitpos;
3696 tree adjustment_def = NULL;
3697 tree vec_initial_def = NULL;
3698 tree reduction_op, expr, def;
3699 tree orig_name, scalar_result;
3700 imm_use_iterator imm_iter, phi_imm_iter;
3701 use_operand_p use_p, phi_use_p;
3702 bool extract_scalar_result = false;
3703 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3704 bool nested_in_vect_loop = false;
3705 VEC (gimple, heap) *new_phis = NULL;
3706 VEC (gimple, heap) *inner_phis = NULL;
3707 enum vect_def_type dt = vect_unknown_def_type;
3708 int j, i;
3709 VEC (tree, heap) *scalar_results = NULL;
3710 unsigned int group_size = 1, k, ratio;
3711 VEC (tree, heap) *vec_initial_defs = NULL;
3712 VEC (gimple, heap) *phis;
3713 bool slp_reduc = false;
3714 tree new_phi_result;
3715 gimple inner_phi = NULL;
3717 if (slp_node)
3718 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3720 if (nested_in_vect_loop_p (loop, stmt))
3722 outer_loop = loop;
3723 loop = loop->inner;
3724 nested_in_vect_loop = true;
3725 gcc_assert (!slp_node);
3728 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3730 case GIMPLE_SINGLE_RHS:
3731 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3732 == ternary_op);
3733 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3734 break;
3735 case GIMPLE_UNARY_RHS:
3736 reduction_op = gimple_assign_rhs1 (stmt);
3737 break;
3738 case GIMPLE_BINARY_RHS:
3739 reduction_op = reduc_index ?
3740 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3741 break;
3742 case GIMPLE_TERNARY_RHS:
3743 reduction_op = gimple_op (stmt, reduc_index + 1);
3744 break;
3745 default:
3746 gcc_unreachable ();
3749 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3750 gcc_assert (vectype);
3751 mode = TYPE_MODE (vectype);
3753 /* 1. Create the reduction def-use cycle:
3754 Set the arguments of REDUCTION_PHIS, i.e., transform
3756 loop:
3757 vec_def = phi <null, null> # REDUCTION_PHI
3758 VECT_DEF = vector_stmt # vectorized form of STMT
3761 into:
3763 loop:
3764 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3765 VECT_DEF = vector_stmt # vectorized form of STMT
3768 (in case of SLP, do it for all the phis). */
3770 /* Get the loop-entry arguments. */
3771 if (slp_node)
3772 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3773 NULL, slp_node, reduc_index);
3774 else
3776 vec_initial_defs = VEC_alloc (tree, heap, 1);
3777 /* For the case of reduction, vect_get_vec_def_for_operand returns
3778 the scalar def before the loop, that defines the initial value
3779 of the reduction variable. */
3780 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3781 &adjustment_def);
3782 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3785 /* Set the phi nodes' arguments. */
3786 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3788 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3789 tree def = VEC_index (tree, vect_defs, i);
3790 for (j = 0; j < ncopies; j++)
3792 /* Set the loop-entry arg of the reduction-phi. */
3793 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3794 UNKNOWN_LOCATION);
3796 /* Set the loop-latch arg for the reduction-phi. */
3797 if (j > 0)
3798 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3800 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3802 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_NOTE, vect_location,
3805 "transform reduction: created def-use cycle: ");
3806 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3807 dump_printf (MSG_NOTE, "\n");
3808 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3811 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3815 VEC_free (tree, heap, vec_initial_defs);
3817 /* 2. Create epilog code.
3818 The reduction epilog code operates across the elements of the vector
3819 of partial results computed by the vectorized loop.
3820 The reduction epilog code consists of:
3822 step 1: compute the scalar result in a vector (v_out2)
3823 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3824 step 3: adjust the scalar result (s_out3) if needed.
3826 Step 1 can be accomplished using one of the following three schemes:
3827 (scheme 1) using reduc_code, if available.
3828 (scheme 2) using whole-vector shifts, if available.
3829 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3830 combined.
3832 The overall epilog code looks like this:
3834 s_out0 = phi <s_loop> # original EXIT_PHI
3835 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3836 v_out2 = reduce <v_out1> # step 1
3837 s_out3 = extract_field <v_out2, 0> # step 2
3838 s_out4 = adjust_result <s_out3> # step 3
3840 (step 3 is optional, and steps 1 and 2 may be combined).
3841 Lastly, the uses of s_out0 are replaced by s_out4. */
3844 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3845 v_out1 = phi <VECT_DEF>
3846 Store them in NEW_PHIS. */
3848 exit_bb = single_exit (loop)->dest;
3849 prev_phi_info = NULL;
3850 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3851 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3853 for (j = 0; j < ncopies; j++)
3855 tree new_def = copy_ssa_name (def, NULL);
3856 phi = create_phi_node (new_def, exit_bb);
3857 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3858 if (j == 0)
3859 VEC_quick_push (gimple, new_phis, phi);
3860 else
3862 def = vect_get_vec_def_for_stmt_copy (dt, def);
3863 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3866 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3867 prev_phi_info = vinfo_for_stmt (phi);
3871 /* The epilogue is created for the outer-loop, i.e., for the loop being
3872 vectorized. Create exit phis for the outer loop. */
3873 if (double_reduc)
3875 loop = outer_loop;
3876 exit_bb = single_exit (loop)->dest;
3877 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3878 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3880 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3881 gimple outer_phi = create_phi_node (new_result, exit_bb);
3882 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3883 PHI_RESULT (phi));
3884 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3885 loop_vinfo, NULL));
3886 VEC_quick_push (gimple, inner_phis, phi);
3887 VEC_replace (gimple, new_phis, i, outer_phi);
3888 prev_phi_info = vinfo_for_stmt (outer_phi);
3889 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3891 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3892 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3893 outer_phi = create_phi_node (new_result, exit_bb);
3894 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3895 PHI_RESULT (phi));
3896 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3897 loop_vinfo, NULL));
3898 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3899 prev_phi_info = vinfo_for_stmt (outer_phi);
3904 exit_gsi = gsi_after_labels (exit_bb);
3906 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3907 (i.e. when reduc_code is not available) and in the final adjustment
3908 code (if needed). Also get the original scalar reduction variable as
3909 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3910 represents a reduction pattern), the tree-code and scalar-def are
3911 taken from the original stmt that the pattern-stmt (STMT) replaces.
3912 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3913 are taken from STMT. */
3915 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3916 if (!orig_stmt)
3918 /* Regular reduction */
3919 orig_stmt = stmt;
3921 else
3923 /* Reduction pattern */
3924 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3925 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3926 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3929 code = gimple_assign_rhs_code (orig_stmt);
3930 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3931 partial results are added and not subtracted. */
3932 if (code == MINUS_EXPR)
3933 code = PLUS_EXPR;
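/* Worked example (illustrative values only): with code == MINUS_EXPR,
   an initial value of 10 and loop elements {1,2,3,4}, the scalar loop
   computes 10 - 1 - 2 - 3 - 4 == 0.  Starting the vector accumulator at
   [10,0,0,0] makes the lanes hold 9, -2, -3 and -4 after one vector
   iteration, and combining the lanes with PLUS_EXPR in the epilog gives
   9 + (-2) + (-3) + (-4) == 0, matching the scalar result.  */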
3935 scalar_dest = gimple_assign_lhs (orig_stmt);
3936 scalar_type = TREE_TYPE (scalar_dest);
3937 scalar_results = VEC_alloc (tree, heap, group_size);
3938 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3939 bitsize = TYPE_SIZE (scalar_type);
3941 /* In case this is a reduction in an inner-loop while vectorizing an outer
3942 loop - we don't need to extract a single scalar result at the end of the
3943 inner-loop (unless it is double reduction, i.e., the use of reduction is
3944 outside the outer-loop). The final vector of partial results will be used
3945 in the vectorized outer-loop, or reduced to a scalar result at the end of
3946 the outer-loop. */
3947 if (nested_in_vect_loop && !double_reduc)
3948 goto vect_finalize_reduction;
3950 /* SLP reduction without reduction chain, e.g.,
3951 # a1 = phi <a2, a0>
3952 # b1 = phi <b2, b0>
3953 a2 = operation (a1)
3954 b2 = operation (b1) */
3955 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3957 /* In case of reduction chain, e.g.,
3958 # a1 = phi <a3, a0>
3959 a2 = operation (a1)
3960 a3 = operation (a2),
3962 we may end up with more than one vector result. Here we reduce them to
3963 one vector. */
3964 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3966 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3967 tree tmp;
3968 gimple new_vec_stmt = NULL;
3970 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3971 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3973 gimple next_phi = VEC_index (gimple, new_phis, k);
3974 tree second_vect = PHI_RESULT (next_phi);
3976 tmp = build2 (code, vectype, first_vect, second_vect);
3977 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3978 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3979 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3980 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3983 new_phi_result = first_vect;
3984 if (new_vec_stmt)
3986 VEC_truncate (gimple, new_phis, 0);
3987 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3990 else
3991 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3993 /* 2.3 Create the reduction code, using one of the three schemes described
3994 above. In SLP we simply need to extract all the elements from the
3995 vector (without reducing them), so we use scalar shifts. */
3996 if (reduc_code != ERROR_MARK && !slp_reduc)
3998 tree tmp;
4000 /*** Case 1: Create:
4001 v_out2 = reduc_expr <v_out1> */
4003 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_NOTE, vect_location,
4005 "Reduce using direct vector reduction.");
4007 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4008 tmp = build1 (reduc_code, vectype, new_phi_result);
4009 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4010 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4011 gimple_assign_set_lhs (epilog_stmt, new_temp);
4012 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4014 extract_scalar_result = true;
4016 else
4018 enum tree_code shift_code = ERROR_MARK;
4019 bool have_whole_vector_shift = true;
4020 int bit_offset;
4021 int element_bitsize = tree_low_cst (bitsize, 1);
4022 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4023 tree vec_temp;
4025 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4026 shift_code = VEC_RSHIFT_EXPR;
4027 else
4028 have_whole_vector_shift = false;
4030 /* Regardless of whether we have a whole vector shift, if we're
4031 emulating the operation via tree-vect-generic, we don't want
4032 to use it. Only the first round of the reduction is likely
4033 to still be profitable via emulation. */
4034 /* ??? It might be better to emit a reduction tree code here, so that
4035 tree-vect-generic can expand the first round via bit tricks. */
4036 if (!VECTOR_MODE_P (mode))
4037 have_whole_vector_shift = false;
4038 else
4040 optab optab = optab_for_tree_code (code, vectype, optab_default);
4041 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4042 have_whole_vector_shift = false;
4045 if (have_whole_vector_shift && !slp_reduc)
4047 /*** Case 2: Create:
4048 for (offset = VS/2; offset >= element_size; offset/=2)
4050 Create: va' = vec_shift <va, offset>
4051 Create: va = vop <va, va'>
4052 } */
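/* Worked example (illustrative values, little-endian lane numbering):
   for a sum reduction over a 4-element integer vector va = {1,2,3,4}:

     va' = vec_shift <va, 2 elements>   // {3,4,0,0}
     va  = va + va'                     // {4,6,3,4}
     va' = vec_shift <va, 1 element>    // {6,3,4,0}
     va  = va + va'                     // {10,9,7,4}

   After log2(nunits) == 2 rounds, one lane (lane 0 here) holds the full
   sum 10; section 2.4 below extracts it at the position appropriate for
   the target's endianness.  */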
4054 if (dump_enabled_p ())
4055 dump_printf_loc (MSG_NOTE, vect_location,
4056 "Reduce using vector shifts");
4058 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4059 new_temp = new_phi_result;
4060 for (bit_offset = vec_size_in_bits/2;
4061 bit_offset >= element_bitsize;
4062 bit_offset /= 2)
4064 tree bitpos = size_int (bit_offset);
4066 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4067 vec_dest, new_temp, bitpos);
4068 new_name = make_ssa_name (vec_dest, epilog_stmt);
4069 gimple_assign_set_lhs (epilog_stmt, new_name);
4070 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4072 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4073 new_name, new_temp);
4074 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4075 gimple_assign_set_lhs (epilog_stmt, new_temp);
4076 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079 extract_scalar_result = true;
4081 else
4083 tree rhs;
4085 /*** Case 3: Create:
4086 s = extract_field <v_out2, 0>
4087 for (offset = element_size;
4088 offset < vector_size;
4089 offset += element_size)
4091 Create: s' = extract_field <v_out2, offset>
4092 Create: s = op <s, s'> // For non SLP cases
4093 } */
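/* Worked example (illustrative values only): for a sum reduction over a
   4-element vector v = {1,2,3,4} this scheme expands to straight-line
   scalar code:

     s  = v[0];        // extract_field <v_out2, 0>
     s += v[1];        // s = op <s, s'>
     s += v[2];
     s += v[3];        // s == 10

   while in the SLP case the four extracted values are simply kept
   separate in SCALAR_RESULTS instead of being combined.  */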
4095 if (dump_enabled_p ())
4096 dump_printf_loc (MSG_NOTE, vect_location,
4097 "Reduce using scalar code. ");
4099 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4100 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
4102 if (gimple_code (new_phi) == GIMPLE_PHI)
4103 vec_temp = PHI_RESULT (new_phi);
4104 else
4105 vec_temp = gimple_assign_lhs (new_phi);
4106 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4107 bitsize_zero_node);
4108 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4109 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4110 gimple_assign_set_lhs (epilog_stmt, new_temp);
4111 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4113 /* In SLP we don't need to apply the reduction operation, so we
4114 just collect the s' values in SCALAR_RESULTS. */
4115 if (slp_reduc)
4116 VEC_safe_push (tree, heap, scalar_results, new_temp);
4118 for (bit_offset = element_bitsize;
4119 bit_offset < vec_size_in_bits;
4120 bit_offset += element_bitsize)
4122 tree bitpos = bitsize_int (bit_offset);
4123 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4124 bitsize, bitpos);
4126 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4127 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4128 gimple_assign_set_lhs (epilog_stmt, new_name);
4129 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4131 if (slp_reduc)
4133 /* In SLP we don't need to apply reduction operation, so
4134 we just collect s' values in SCALAR_RESULTS. */
4135 new_temp = new_name;
4136 VEC_safe_push (tree, heap, scalar_results, new_name);
4138 else
4140 epilog_stmt = gimple_build_assign_with_ops (code,
4141 new_scalar_dest, new_name, new_temp);
4142 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4143 gimple_assign_set_lhs (epilog_stmt, new_temp);
4144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4149 /* The only case where we need to reduce scalar results in SLP is
4150 unrolling. If the size of SCALAR_RESULTS is greater than
4151 GROUP_SIZE, we reduce them by combining elements modulo
4152 GROUP_SIZE. */
4153 if (slp_reduc)
4155 tree res, first_res, new_res;
4156 gimple new_stmt;
4158 /* Reduce multiple scalar results in case of SLP unrolling. */
4159 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
4160 j++)
4162 first_res = VEC_index (tree, scalar_results, j % group_size);
4163 new_stmt = gimple_build_assign_with_ops (code,
4164 new_scalar_dest, first_res, res);
4165 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4166 gimple_assign_set_lhs (new_stmt, new_res);
4167 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4168 VEC_replace (tree, scalar_results, j % group_size, new_res);
4171 else
4172 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4173 VEC_safe_push (tree, heap, scalar_results, new_temp);
4175 extract_scalar_result = false;
4179 /* 2.4 Extract the final scalar result. Create:
4180 s_out3 = extract_field <v_out2, bitpos> */
4182 if (extract_scalar_result)
4184 tree rhs;
4186 if (dump_enabled_p ())
4187 dump_printf_loc (MSG_NOTE, vect_location,
4188 "extract scalar result");
4190 if (BYTES_BIG_ENDIAN)
4191 bitpos = size_binop (MULT_EXPR,
4192 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4193 TYPE_SIZE (scalar_type));
4194 else
4195 bitpos = bitsize_zero_node;
4197 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4198 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4199 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4200 gimple_assign_set_lhs (epilog_stmt, new_temp);
4201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4202 VEC_safe_push (tree, heap, scalar_results, new_temp);
4205 vect_finalize_reduction:
4207 if (double_reduc)
4208 loop = loop->inner;
4210 /* 2.5 Adjust the final result by the initial value of the reduction
4211 variable. (When such adjustment is not needed, then
4212 'adjustment_def' is zero). For example, if code is PLUS we create:
4213 new_temp = loop_exit_def + adjustment_def */
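/* For illustration only (example values): for

     int sum = 10;
     for (i = 0; i < N; i++)
       sum += a[i];

   the vector accumulator may be started at {0,0,0,0}, with the scalar
   initial value 10 remembered in ADJUSTMENT_DEF; once the lanes have been
   reduced to LOOP_EXIT_DEF, the code below emits the equivalent of
   new_temp = loop_exit_def + 10 to restore the scalar semantics.  */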
4215 if (adjustment_def)
4217 gcc_assert (!slp_reduc);
4218 if (nested_in_vect_loop)
4220 new_phi = VEC_index (gimple, new_phis, 0);
4221 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4222 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4223 new_dest = vect_create_destination_var (scalar_dest, vectype);
4225 else
4227 new_temp = VEC_index (tree, scalar_results, 0);
4228 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4229 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4230 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4233 epilog_stmt = gimple_build_assign (new_dest, expr);
4234 new_temp = make_ssa_name (new_dest, epilog_stmt);
4235 gimple_assign_set_lhs (epilog_stmt, new_temp);
4236 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4238 if (nested_in_vect_loop)
4240 set_vinfo_for_stmt (epilog_stmt,
4241 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4242 NULL));
4243 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4244 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4246 if (!double_reduc)
4247 VEC_quick_push (tree, scalar_results, new_temp);
4248 else
4249 VEC_replace (tree, scalar_results, 0, new_temp);
4251 else
4252 VEC_replace (tree, scalar_results, 0, new_temp);
4254 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4257 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4258 phis with new adjusted scalar results, i.e., replace use <s_out0>
4259 with use <s_out4>.
4261 Transform:
4262 loop_exit:
4263 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4264 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4265 v_out2 = reduce <v_out1>
4266 s_out3 = extract_field <v_out2, 0>
4267 s_out4 = adjust_result <s_out3>
4268 use <s_out0>
4269 use <s_out0>
4271 into:
4273 loop_exit:
4274 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4275 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4276 v_out2 = reduce <v_out1>
4277 s_out3 = extract_field <v_out2, 0>
4278 s_out4 = adjust_result <s_out3>
4279 use <s_out4>
4280 use <s_out4> */
4283 /* In an SLP reduction chain we reduce the vector results into one vector
4284 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4285 the last stmt in the reduction chain, since we are looking for the loop
4286 exit phi node. */
4287 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4289 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4290 SLP_TREE_SCALAR_STMTS (slp_node),
4291 group_size - 1));
4292 group_size = 1;
4295 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4296 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4297 need to match SCALAR_RESULTS with corresponding statements. The first
4298 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4299 the first vector stmt, etc.
4300 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
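/* Worked example (illustrative numbers): with GROUP_SIZE == 4 and two
   vector stmts in NEW_PHIS, RATIO == 2, so scalar results 0 and 1 are
   matched with the first vector stmt and reduction phi (k / ratio == 0)
   and scalar results 2 and 3 with the second (k / ratio == 1).  */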
4301 if (group_size > VEC_length (gimple, new_phis))
4303 ratio = group_size / VEC_length (gimple, new_phis);
4304 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4306 else
4307 ratio = 1;
4309 for (k = 0; k < group_size; k++)
4311 if (k % ratio == 0)
4313 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4314 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4315 if (double_reduc)
4316 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4319 if (slp_reduc)
4321 gimple current_stmt = VEC_index (gimple,
4322 SLP_TREE_SCALAR_STMTS (slp_node), k);
4324 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4325 /* SLP statements can't participate in patterns. */
4326 gcc_assert (!orig_stmt);
4327 scalar_dest = gimple_assign_lhs (current_stmt);
4330 phis = VEC_alloc (gimple, heap, 3);
4331 /* Find the loop-closed-use at the loop exit of the original scalar
4332 result. (The reduction result is expected to have two immediate uses -
4333 one at the latch block, and one at the loop exit). */
4334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4335 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4336 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4338 /* We expect to have found an exit_phi because of loop-closed-ssa
4339 form. */
4340 gcc_assert (!VEC_empty (gimple, phis));
4342 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4344 if (outer_loop)
4346 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4347 gimple vect_phi;
4349 /* FORNOW. Currently not supporting the case that an inner-loop
4350 reduction is not used in the outer-loop (but only outside the
4351 outer-loop), unless it is a double reduction. */
4352 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4353 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4354 || double_reduc);
4356 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4357 if (!double_reduc
4358 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4359 != vect_double_reduction_def)
4360 continue;
4362 /* Handle double reduction:
4364 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4365 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4366 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4367 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4369 At that point the regular reduction (stmt2 and stmt3) is
4370 already vectorized, as well as the exit phi node, stmt4.
4371 Here we vectorize the phi node of double reduction, stmt1, and
4372 update all relevant statements. */
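/* A source-level form of such a double reduction (illustrative example,
   with the outer loop being the one that is vectorized) is:

     int res = init;
     for (i = 0; i < N; i++)       // outer loop
       for (j = 0; j < M; j++)     // inner loop
         res += a[i][j];

   Here the inner-loop header phi of RES corresponds to stmt2, the
   inner-loop addition to stmt3, the loop-closed phi after the inner loop
   to stmt4, and the outer-loop header phi to stmt1, which is the one
   vectorized below.  */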
4374 /* Go through all the uses of s2 to find double reduction phi
4375 node, i.e., stmt1 above. */
4376 orig_name = PHI_RESULT (exit_phi);
4377 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4379 stmt_vec_info use_stmt_vinfo;
4380 stmt_vec_info new_phi_vinfo;
4381 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4382 basic_block bb = gimple_bb (use_stmt);
4383 gimple use;
4385 /* Check that USE_STMT is really double reduction phi
4386 node. */
4387 if (gimple_code (use_stmt) != GIMPLE_PHI
4388 || gimple_phi_num_args (use_stmt) != 2
4389 || bb->loop_father != outer_loop)
4390 continue;
4391 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4392 if (!use_stmt_vinfo
4393 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4394 != vect_double_reduction_def)
4395 continue;
4397 /* Create vector phi node for double reduction:
4398 vs1 = phi <vs0, vs2>
4399 vs1 was created previously in this function by a call to
4400 vect_get_vec_def_for_operand and is stored in
4401 vec_initial_def;
4402 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4403 vs0 is created here. */
4405 /* Create vector phi node. */
4406 vect_phi = create_phi_node (vec_initial_def, bb);
4407 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4408 loop_vec_info_for_loop (outer_loop), NULL);
4409 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4411 /* Create vs0 - initial def of the double reduction phi. */
4412 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4413 loop_preheader_edge (outer_loop));
4414 init_def = get_initial_def_for_reduction (stmt,
4415 preheader_arg, NULL);
4416 vect_phi_init = vect_init_vector (use_stmt, init_def,
4417 vectype, NULL);
4419 /* Update phi node arguments with vs0 and vs2. */
4420 add_phi_arg (vect_phi, vect_phi_init,
4421 loop_preheader_edge (outer_loop),
4422 UNKNOWN_LOCATION);
4423 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4424 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4425 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_NOTE, vect_location,
4428 "created double reduction phi node: ");
4429 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4432 vect_phi_res = PHI_RESULT (vect_phi);
4434 /* Replace the use, i.e., set the correct vs1 in the regular
4435 reduction phi node. FORNOW, NCOPIES is always 1, so the
4436 loop is redundant. */
4437 use = reduction_phi;
4438 for (j = 0; j < ncopies; j++)
4440 edge pr_edge = loop_preheader_edge (loop);
4441 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4442 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4448 VEC_free (gimple, heap, phis);
4449 if (nested_in_vect_loop)
4451 if (double_reduc)
4452 loop = outer_loop;
4453 else
4454 continue;
4457 phis = VEC_alloc (gimple, heap, 3);
4458 /* Find the loop-closed-use at the loop exit of the original scalar
4459 result. (The reduction result is expected to have two immediate uses,
4460 one at the latch block, and one at the loop exit). For double
4461 reductions we are looking for exit phis of the outer loop. */
4462 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4464 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4465 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4466 else
4468 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4470 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4472 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4474 if (!flow_bb_inside_loop_p (loop,
4475 gimple_bb (USE_STMT (phi_use_p))))
4476 VEC_safe_push (gimple, heap, phis,
4477 USE_STMT (phi_use_p));
4483 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4485 /* Replace the uses: */
4486 orig_name = PHI_RESULT (exit_phi);
4487 scalar_result = VEC_index (tree, scalar_results, k);
4488 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4489 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4490 SET_USE (use_p, scalar_result);
4493 VEC_free (gimple, heap, phis);
4496 VEC_free (tree, heap, scalar_results);
4497 VEC_free (gimple, heap, new_phis);
4501 /* Function vectorizable_reduction.
4503 Check if STMT performs a reduction operation that can be vectorized.
4504 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4505 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4506 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4508 This function also handles reduction idioms (patterns) that have been
4509 recognized in advance during vect_pattern_recog. In this case, STMT may be
4510 of this form:
4511 X = pattern_expr (arg0, arg1, ..., X)
4512 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4513 sequence that had been detected and replaced by the pattern-stmt (STMT).
4515 In some cases of reduction patterns, the type of the reduction variable X is
4516 different than the type of the other arguments of STMT.
4517 In such cases, the vectype that is used when transforming STMT into a vector
4518 stmt is different than the vectype that is used to determine the
4519 vectorization factor, because it consists of a different number of elements
4520 than the actual number of elements that are being operated upon in parallel.
4522 For example, consider an accumulation of shorts into an int accumulator.
4523 On some targets it's possible to vectorize this pattern operating on 8
4524 shorts at a time (hence, the vectype for purposes of determining the
4525 vectorization factor should be V8HI); on the other hand, the vectype that
4526 is used to create the vector form is actually V4SI (the type of the result).
4528 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4529 indicates what is the actual level of parallelism (V8HI in the example), so
4530 that the right vectorization factor would be derived. This vectype
4531 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4532 be used to create the vectorized stmt. The right vectype for the vectorized
4533 stmt is obtained from the type of the result X:
4534 get_vectype_for_scalar_type (TREE_TYPE (X))
4536 This means that, contrary to "regular" reductions (or "regular" stmts in
4537 general), the following equation:
4538 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4539 does *NOT* necessarily hold for reduction patterns. */
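/* A concrete source form of the example above (illustrative only, for a
   target with V8HI/V4SI vectors and a widening-summation pattern):

     short b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += (int) b[i];

   STMT_VINFO_VECTYPE is V8HI (eight shorts are consumed per vector
   iteration, which fixes the vectorization factor), while the vectorized
   statement itself produces V4SI values, the vectype obtained from
   TREE_TYPE (sum).  */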
4541 bool
4542 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4543 gimple *vec_stmt, slp_tree slp_node)
4545 tree vec_dest;
4546 tree scalar_dest;
4547 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4548 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4549 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4550 tree vectype_in = NULL_TREE;
4551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4553 enum tree_code code, orig_code, epilog_reduc_code;
4554 enum machine_mode vec_mode;
4555 int op_type;
4556 optab optab, reduc_optab;
4557 tree new_temp = NULL_TREE;
4558 tree def;
4559 gimple def_stmt;
4560 enum vect_def_type dt;
4561 gimple new_phi = NULL;
4562 tree scalar_type;
4563 bool is_simple_use;
4564 gimple orig_stmt;
4565 stmt_vec_info orig_stmt_info;
4566 tree expr = NULL_TREE;
4567 int i;
4568 int ncopies;
4569 int epilog_copies;
4570 stmt_vec_info prev_stmt_info, prev_phi_info;
4571 bool single_defuse_cycle = false;
4572 tree reduc_def = NULL_TREE;
4573 gimple new_stmt = NULL;
4574 int j;
4575 tree ops[3];
4576 bool nested_cycle = false, found_nested_cycle_def = false;
4577 gimple reduc_def_stmt = NULL;
4578 /* The default is that the reduction variable is the last operand in the statement. */
4579 int reduc_index = 2;
4580 bool double_reduc = false, dummy;
4581 basic_block def_bb;
4582 struct loop * def_stmt_loop, *outer_loop = NULL;
4583 tree def_arg;
4584 gimple def_arg_stmt;
4585 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4586 VEC (gimple, heap) *phis = NULL;
4587 int vec_num;
4588 tree def0, def1, tem, op0, op1 = NULL_TREE;
4590 /* In case of reduction chain we switch to the first stmt in the chain, but
4591 we don't update STMT_INFO, since only the last stmt is marked as reduction
4592 and has reduction properties. */
4593 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4594 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4596 if (nested_in_vect_loop_p (loop, stmt))
4598 outer_loop = loop;
4599 loop = loop->inner;
4600 nested_cycle = true;
4603 /* 1. Is vectorizable reduction? */
4604 /* Not supportable if the reduction variable is used in the loop, unless
4605 it's a reduction chain. */
4606 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4607 && !GROUP_FIRST_ELEMENT (stmt_info))
4608 return false;
4610 /* Reductions that are not used even in an enclosing outer-loop
4611 are expected to be "live" (used out of the loop). */
4612 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4613 && !STMT_VINFO_LIVE_P (stmt_info))
4614 return false;
4616 /* Make sure it was already recognized as a reduction computation. */
4617 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4618 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4619 return false;
4621 /* 2. Has this been recognized as a reduction pattern?
4623 Check if STMT represents a pattern that has been recognized
4624 in earlier analysis stages. For stmts that represent a pattern,
4625 the STMT_VINFO_RELATED_STMT field records the last stmt in
4626 the original sequence that constitutes the pattern. */
4628 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4629 if (orig_stmt)
4631 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4632 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4633 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4634 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4637 /* 3. Check the operands of the operation. The first operands are defined
4638 inside the loop body. The last operand is the reduction variable,
4639 which is defined by the loop-header-phi. */
4641 gcc_assert (is_gimple_assign (stmt));
4643 /* Flatten RHS. */
4644 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4646 case GIMPLE_SINGLE_RHS:
4647 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4648 if (op_type == ternary_op)
4650 tree rhs = gimple_assign_rhs1 (stmt);
4651 ops[0] = TREE_OPERAND (rhs, 0);
4652 ops[1] = TREE_OPERAND (rhs, 1);
4653 ops[2] = TREE_OPERAND (rhs, 2);
4654 code = TREE_CODE (rhs);
4656 else
4657 return false;
4658 break;
4660 case GIMPLE_BINARY_RHS:
4661 code = gimple_assign_rhs_code (stmt);
4662 op_type = TREE_CODE_LENGTH (code);
4663 gcc_assert (op_type == binary_op);
4664 ops[0] = gimple_assign_rhs1 (stmt);
4665 ops[1] = gimple_assign_rhs2 (stmt);
4666 break;
4668 case GIMPLE_TERNARY_RHS:
4669 code = gimple_assign_rhs_code (stmt);
4670 op_type = TREE_CODE_LENGTH (code);
4671 gcc_assert (op_type == ternary_op);
4672 ops[0] = gimple_assign_rhs1 (stmt);
4673 ops[1] = gimple_assign_rhs2 (stmt);
4674 ops[2] = gimple_assign_rhs3 (stmt);
4675 break;
4677 case GIMPLE_UNARY_RHS:
4678 return false;
4680 default:
4681 gcc_unreachable ();
4684 if (code == COND_EXPR && slp_node)
4685 return false;
4687 scalar_dest = gimple_assign_lhs (stmt);
4688 scalar_type = TREE_TYPE (scalar_dest);
4689 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4690 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4691 return false;
4693 /* Do not try to vectorize bit-precision reductions. */
4694 if ((TYPE_PRECISION (scalar_type)
4695 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4696 return false;
4698 /* All uses but the last are expected to be defined in the loop.
4699 The last use is the reduction variable. In case of nested cycle this
4700 assumption is not true: we use reduc_index to record the index of the
4701 reduction variable. */
4702 for (i = 0; i < op_type-1; i++)
4704 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4705 if (i == 0 && code == COND_EXPR)
4706 continue;
4708 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4709 &def_stmt, &def, &dt, &tem);
4710 if (!vectype_in)
4711 vectype_in = tem;
4712 gcc_assert (is_simple_use);
4714 if (dt != vect_internal_def
4715 && dt != vect_external_def
4716 && dt != vect_constant_def
4717 && dt != vect_induction_def
4718 && !(dt == vect_nested_cycle && nested_cycle))
4719 return false;
4721 if (dt == vect_nested_cycle)
4723 found_nested_cycle_def = true;
4724 reduc_def_stmt = def_stmt;
4725 reduc_index = i;
4729 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4730 &def_stmt, &def, &dt, &tem);
4731 if (!vectype_in)
4732 vectype_in = tem;
4733 gcc_assert (is_simple_use);
4734 gcc_assert (dt == vect_reduction_def
4735 || dt == vect_nested_cycle
4736 || ((dt == vect_internal_def || dt == vect_external_def
4737 || dt == vect_constant_def || dt == vect_induction_def)
4738 && nested_cycle && found_nested_cycle_def));
4739 if (!found_nested_cycle_def)
4740 reduc_def_stmt = def_stmt;
4742 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4743 if (orig_stmt)
4744 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4745 reduc_def_stmt,
4746 !nested_cycle,
4747 &dummy));
4748 else
4750 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4751 !nested_cycle, &dummy);
4752 /* We changed STMT to be the first stmt in reduction chain, hence we
4753 check that in this case the first element in the chain is STMT. */
4754 gcc_assert (stmt == tmp
4755 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4758 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4759 return false;
4761 if (slp_node || PURE_SLP_STMT (stmt_info))
4762 ncopies = 1;
4763 else
4764 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4765 / TYPE_VECTOR_SUBPARTS (vectype_in));
4767 gcc_assert (ncopies >= 1);
4769 vec_mode = TYPE_MODE (vectype_in);
4771 if (code == COND_EXPR)
4773 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4775 if (dump_enabled_p ())
4776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4777 "unsupported condition in reduction");
4779 return false;
4782 else
4784 /* 4. Supportable by target? */
4786 /* 4.1. check support for the operation in the loop */
4787 optab = optab_for_tree_code (code, vectype_in, optab_default);
4788 if (!optab)
4790 if (dump_enabled_p ())
4791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4792 "no optab.");
4794 return false;
4797 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4799 if (dump_enabled_p ())
4800 dump_printf (MSG_NOTE, "op not supported by target.");
4802 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4803 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4804 < vect_min_worthwhile_factor (code))
4805 return false;
4807 if (dump_enabled_p ())
4808 dump_printf (MSG_NOTE, "proceeding using word mode.");
4811 /* Worthwhile without SIMD support? */
4812 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4813 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4814 < vect_min_worthwhile_factor (code))
4816 if (dump_enabled_p ())
4817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4818 "not worthwhile without SIMD support.");
4820 return false;
4824 /* 4.2. Check support for the epilog operation.
4826 If STMT represents a reduction pattern, then the type of the
4827 reduction variable may be different than the type of the rest
4828 of the arguments. For example, consider the case of accumulation
4829 of shorts into an int accumulator; The original code:
4830 S1: int_a = (int) short_a;
4831 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4833 was replaced with:
4834 STMT: int_acc = widen_sum <short_a, int_acc>
4836 This means that:
4837 1. The tree-code that is used to create the vector operation in the
4838 epilog code (that reduces the partial results) is not the
4839 tree-code of STMT, but is rather the tree-code of the original
4840 stmt from the pattern that STMT is replacing. I.e, in the example
4841 above we want to use 'widen_sum' in the loop, but 'plus' in the
4842 epilog.
4843 2. The type (mode) we use to check available target support
4844 for the vector operation to be created in the *epilog*, is
4845 determined by the type of the reduction variable (in the example
4846 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4847 However the type (mode) we use to check available target support
4848 for the vector operation to be created *inside the loop*, is
4849 determined by the type of the other arguments to STMT (in the
4850 example we'd check this: optab_handler (widen_sum_optab,
4851 vect_short_mode)).
4853 This is contrary to "regular" reductions, in which the types of all
4854 the arguments are the same as the type of the reduction variable.
4855 For "regular" reductions we can therefore use the same vector type
4856 (and also the same tree-code) when generating the epilog code and
4857 when generating the code inside the loop. */
4859 if (orig_stmt)
4861 /* This is a reduction pattern: get the vectype from the type of the
4862 reduction variable, and get the tree-code from orig_stmt. */
4863 orig_code = gimple_assign_rhs_code (orig_stmt);
4864 gcc_assert (vectype_out);
4865 vec_mode = TYPE_MODE (vectype_out);
4867 else
4869 /* Regular reduction: use the same vectype and tree-code as used for
4870 the vector code inside the loop can be used for the epilog code. */
4871 orig_code = code;
4874 if (nested_cycle)
4876 def_bb = gimple_bb (reduc_def_stmt);
4877 def_stmt_loop = def_bb->loop_father;
4878 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4879 loop_preheader_edge (def_stmt_loop));
4880 if (TREE_CODE (def_arg) == SSA_NAME
4881 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4882 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4883 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4884 && vinfo_for_stmt (def_arg_stmt)
4885 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4886 == vect_double_reduction_def)
4887 double_reduc = true;
4890 epilog_reduc_code = ERROR_MARK;
4891 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4893 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4894 optab_default);
4895 if (!reduc_optab)
4897 if (dump_enabled_p ())
4898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4899 "no optab for reduction.");
4901 epilog_reduc_code = ERROR_MARK;
4904 if (reduc_optab
4905 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4907 if (dump_enabled_p ())
4908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4909 "reduc op not supported by target.");
4911 epilog_reduc_code = ERROR_MARK;
4914 else
4916 if (!nested_cycle || double_reduc)
4918 if (dump_enabled_p ())
4919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4920 "no reduc code for scalar code.");
4922 return false;
4926 if (double_reduc && ncopies > 1)
4928 if (dump_enabled_p ())
4929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4930 "multiple types in double reduction");
4932 return false;
4935 /* In case of widening multiplication by a constant, we update the type
4936 of the constant to be the type of the other operand. We check that the
4937 constant fits the type in the pattern recognition pass. */
4938 if (code == DOT_PROD_EXPR
4939 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4941 if (TREE_CODE (ops[0]) == INTEGER_CST)
4942 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4943 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4944 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4945 else
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4949 "invalid types in dot-prod");
4951 return false;
4955 if (!vec_stmt) /* transformation not required. */
4957 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4958 return false;
4959 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4960 return true;
4963 /** Transform. **/
4965 if (dump_enabled_p ())
4966 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4968 /* FORNOW: Multiple types are not supported for condition. */
4969 if (code == COND_EXPR)
4970 gcc_assert (ncopies == 1);
4972 /* Create the destination vector */
4973 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4975 /* In case the vectorization factor (VF) is bigger than the number
4976 of elements that we can fit in a vectype (nunits), we have to generate
4977 more than one vector stmt - i.e - we need to "unroll" the
4978 vector stmt by a factor VF/nunits. For more details see documentation
4979 in vectorizable_operation. */
4981 /* If the reduction is used in an outer loop we need to generate
4982 VF intermediate results, like so (e.g. for ncopies=2):
4983 r0 = phi (init, r0)
4984 r1 = phi (init, r1)
4985 r0 = x0 + r0;
4986 r1 = x1 + r1;
4987 (i.e. we generate VF results in 2 registers).
4988 In this case we have a separate def-use cycle for each copy, and therefore
4989 for each copy we get the vector def for the reduction variable from the
4990 respective phi node created for this copy.
4992 Otherwise (the reduction is unused in the loop nest), we can combine
4993 together intermediate results, like so (e.g. for ncopies=2):
4994 r = phi (init, r)
4995 r = x0 + r;
4996 r = x1 + r;
4997 (i.e. we generate VF/2 results in a single register).
4998 In this case for each copy we get the vector def for the reduction variable
4999 from the vectorized reduction operation generated in the previous iteration.
5002 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5004 single_defuse_cycle = true;
5005 epilog_copies = 1;
5007 else
5008 epilog_copies = ncopies;
5010 prev_stmt_info = NULL;
5011 prev_phi_info = NULL;
5012 if (slp_node)
5014 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5015 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5016 == TYPE_VECTOR_SUBPARTS (vectype_in));
5018 else
5020 vec_num = 1;
5021 vec_oprnds0 = VEC_alloc (tree, heap, 1);
5022 if (op_type == ternary_op)
5023 vec_oprnds1 = VEC_alloc (tree, heap, 1);
5026 phis = VEC_alloc (gimple, heap, vec_num);
5027 vect_defs = VEC_alloc (tree, heap, vec_num);
5028 if (!slp_node)
5029 VEC_quick_push (tree, vect_defs, NULL_TREE);
5031 for (j = 0; j < ncopies; j++)
5033 if (j == 0 || !single_defuse_cycle)
5035 for (i = 0; i < vec_num; i++)
5037 /* Create the reduction-phi that defines the reduction
5038 operand. */
5039 new_phi = create_phi_node (vec_dest, loop->header);
5040 set_vinfo_for_stmt (new_phi,
5041 new_stmt_vec_info (new_phi, loop_vinfo,
5042 NULL));
5043 if (j == 0 || slp_node)
5044 VEC_quick_push (gimple, phis, new_phi);
5048 if (code == COND_EXPR)
5050 gcc_assert (!slp_node);
5051 vectorizable_condition (stmt, gsi, vec_stmt,
5052 PHI_RESULT (VEC_index (gimple, phis, 0)),
5053 reduc_index, NULL);
5054 /* Multiple types are not supported for condition. */
5055 break;
5058 /* Handle uses. */
5059 if (j == 0)
5061 op0 = ops[!reduc_index];
5062 if (op_type == ternary_op)
5064 if (reduc_index == 0)
5065 op1 = ops[2];
5066 else
5067 op1 = ops[1];
5070 if (slp_node)
5071 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5072 slp_node, -1);
5073 else
5075 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5076 stmt, NULL);
5077 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
5078 if (op_type == ternary_op)
5080 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5081 NULL);
5082 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
5086 else
5088 if (!slp_node)
5090 enum vect_def_type dt;
5091 gimple dummy_stmt;
5092 tree dummy;
5094 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5095 &dummy_stmt, &dummy, &dt);
5096 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5097 loop_vec_def0);
5098 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
5099 if (op_type == ternary_op)
5101 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5102 &dummy, &dt);
5103 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5104 loop_vec_def1);
5105 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
5109 if (single_defuse_cycle)
5110 reduc_def = gimple_assign_lhs (new_stmt);
5112 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5115 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
5117 if (slp_node)
5118 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
5119 else
5121 if (!single_defuse_cycle || j == 0)
5122 reduc_def = PHI_RESULT (new_phi);
5125 def1 = ((op_type == ternary_op)
5126 ? VEC_index (tree, vec_oprnds1, i) : NULL);
5127 if (op_type == binary_op)
5129 if (reduc_index == 0)
5130 expr = build2 (code, vectype_out, reduc_def, def0);
5131 else
5132 expr = build2 (code, vectype_out, def0, reduc_def);
5134 else
5136 if (reduc_index == 0)
5137 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5138 else
5140 if (reduc_index == 1)
5141 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5142 else
5143 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5147 new_stmt = gimple_build_assign (vec_dest, expr);
5148 new_temp = make_ssa_name (vec_dest, new_stmt);
5149 gimple_assign_set_lhs (new_stmt, new_temp);
5150 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5152 if (slp_node)
5154 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5155 VEC_quick_push (tree, vect_defs, new_temp);
5157 else
5158 VEC_replace (tree, vect_defs, 0, new_temp);
5161 if (slp_node)
5162 continue;
5164 if (j == 0)
5165 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5166 else
5167 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5169 prev_stmt_info = vinfo_for_stmt (new_stmt);
5170 prev_phi_info = vinfo_for_stmt (new_phi);
5173 /* Finalize the reduction-phi (set its arguments) and create the
5174 epilog reduction code. */
5175 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5177 new_temp = gimple_assign_lhs (*vec_stmt);
5178 VEC_replace (tree, vect_defs, 0, new_temp);
5181 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5182 epilog_reduc_code, phis, reduc_index,
5183 double_reduc, slp_node);
5185 VEC_free (gimple, heap, phis);
5186 VEC_free (tree, heap, vec_oprnds0);
5187 if (vec_oprnds1)
5188 VEC_free (tree, heap, vec_oprnds1);
5190 return true;
5193 /* Function vect_min_worthwhile_factor.
5195 For a loop where we could vectorize the operation indicated by CODE,
5196 return the minimum vectorization factor that makes it worthwhile
5197 to use generic vectors. */
5198 int
5199 vect_min_worthwhile_factor (enum tree_code code)
5201 switch (code)
5203 case PLUS_EXPR:
5204 case MINUS_EXPR:
5205 case NEGATE_EXPR:
5206 return 4;
5208 case BIT_AND_EXPR:
5209 case BIT_IOR_EXPR:
5210 case BIT_XOR_EXPR:
5211 case BIT_NOT_EXPR:
5212 return 2;
5214 default:
5215 return INT_MAX;
5220 /* Function vectorizable_induction
5222 Check if PHI performs an induction computation that can be vectorized.
5223 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5224 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5225 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
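/* For illustration only (assuming a 4-lane vector type): the induction

     for (i = 0; i < N; i++)
       a[i] = i;

   is vectorized by building the initial vector {0, 1, 2, 3} and adding
   the step vector {4, 4, 4, 4} on each vector iteration; that work is
   done by get_initial_def_for_induction, invoked from the transform
   code below.  */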
5227 bool
5228 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5229 gimple *vec_stmt)
5231 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5232 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5233 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5235 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5236 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5237 tree vec_def;
5239 gcc_assert (ncopies >= 1);
5240 /* FORNOW. These restrictions should be relaxed. */
5241 if (nested_in_vect_loop_p (loop, phi))
5243 imm_use_iterator imm_iter;
5244 use_operand_p use_p;
5245 gimple exit_phi;
5246 edge latch_e;
5247 tree loop_arg;
5249 if (ncopies > 1)
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5253 "multiple types in nested loop.");
5254 return false;
5257 exit_phi = NULL;
5258 latch_e = loop_latch_edge (loop->inner);
5259 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5260 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5262 if (!flow_bb_inside_loop_p (loop->inner,
5263 gimple_bb (USE_STMT (use_p))))
5265 exit_phi = USE_STMT (use_p);
5266 break;
5269 if (exit_phi)
5271 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5272 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5273 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5277 "inner-loop induction only used outside "
5278 "of the outer vectorized loop.");
5279 return false;
5284 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5285 return false;
5287 /* FORNOW: SLP not supported. */
5288 if (STMT_SLP_TYPE (stmt_info))
5289 return false;
5291 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5293 if (gimple_code (phi) != GIMPLE_PHI)
5294 return false;
5296 if (!vec_stmt) /* transformation not required. */
5298 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_NOTE, vect_location,
5301 "=== vectorizable_induction ===");
5302 vect_model_induction_cost (stmt_info, ncopies);
5303 return true;
5306 /** Transform. **/
5308 if (dump_enabled_p ())
5309 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5311 vec_def = get_initial_def_for_induction (phi);
5312 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5313 return true;
5316 /* Function vectorizable_live_operation.
5318 STMT computes a value that is used outside the loop. Check if
5319 it can be supported. */
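/* For illustration only (example names): the only case accepted below is
   of the form

     for (i = 0; i < N; i++)
       last = x + y;            // x and y defined before the loop
     use (last);

   where every operand of the live stmt is loop-invariant or constant, so
   the scalar stmt can simply stay in place and its final value be used
   after the loop.  */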
5321 bool
5322 vectorizable_live_operation (gimple stmt,
5323 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5324 gimple *vec_stmt ATTRIBUTE_UNUSED)
5326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5327 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5328 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5329 int i;
5330 int op_type;
5331 tree op;
5332 tree def;
5333 gimple def_stmt;
5334 enum vect_def_type dt;
5335 enum tree_code code;
5336 enum gimple_rhs_class rhs_class;
5338 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5341 return false;
5343 if (!is_gimple_assign (stmt))
5344 return false;
5346 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5347 return false;
5349 /* FORNOW. CHECKME. */
5350 if (nested_in_vect_loop_p (loop, stmt))
5351 return false;
5353 code = gimple_assign_rhs_code (stmt);
5354 op_type = TREE_CODE_LENGTH (code);
5355 rhs_class = get_gimple_rhs_class (code);
5356 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5357 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5359 /* FORNOW: support only if all uses are invariant. This means
5360 that the scalar operations can remain in place, unvectorized.
5361 The original last scalar value that they compute will be used. */
5363 for (i = 0; i < op_type; i++)
5365 if (rhs_class == GIMPLE_SINGLE_RHS)
5366 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5367 else
5368 op = gimple_op (stmt, i + 1);
5369 if (op
5370 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5371 &dt))
5373 if (dump_enabled_p ())
5374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5375 "use not simple.");
5376 return false;
5379 if (dt != vect_external_def && dt != vect_constant_def)
5380 return false;
5383 /* No transformation is required for the cases we currently support. */
5384 return true;
5387 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5389 static void
5390 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5392 ssa_op_iter op_iter;
5393 imm_use_iterator imm_iter;
5394 def_operand_p def_p;
5395 gimple ustmt;
5397 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5399 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5401 basic_block bb;
5403 if (!is_gimple_debug (ustmt))
5404 continue;
5406 bb = gimple_bb (ustmt);
5408 if (!flow_bb_inside_loop_p (loop, bb))
5410 if (gimple_debug_bind_p (ustmt))
5412 if (dump_enabled_p ())
5413 dump_printf_loc (MSG_NOTE, vect_location,
5414 "killing debug use");
5416 gimple_debug_bind_reset_value (ustmt);
5417 update_stmt (ustmt);
5419 else
5420 gcc_unreachable ();
5426 /* Function vect_transform_loop.
5428 The analysis phase has determined that the loop is vectorizable.
5429 Vectorize the loop - create vectorized stmts to replace the scalar
5430 stmts in the loop, and update the loop exit condition. */
5432 void
5433 vect_transform_loop (loop_vec_info loop_vinfo)
5435 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5436 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5437 int nbbs = loop->num_nodes;
5438 gimple_stmt_iterator si;
5439 int i;
5440 tree ratio = NULL;
5441 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5442 bool grouped_store;
5443 bool slp_scheduled = false;
5444 unsigned int nunits;
5445 gimple stmt, pattern_stmt;
5446 gimple_seq pattern_def_seq = NULL;
5447 gimple_stmt_iterator pattern_def_si = gsi_none ();
5448 bool transform_pattern_stmt = false;
5449 bool check_profitability = false;
5450 int th;
5452 if (dump_enabled_p ())
 5453 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===");
 5455 /* Use the more conservative vectorization threshold.  If the number
 5456 of iterations is constant, assume the cost check has been performed
 5457 by our caller.  If the threshold already makes every loop that runs
 5458 at least the vectorization factor number of times profitable, the
 5459 runtime check is pointless, too. */
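   /* Editor's illustration (assumed numbers, not from the sources): if
      PARAM_MIN_VECT_LOOP_BOUND is taken to be 0 and the vectorization
      factor is 4, the first candidate threshold is 0 * 4 - 1 = -1, so TH
      ends up being the cost model's minimum iteration count; the runtime
      profitability check below is then emitted only if that bound is at
      least VF - 1 iterations and the iteration count is not known at
      compile time.  */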
5460 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5461 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5462 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5463 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5464 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5466 if (dump_enabled_p ())
5467 dump_printf_loc (MSG_NOTE, vect_location,
5468 "Profitability threshold is %d loop iterations.", th);
5469 check_profitability = true;
 5472 /* Peel the loop if there are data refs with unknown alignment.
 5473 Only one data ref with unknown alignment in a store is allowed. */
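   /* Editor's illustration (assumed numbers, not from the sources): if the
      vector accesses to an array A must be 16-byte aligned, the elements are
      4 bytes wide and A happens to start 8 bytes past an aligned address,
      peeling (16 - 8) / 4 = 2 scalar iterations lets the vectorized loop
      start at an aligned address.  */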
5475 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5477 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5478 check_profitability = false;
5481 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5482 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5484 vect_loop_versioning (loop_vinfo, th, check_profitability);
5485 check_profitability = false;
 5488 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
 5489 compile time constant), or it is a constant that is not a multiple of
 5490 the vectorization factor, then an epilog loop needs to be created.
 5491 We therefore duplicate the loop: the original loop will be vectorized,
 5492 and will execute the first (n/VF) vector iterations. The second copy of
 5493 the loop will remain scalar and will compute the remaining (n%VF) iterations.
 5494 (VF is the vectorization factor). */
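   /* Editor's illustration (assumed numbers, not from the sources): with
      n = 103 and VF = 8 the vectorized loop executes 103 / 8 = 12 vector
      iterations covering the first 96 scalar iterations, and the scalar
      epilog loop executes the remaining 103 % 8 = 7 iterations.  */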
5496 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5497 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5498 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5499 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5500 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5501 th, check_profitability);
5502 else
5503 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5504 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5506 /* 1) Make sure the loop header has exactly two entries
5507 2) Make sure we have a preheader basic block. */
5509 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5511 split_edge (loop_preheader_edge (loop));
 5513 /* FORNOW: the vectorizer supports only loops whose body consists
 5514 of one basic block (header + empty latch). When the vectorizer
 5515 supports more involved loop forms, the order in which the BBs are
 5516 traversed will need to be reconsidered. */
5518 for (i = 0; i < nbbs; i++)
5520 basic_block bb = bbs[i];
5521 stmt_vec_info stmt_info;
5522 gimple phi;
5524 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5526 phi = gsi_stmt (si);
5527 if (dump_enabled_p ())
5529 dump_printf_loc (MSG_NOTE, vect_location,
5530 "------>vectorizing phi: ");
5531 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5533 stmt_info = vinfo_for_stmt (phi);
5534 if (!stmt_info)
5535 continue;
5537 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5538 vect_loop_kill_debug_uses (loop, phi);
5540 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5541 && !STMT_VINFO_LIVE_P (stmt_info))
5542 continue;
5544 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5545 != (unsigned HOST_WIDE_INT) vectorization_factor)
5546 && dump_enabled_p ())
5547 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5549 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5551 if (dump_enabled_p ())
5552 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5553 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5557 pattern_stmt = NULL;
5558 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5560 bool is_store;
5562 if (transform_pattern_stmt)
5563 stmt = pattern_stmt;
5564 else
5565 stmt = gsi_stmt (si);
5567 if (dump_enabled_p ())
5569 dump_printf_loc (MSG_NOTE, vect_location,
5570 "------>vectorizing statement: ");
5571 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5574 stmt_info = vinfo_for_stmt (stmt);
5576 /* vector stmts created in the outer-loop during vectorization of
5577 stmts in an inner-loop may not have a stmt_info, and do not
5578 need to be vectorized. */
5579 if (!stmt_info)
5581 gsi_next (&si);
5582 continue;
5585 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5586 vect_loop_kill_debug_uses (loop, stmt);
5588 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5589 && !STMT_VINFO_LIVE_P (stmt_info))
5591 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5592 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5593 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5594 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5596 stmt = pattern_stmt;
5597 stmt_info = vinfo_for_stmt (stmt);
5599 else
5601 gsi_next (&si);
5602 continue;
5605 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5606 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5607 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5608 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5609 transform_pattern_stmt = true;
5611 /* If pattern statement has def stmts, vectorize them too. */
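   /* Editor's note (an explanatory sketch, not from the original sources):
      the pattern recognizer may attach a PATTERN_DEF_SEQ to a pattern stmt,
      holding auxiliary statements it created (for instance an intermediate
      type conversion feeding the main pattern stmt).  Any relevant stmts in
      that sequence are vectorized below before the pattern stmt itself.  */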
5612 if (is_pattern_stmt_p (stmt_info))
5614 if (pattern_def_seq == NULL)
5616 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5617 pattern_def_si = gsi_start (pattern_def_seq);
5619 else if (!gsi_end_p (pattern_def_si))
5620 gsi_next (&pattern_def_si);
5621 if (pattern_def_seq != NULL)
5623 gimple pattern_def_stmt = NULL;
5624 stmt_vec_info pattern_def_stmt_info = NULL;
5626 while (!gsi_end_p (pattern_def_si))
5628 pattern_def_stmt = gsi_stmt (pattern_def_si);
5629 pattern_def_stmt_info
5630 = vinfo_for_stmt (pattern_def_stmt);
5631 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5632 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5633 break;
5634 gsi_next (&pattern_def_si);
5637 if (!gsi_end_p (pattern_def_si))
5639 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_NOTE, vect_location,
5642 "==> vectorizing pattern def "
5643 "stmt: ");
5644 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5645 pattern_def_stmt, 0);
5648 stmt = pattern_def_stmt;
5649 stmt_info = pattern_def_stmt_info;
5651 else
5653 pattern_def_si = gsi_none ();
5654 transform_pattern_stmt = false;
5657 else
5658 transform_pattern_stmt = false;
5661 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5662 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5663 STMT_VINFO_VECTYPE (stmt_info));
5664 if (!STMT_SLP_TYPE (stmt_info)
5665 && nunits != (unsigned int) vectorization_factor
5666 && dump_enabled_p ())
 5667 /* For SLP, VF is set according to the unrolling factor, and not
 5668 to the vector size; hence for SLP this print is not valid. */
5669 dump_printf_loc (MSG_NOTE, vect_location,
5670 "multiple-types.");
5672 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5673 reached. */
5674 if (STMT_SLP_TYPE (stmt_info))
5676 if (!slp_scheduled)
5678 slp_scheduled = true;
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_NOTE, vect_location,
5682 "=== scheduling SLP instances ===");
5684 vect_schedule_slp (loop_vinfo, NULL);
5687 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5688 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5690 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5692 pattern_def_seq = NULL;
5693 gsi_next (&si);
5695 continue;
5699 /* -------- vectorize statement ------------ */
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5703 grouped_store = false;
5704 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5705 if (is_store)
5707 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5709 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5710 interleaving chain was completed - free all the stores in
5711 the chain. */
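   /* Editor's note (an explanatory sketch, not from the original sources):
      e.g. for a group of stores a[2*i] = x; a[2*i+1] = y; the whole chain is
      emitted as vector stores only when its last member is transformed, so
      at that point every scalar store of the group can be removed at once.  */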
5712 gsi_next (&si);
5713 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5714 continue;
5716 else
5718 /* Free the attached stmt_vec_info and remove the stmt. */
5719 gimple store = gsi_stmt (si);
5720 free_stmt_vec_info (store);
5721 unlink_stmt_vdef (store);
5722 gsi_remove (&si, true);
5723 release_defs (store);
5724 continue;
5728 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5730 pattern_def_seq = NULL;
5731 gsi_next (&si);
5733 } /* stmts in BB */
5734 } /* BBs in loop */
5736 slpeel_make_loop_iterate_ntimes (loop, ratio);
5738 /* The memory tags and pointers in vectorized statements need to
5739 have their SSA forms updated. FIXME, why can't this be delayed
5740 until all the loops have been transformed? */
5741 update_ssa (TODO_update_ssa);
5743 if (dump_enabled_p ())
5744 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5745 if (loop->inner && dump_enabled_p ())
5746 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5747 "OUTER LOOP VECTORIZED.");