gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-flow.h"
  32 #include "tree-pass.h"
  33 #include "cfgloop.h"
  34 #include "expr.h"
  35 #include "recog.h"
  36 #include "optabs.h"
  37 #include "params.h"
  38 #include "diagnostic-core.h"
  39 #include "tree-chrec.h"
  40 #include "tree-scalar-evolution.h"
  41 #include "tree-vectorizer.h"
  42 #include "target.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs").  These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized.  It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S).  This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127    Targets that can support different sizes of vectors, for now will need
 128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 129    flexibility will be added in the future.
 130
 131         Since we only vectorize operations which vector form can be
 132    expressed using existing tree codes, to verify that an operation is
 133    supported, the vectorizer checks the relevant optab at the relevant
 134    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 135    the value found is CODE_FOR_nothing, then there's no target support, and
 136    we can't vectorize the stmt.
 137
 138    For additional information on this project see:
 139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 140 */
 141
 142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    We currently support vectorization of loops in which all types operated upon
 153    are of the same size.  Therefore this function currently sets VF according to
 154    the size of the types operated upon, and fails if there are multiple sizes
 155    in the loop.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
 184   gimple stmt, pattern_stmt = NULL;
 185   gimple_seq pattern_def_seq = NULL;
 186   gimple_stmt_iterator pattern_def_si = gsi_none ();
 187   bool analyze_pattern_stmt = false;
 188
 189   if (dump_enabled_p ())
 190     dump_printf_loc (MSG_NOTE, vect_location,
 191                      "=== vect_determine_vectorization_factor ===");
 192
 193   for (i = 0; i < nbbs; i++)
 194     {
 195       basic_block bb = bbs[i];
 196
 197       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 198         {
 199           phi = gsi_stmt (si);
 200           stmt_info = vinfo_for_stmt (phi);
 201           if (dump_enabled_p ())
 202             {
 203               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 204               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 205             }
 206
 207           gcc_assert (stmt_info);
 208
 209           if (STMT_VINFO_RELEVANT_P (stmt_info))
 210             {
 211               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 212               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 213
 214               if (dump_enabled_p ())
 215                 {
 216                   dump_printf_loc (MSG_NOTE, vect_location,
 217                                    "get vectype for scalar type:  ");
 218                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 219                 }
 220
 221               vectype = get_vectype_for_scalar_type (scalar_type);
 222               if (!vectype)
 223                 {
 224                   if (dump_enabled_p ())
 225                     {
 226                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 227                                        "not vectorized: unsupported "
 228                                        "data-type ");
 229                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 230                                          scalar_type);
 231                     }
 232                   return false;
 233                 }
 234               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 235
 236               if (dump_enabled_p ())
 237                 {
 238                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 239                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 240                 }
 241
 242               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 243               if (dump_enabled_p ())
 244                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
 245
 246               if (!vectorization_factor
 247                   || (nunits > vectorization_factor))
 248                 vectorization_factor = nunits;
 249             }
 250         }
 251
 252       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 253         {
 254           tree vf_vectype;
 255
 256           if (analyze_pattern_stmt)
 257             stmt = pattern_stmt;
 258           else
 259             stmt = gsi_stmt (si);
 260
 261           stmt_info = vinfo_for_stmt (stmt);
 262
 263           if (dump_enabled_p ())
 264             {
 265               dump_printf_loc (MSG_NOTE, vect_location,
 266                                "==> examining statement: ");
 267               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 268             }
 269
 270           gcc_assert (stmt_info);
 271
 272           /* Skip stmts which do not need to be vectorized.  */
 273           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 274               && !STMT_VINFO_LIVE_P (stmt_info))
 275             {
 276               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 277                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 278                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 279                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 280                 {
 281                   stmt = pattern_stmt;
 282                   stmt_info = vinfo_for_stmt (pattern_stmt);
 283                   if (dump_enabled_p ())
 284                     {
 285                       dump_printf_loc (MSG_NOTE, vect_location,
 286                                        "==> examining pattern statement: ");
 287                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288                     }
 289                 }
 290               else
 291                 {
 292                   if (dump_enabled_p ())
 293                     dump_printf_loc (MSG_NOTE, vect_location, "skip.");
 294                   gsi_next (&si);
 295                   continue;
 296                 }
 297             }
 298           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 302             analyze_pattern_stmt = true;
 303
 304           /* If a pattern statement has def stmts, analyze them too.  */
 305           if (is_pattern_stmt_p (stmt_info))
 306             {
 307               if (pattern_def_seq == NULL)
 308                 {
 309                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 310                   pattern_def_si = gsi_start (pattern_def_seq);
 311                 }
 312               else if (!gsi_end_p (pattern_def_si))
 313                 gsi_next (&pattern_def_si);
 314               if (pattern_def_seq != NULL)
 315                 {
 316                   gimple pattern_def_stmt = NULL;
 317                   stmt_vec_info pattern_def_stmt_info = NULL;
 318
 319                   while (!gsi_end_p (pattern_def_si))
 320                     {
 321                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 322                       pattern_def_stmt_info
 323                         = vinfo_for_stmt (pattern_def_stmt);
 324                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 325                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 326                         break;
 327                       gsi_next (&pattern_def_si);
 328                     }
 329
 330                   if (!gsi_end_p (pattern_def_si))
 331                     {
 332                       if (dump_enabled_p ())
 333                         {
 334                           dump_printf_loc (MSG_NOTE, vect_location,
 335                                            "==> examining pattern def stmt: ");
 336                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 337                                             pattern_def_stmt, 0);
 338                         }
 339
 340                       stmt = pattern_def_stmt;
 341                       stmt_info = pattern_def_stmt_info;
 342                     }
 343                   else
 344                     {
 345                       pattern_def_si = gsi_none ();
 346                       analyze_pattern_stmt = false;
 347                     }
 348                 }
 349               else
 350                 analyze_pattern_stmt = false;
 351             }
 352
 353           if (gimple_get_lhs (stmt) == NULL_TREE)
 354             {
 355               if (dump_enabled_p ())
 356                 {
 357                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 358                                    "not vectorized: irregular stmt.");
 359                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 360                                     0);
 361                 }
 362               return false;
 363             }
 364
 365           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 366             {
 367               if (dump_enabled_p ())
 368                 {
 369                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 370                                    "not vectorized: vector stmt in loop:");
 371                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 372                 }
 373               return false;
 374             }
 375
 376           if (STMT_VINFO_VECTYPE (stmt_info))
 377             {
 378               /* The only case when a vectype had been already set is for stmts
 379                  that contain a dataref, or for "pattern-stmts" (stmts
 380                  generated by the vectorizer to represent/replace a certain
 381                  idiom).  */
 382               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 383                           || is_pattern_stmt_p (stmt_info)
 384                           || !gsi_end_p (pattern_def_si));
 385               vectype = STMT_VINFO_VECTYPE (stmt_info);
 386             }
 387           else
 388             {
 389               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 390               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_NOTE, vect_location,
 394                                    "get vectype for scalar type:  ");
 395                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 396                 }
 397               vectype = get_vectype_for_scalar_type (scalar_type);
 398               if (!vectype)
 399                 {
 400                   if (dump_enabled_p ())
 401                     {
 402                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 403                                        "not vectorized: unsupported "
 404                                        "data-type ");
 405                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 406                                          scalar_type);
 407                     }
 408                   return false;
 409                 }
 410
 411               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 412
 413               if (dump_enabled_p ())
 414                 {
 415                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 416                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 417                 }
 418             }
 419
 420           /* The vectorization factor is according to the smallest
 421              scalar type (or the largest vector size, but we only
 422              support one vector size per loop).  */
 423           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 424                                                        &dummy);
 425           if (dump_enabled_p ())
 426             {
 427               dump_printf_loc (MSG_NOTE, vect_location,
 428                                "get vectype for scalar type:  ");
 429               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 430             }
 431           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 432           if (!vf_vectype)
 433             {
 434               if (dump_enabled_p ())
 435                 {
 436                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 437                                    "not vectorized: unsupported data-type ");
 438                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 439                                      scalar_type);
 440                 }
 441               return false;
 442             }
 443
 444           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 445                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 446             {
 447               if (dump_enabled_p ())
 448                 {
 449                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 450                                    "not vectorized: different sized vector "
 451                                    "types in statement, ");
 452                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 453                                      vectype);
 454                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 455                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 456                                      vf_vectype);
 457                 }
 458               return false;
 459             }
 460
 461           if (dump_enabled_p ())
 462             {
 463               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 464               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 465             }
 466
 467           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 468           if (dump_enabled_p ())
 469             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
 470           if (!vectorization_factor
 471               || (nunits > vectorization_factor))
 472             vectorization_factor = nunits;
 473
 474           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 475             {
 476               pattern_def_seq = NULL;
 477               gsi_next (&si);
 478             }
 479         }
 480     }
 481
 482   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 483   if (dump_enabled_p ())
 484     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
 485                      vectorization_factor);
 486   if (vectorization_factor <= 1)
 487     {
 488       if (dump_enabled_p ())
 489         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 490                          "not vectorized: unsupported data-type");
 491       return false;
 492     }
 493   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 494
 495   return true;
 496 }
 497
 498
 499 /* Function vect_is_simple_iv_evolution.
 500
 501    FORNOW: A simple evolution of an induction variables in the loop is
 502    considered a polynomial evolution with constant step.  */
 503
 504 static bool
 505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 506                              tree * step)
 507 {
 508   tree init_expr;
 509   tree step_expr;
 510   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 511
 512   /* When there is no evolution in this loop, the evolution function
 513      is not "simple".  */
 514   if (evolution_part == NULL_TREE)
 515     return false;
 516
 517   /* When the evolution is a polynomial of degree >= 2
 518      the evolution function is not "simple".  */
 519   if (tree_is_chrec (evolution_part))
 520     return false;
 521
 522   step_expr = evolution_part;
 523   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 524
 525   if (dump_enabled_p ())
 526     {
 527       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 528       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 529       dump_printf (MSG_NOTE, ",  init: ");
 530       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 531     }
 532
 533   *init = init_expr;
 534   *step = step_expr;
 535
 536   if (TREE_CODE (step_expr) != INTEGER_CST)
 537     {
 538       if (dump_enabled_p ())
 539         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 540                          "step unknown.");
 541       return false;
 542     }
 543
 544   return true;
 545 }
 546
 547 /* Function vect_analyze_scalar_cycles_1.
 548
 549    Examine the cross iteration def-use cycles of scalar variables
 550    in LOOP.  LOOP_VINFO represents the loop that is now being
 551    considered for vectorization (can be LOOP, or an outer-loop
 552    enclosing LOOP).  */
 553
 554 static void
 555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 556 {
 557   basic_block bb = loop->header;
 558   tree dumy;
 559   vec<gimple> worklist;
 560   worklist.create (64);
 561   gimple_stmt_iterator gsi;
 562   bool double_reduc;
 563
 564   if (dump_enabled_p ())
 565     dump_printf_loc (MSG_NOTE, vect_location,
 566                      "=== vect_analyze_scalar_cycles ===");
 567
 568   /* First - identify all inductions.  Reduction detection assumes that all the
 569      inductions have been identified, therefore, this order must not be
 570      changed.  */
 571   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 572     {
 573       gimple phi = gsi_stmt (gsi);
 574       tree access_fn = NULL;
 575       tree def = PHI_RESULT (phi);
 576       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 577
 578       if (dump_enabled_p ())
 579         {
 580           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 581           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 582         }
 583
 584       /* Skip virtual phi's.  The data dependences that are associated with
 585          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 586       if (virtual_operand_p (def))
 587         continue;
 588
 589       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 590
 591       /* Analyze the evolution function.  */
 592       access_fn = analyze_scalar_evolution (loop, def);
 593       if (access_fn)
 594         {
 595           STRIP_NOPS (access_fn);
 596           if (dump_enabled_p ())
 597             {
 598               dump_printf_loc (MSG_NOTE, vect_location,
 599                                "Access function of PHI: ");
 600               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 601             }
 602           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 603             = evolution_part_in_loop_num (access_fn, loop->num);
 604         }
 605
 606       if (!access_fn
 607           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 608         {
 609           worklist.safe_push (phi);
 610           continue;
 611         }
 612
 613       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 614
 615       if (dump_enabled_p ())
 616         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
 617       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 618     }
 619
 620
 621   /* Second - identify all reductions and nested cycles.  */
 622   while (worklist.length () > 0)
 623     {
 624       gimple phi = worklist.pop ();
 625       tree def = PHI_RESULT (phi);
 626       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 627       gimple reduc_stmt;
 628       bool nested_cycle;
 629
 630       if (dump_enabled_p ())
 631         {
 632           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 633           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 634         }
 635
 636       gcc_assert (!virtual_operand_p (def)
 637                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 638
 639       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 640       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 641                                                 &double_reduc);
 642       if (reduc_stmt)
 643         {
 644           if (double_reduc)
 645             {
 646               if (dump_enabled_p ())
 647                 dump_printf_loc (MSG_NOTE, vect_location,
 648                                  "Detected double reduction.");
 649
 650               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 651               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 652                                                     vect_double_reduction_def;
 653             }
 654           else
 655             {
 656               if (nested_cycle)
 657                 {
 658                   if (dump_enabled_p ())
 659                     dump_printf_loc (MSG_NOTE, vect_location,
 660                                      "Detected vectorizable nested cycle.");
 661
 662                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 663                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 664                                                              vect_nested_cycle;
 665                 }
 666               else
 667                 {
 668                   if (dump_enabled_p ())
 669                     dump_printf_loc (MSG_NOTE, vect_location,
 670                                      "Detected reduction.");
 671
 672                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 673                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 674                                                            vect_reduction_def;
 675                   /* Store the reduction cycles for possible vectorization in
 676                      loop-aware SLP.  */
 677                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 678                 }
 679             }
 680         }
 681       else
 682         if (dump_enabled_p ())
 683           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 684                            "Unknown def-use cycle pattern.");
 685     }
 686
 687   worklist.release ();
 688 }
 689
 690
 691 /* Function vect_analyze_scalar_cycles.
 692
 693    Examine the cross iteration def-use cycles of scalar variables, by
 694    analyzing the loop-header PHIs of scalar variables.  Classify each
 695    cycle as one of the following: invariant, induction, reduction, unknown.
 696    We do that for the loop represented by LOOP_VINFO, and also to its
 697    inner-loop, if exists.
 698    Examples for scalar cycles:
 699
 700    Example1: reduction:
 701
 702               loop1:
 703               for (i=0; i<N; i++)
 704                  sum += a[i];
 705
 706    Example2: induction:
 707
 708               loop2:
 709               for (i=0; i<N; i++)
 710                  a[i] = i;  */
 711
 712 static void
 713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 714 {
 715   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 716
 717   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 718
 719   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 720      Reductions in such inner-loop therefore have different properties than
 721      the reductions in the nest that gets vectorized:
 722      1. When vectorized, they are executed in the same order as in the original
 723         scalar loop, so we can't change the order of computation when
 724         vectorizing them.
 725      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 726         current checks are too strict.  */
 727
 728   if (loop->inner)
 729     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 730 }
 731
 732 /* Function vect_get_loop_niters.
 733
 734    Determine how many iterations the loop is executed.
 735    If an expression that represents the number of iterations
 736    can be constructed, place it in NUMBER_OF_ITERATIONS.
 737    Return the loop exit condition.  */
 738
 739 static gimple
 740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 741 {
 742   tree niters;
 743
 744   if (dump_enabled_p ())
 745     dump_printf_loc (MSG_NOTE, vect_location,
 746                      "=== get_loop_niters ===");
 747   niters = number_of_exit_cond_executions (loop);
 748
 749   if (niters != NULL_TREE
 750       && niters != chrec_dont_know)
 751     {
 752       *number_of_iterations = niters;
 753
 754       if (dump_enabled_p ())
 755         {
 756           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
 757           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
 758         }
 759     }
 760
 761   return get_loop_exit_condition (loop);
 762 }
 763
 764
 765 /* Function bb_in_loop_p
 766
 767    Used as predicate for dfs order traversal of the loop bbs.  */
 768
 769 static bool
 770 bb_in_loop_p (const_basic_block bb, const void *data)
 771 {
 772   const struct loop *const loop = (const struct loop *)data;
 773   if (flow_bb_inside_loop_p (loop, bb))
 774     return true;
 775   return false;
 776 }
 777
 778
 779 /* Function new_loop_vec_info.
 780
 781    Create and initialize a new loop_vec_info struct for LOOP, as well as
 782    stmt_vec_info structs for all the stmts in LOOP.  */
 783
 784 static loop_vec_info
 785 new_loop_vec_info (struct loop *loop)
 786 {
 787   loop_vec_info res;
 788   basic_block *bbs;
 789   gimple_stmt_iterator si;
 790   unsigned int i, nbbs;
 791
 792   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 793   LOOP_VINFO_LOOP (res) = loop;
 794
 795   bbs = get_loop_body (loop);
 796
 797   /* Create/Update stmt_info for all stmts in the loop.  */
 798   for (i = 0; i < loop->num_nodes; i++)
 799     {
 800       basic_block bb = bbs[i];
 801
 802       /* BBs in a nested inner-loop will have been already processed (because
 803          we will have called vect_analyze_loop_form for any nested inner-loop).
 804          Therefore, for stmts in an inner-loop we just want to update the
 805          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 806          loop_info of the outer-loop we are currently considering to vectorize
 807          (instead of the loop_info of the inner-loop).
 808          For stmts in other BBs we need to create a stmt_info from scratch.  */
 809       if (bb->loop_father != loop)
 810         {
 811           /* Inner-loop bb.  */
 812           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 813           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 814             {
 815               gimple phi = gsi_stmt (si);
 816               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 817               loop_vec_info inner_loop_vinfo =
 818                 STMT_VINFO_LOOP_VINFO (stmt_info);
 819               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 820               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 821             }
 822           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 823            {
 824               gimple stmt = gsi_stmt (si);
 825               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 826               loop_vec_info inner_loop_vinfo =
 827                  STMT_VINFO_LOOP_VINFO (stmt_info);
 828               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 829               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 830            }
 831         }
 832       else
 833         {
 834           /* bb in current nest.  */
 835           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 836             {
 837               gimple phi = gsi_stmt (si);
 838               gimple_set_uid (phi, 0);
 839               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 840             }
 841
 842           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 843             {
 844               gimple stmt = gsi_stmt (si);
 845               gimple_set_uid (stmt, 0);
 846               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 847             }
 848         }
 849     }
 850
 851   /* CHECKME: We want to visit all BBs before their successors (except for
 852      latch blocks, for which this assertion wouldn't hold).  In the simple
 853      case of the loop forms we allow, a dfs order of the BBs would the same
 854      as reversed postorder traversal, so we are safe.  */
 855
 856    free (bbs);
 857    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 858    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 859                               bbs, loop->num_nodes, loop);
 860    gcc_assert (nbbs == loop->num_nodes);
 861
 862   LOOP_VINFO_BBS (res) = bbs;
 863   LOOP_VINFO_NITERS (res) = NULL;
 864   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 865   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 866   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 867   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 868   LOOP_VINFO_VECT_FACTOR (res) = 0;
 869   LOOP_VINFO_LOOP_NEST (res).create (3);
 870   LOOP_VINFO_DATAREFS (res).create (10);
 871   LOOP_VINFO_DDRS (res).create (10 * 10);
 872   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 873   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 874              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 875   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 876              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 877   LOOP_VINFO_GROUPED_STORES (res).create (10);
 878   LOOP_VINFO_REDUCTIONS (res).create (10);
 879   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 880   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 881   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 882   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 883   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 884   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 885   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 886
 887   return res;
 888 }
 889
 890
 891 /* Function destroy_loop_vec_info.
 892
 893    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 894    stmts in the loop.  */
 895
 896 void
 897 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 898 {
 899   struct loop *loop;
 900   basic_block *bbs;
 901   int nbbs;
 902   gimple_stmt_iterator si;
 903   int j;
 904   vec<slp_instance> slp_instances;
 905   slp_instance instance;
 906   bool swapped;
 907
 908   if (!loop_vinfo)
 909     return;
 910
 911   loop = LOOP_VINFO_LOOP (loop_vinfo);
 912
 913   bbs = LOOP_VINFO_BBS (loop_vinfo);
 914   nbbs = clean_stmts ? loop->num_nodes : 0;
 915   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 916
 917   for (j = 0; j < nbbs; j++)
 918     {
 919       basic_block bb = bbs[j];
 920       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 921         free_stmt_vec_info (gsi_stmt (si));
 922
 923       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 924         {
 925           gimple stmt = gsi_stmt (si);
 926
 927           /* We may have broken canonical form by moving a constant
 928              into RHS1 of a commutative op.  Fix such occurrences.  */
 929           if (swapped && is_gimple_assign (stmt))
 930             {
 931               enum tree_code code = gimple_assign_rhs_code (stmt);
 932
 933               if ((code == PLUS_EXPR
 934                    || code == POINTER_PLUS_EXPR
 935                    || code == MULT_EXPR)
 936                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 937                 swap_tree_operands (stmt,
 938                                     gimple_assign_rhs1_ptr (stmt),
 939                                     gimple_assign_rhs2_ptr (stmt));
 940             }
 941
 942           /* Free stmt_vec_info.  */
 943           free_stmt_vec_info (stmt);
 944           gsi_next (&si);
 945         }
 946     }
 947
 948   free (LOOP_VINFO_BBS (loop_vinfo));
 949   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 950   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 951   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 952   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 953   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 954   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 955   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 956     vect_free_slp_instance (instance);
 957
 958   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 959   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
 960   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
 961   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 962
 963   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 964     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 965
 966   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
 967
 968   free (loop_vinfo);
 969   loop->aux = NULL;
 970 }
 971
 972
 973 /* Function vect_analyze_loop_1.
 974
 975    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 976    for it. The different analyses will record information in the
 977    loop_vec_info struct.  This is a subset of the analyses applied in
 978    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 979    that is now considered for (outer-loop) vectorization.  */
 980
 981 static loop_vec_info
 982 vect_analyze_loop_1 (struct loop *loop)
 983 {
 984   loop_vec_info loop_vinfo;
 985
 986   if (dump_enabled_p ())
 987     dump_printf_loc (MSG_NOTE, vect_location,
 988                      "===== analyze_loop_nest_1 =====");
 989
 990   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 991
 992   loop_vinfo = vect_analyze_loop_form (loop);
 993   if (!loop_vinfo)
 994     {
 995       if (dump_enabled_p ())
 996         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 997                          "bad inner-loop form.");
 998       return NULL;
 999     }
1000
1001   return loop_vinfo;
1002 }
1003
1004
1005 /* Function vect_analyze_loop_form.
1006
1007    Verify that certain CFG restrictions hold, including:
1008    - the loop has a pre-header
1009    - the loop has a single entry and exit
1010    - the loop exit condition is simple enough, and the number of iterations
1011      can be analyzed (a countable loop).  */
1012
1013 loop_vec_info
1014 vect_analyze_loop_form (struct loop *loop)
1015 {
1016   loop_vec_info loop_vinfo;
1017   gimple loop_cond;
1018   tree number_of_iterations = NULL;
1019   loop_vec_info inner_loop_vinfo = NULL;
1020
1021   if (dump_enabled_p ())
1022     dump_printf_loc (MSG_NOTE, vect_location,
1023                      "=== vect_analyze_loop_form ===");
1024
1025   /* Different restrictions apply when we are considering an inner-most loop,
1026      vs. an outer (nested) loop.
1027      (FORNOW. May want to relax some of these restrictions in the future).  */
1028
1029   if (!loop->inner)
1030     {
1031       /* Inner-most loop.  We currently require that the number of BBs is
1032          exactly 2 (the header and latch).  Vectorizable inner-most loops
1033          look like this:
1034
1035                         (pre-header)
1036                            |
1037                           header <--------+
1038                            | |            |
1039                            | +--> latch --+
1040                            |
1041                         (exit-bb)  */
1042
1043       if (loop->num_nodes != 2)
1044         {
1045           if (dump_enabled_p ())
1046             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047                              "not vectorized: control flow in loop.");
1048           return NULL;
1049         }
1050
1051       if (empty_block_p (loop->header))
1052     {
1053           if (dump_enabled_p ())
1054             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                              "not vectorized: empty loop.");
1056       return NULL;
1057     }
1058     }
1059   else
1060     {
1061       struct loop *innerloop = loop->inner;
1062       edge entryedge;
1063
1064       /* Nested loop. We currently require that the loop is doubly-nested,
1065          contains a single inner loop, and the number of BBs is exactly 5.
1066          Vectorizable outer-loops look like this:
1067
1068                         (pre-header)
1069                            |
1070                           header <---+
1071                            |         |
1072                           inner-loop |
1073                            |         |
1074                           tail ------+
1075                            |
1076                         (exit-bb)
1077
1078          The inner-loop has the properties expected of inner-most loops
1079          as described above.  */
1080
1081       if ((loop->inner)->inner || (loop->inner)->next)
1082         {
1083           if (dump_enabled_p ())
1084             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1085                              "not vectorized: multiple nested loops.");
1086           return NULL;
1087         }
1088
1089       /* Analyze the inner-loop.  */
1090       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1091       if (!inner_loop_vinfo)
1092         {
1093           if (dump_enabled_p ())
1094             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095                              "not vectorized: Bad inner loop.");
1096           return NULL;
1097         }
1098
1099       if (!expr_invariant_in_loop_p (loop,
1100                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1101         {
1102           if (dump_enabled_p ())
1103             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1104                              "not vectorized: inner-loop count not invariant.");
1105           destroy_loop_vec_info (inner_loop_vinfo, true);
1106           return NULL;
1107         }
1108
1109       if (loop->num_nodes != 5)
1110         {
1111           if (dump_enabled_p ())
1112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113                              "not vectorized: control flow in loop.");
1114           destroy_loop_vec_info (inner_loop_vinfo, true);
1115           return NULL;
1116         }
1117
1118       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1119       entryedge = EDGE_PRED (innerloop->header, 0);
1120       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1121         entryedge = EDGE_PRED (innerloop->header, 1);
1122
1123       if (entryedge->src != loop->header
1124           || !single_exit (innerloop)
1125           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1126         {
1127           if (dump_enabled_p ())
1128             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129                              "not vectorized: unsupported outerloop form.");
1130           destroy_loop_vec_info (inner_loop_vinfo, true);
1131           return NULL;
1132         }
1133
1134       if (dump_enabled_p ())
1135         dump_printf_loc (MSG_NOTE, vect_location,
1136                          "Considering outer-loop vectorization.");
1137     }
1138
1139   if (!single_exit (loop)
1140       || EDGE_COUNT (loop->header->preds) != 2)
1141     {
1142       if (dump_enabled_p ())
1143         {
1144           if (!single_exit (loop))
1145             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146                              "not vectorized: multiple exits.");
1147           else if (EDGE_COUNT (loop->header->preds) != 2)
1148             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1149                              "not vectorized: too many incoming edges.");
1150         }
1151       if (inner_loop_vinfo)
1152         destroy_loop_vec_info (inner_loop_vinfo, true);
1153       return NULL;
1154     }
1155
1156   /* We assume that the loop exit condition is at the end of the loop. i.e,
1157      that the loop is represented as a do-while (with a proper if-guard
1158      before the loop if needed), where the loop header contains all the
1159      executable statements, and the latch is empty.  */
1160   if (!empty_block_p (loop->latch)
1161       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1162     {
1163       if (dump_enabled_p ())
1164         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165                          "not vectorized: latch block not empty.");
1166       if (inner_loop_vinfo)
1167         destroy_loop_vec_info (inner_loop_vinfo, true);
1168       return NULL;
1169     }
1170
1171   /* Make sure there exists a single-predecessor exit bb:  */
1172   if (!single_pred_p (single_exit (loop)->dest))
1173     {
1174       edge e = single_exit (loop);
1175       if (!(e->flags & EDGE_ABNORMAL))
1176         {
1177           split_loop_exit_edge (e);
1178           if (dump_enabled_p ())
1179             dump_printf (MSG_NOTE, "split exit edge.");
1180         }
1181       else
1182         {
1183           if (dump_enabled_p ())
1184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185                              "not vectorized: abnormal loop exit edge.");
1186           if (inner_loop_vinfo)
1187             destroy_loop_vec_info (inner_loop_vinfo, true);
1188           return NULL;
1189         }
1190     }
1191
1192   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1193   if (!loop_cond)
1194     {
1195       if (dump_enabled_p ())
1196         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197                          "not vectorized: complicated exit condition.");
1198       if (inner_loop_vinfo)
1199         destroy_loop_vec_info (inner_loop_vinfo, true);
1200       return NULL;
1201     }
1202
1203   if (!number_of_iterations)
1204     {
1205       if (dump_enabled_p ())
1206         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207                          "not vectorized: number of iterations cannot be "
1208                          "computed.");
1209       if (inner_loop_vinfo)
1210         destroy_loop_vec_info (inner_loop_vinfo, true);
1211       return NULL;
1212     }
1213
1214   if (chrec_contains_undetermined (number_of_iterations))
1215     {
1216       if (dump_enabled_p ())
1217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218                              "Infinite number of iterations.");
1219       if (inner_loop_vinfo)
1220         destroy_loop_vec_info (inner_loop_vinfo, true);
1221       return NULL;
1222     }
1223
1224   if (!NITERS_KNOWN_P (number_of_iterations))
1225     {
1226       if (dump_enabled_p ())
1227         {
1228           dump_printf_loc (MSG_NOTE, vect_location,
1229                            "Symbolic number of iterations is ");
1230           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1231         }
1232     }
1233   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1234     {
1235       if (dump_enabled_p ())
1236         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237                          "not vectorized: number of iterations = 0.");
1238       if (inner_loop_vinfo)
1239         destroy_loop_vec_info (inner_loop_vinfo, true);
1240       return NULL;
1241     }
1242
1243   loop_vinfo = new_loop_vec_info (loop);
1244   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1245   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1246
1247   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1248
1249   /* CHECKME: May want to keep it around it in the future.  */
1250   if (inner_loop_vinfo)
1251     destroy_loop_vec_info (inner_loop_vinfo, false);
1252
1253   gcc_assert (!loop->aux);
1254   loop->aux = loop_vinfo;
1255   return loop_vinfo;
1256 }
1257
1258
1259 /* Function vect_analyze_loop_operations.
1260
1261    Scan the loop stmts and make sure they are all vectorizable.  */
1262
1263 static bool
1264 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1265 {
1266   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268   int nbbs = loop->num_nodes;
1269   gimple_stmt_iterator si;
1270   unsigned int vectorization_factor = 0;
1271   int i;
1272   gimple phi;
1273   stmt_vec_info stmt_info;
1274   bool need_to_vectorize = false;
1275   int min_profitable_iters;
1276   int min_scalar_loop_bound;
1277   unsigned int th;
1278   bool only_slp_in_loop = true, ok;
1279   HOST_WIDE_INT max_niter;
1280   HOST_WIDE_INT estimated_niter;
1281   int min_profitable_estimate;
1282
1283   if (dump_enabled_p ())
1284     dump_printf_loc (MSG_NOTE, vect_location,
1285                      "=== vect_analyze_loop_operations ===");
1286
1287   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1288   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1289   if (slp)
1290     {
1291       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1292          vectorization factor of the loop is the unrolling factor required by
1293          the SLP instances.  If that unrolling factor is 1, we say, that we
1294          perform pure SLP on loop - cross iteration parallelism is not
1295          exploited.  */
1296       for (i = 0; i < nbbs; i++)
1297         {
1298           basic_block bb = bbs[i];
1299           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1300             {
1301               gimple stmt = gsi_stmt (si);
1302               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1303               gcc_assert (stmt_info);
1304               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1305                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1306                   && !PURE_SLP_STMT (stmt_info))
1307                 /* STMT needs both SLP and loop-based vectorization.  */
1308                 only_slp_in_loop = false;
1309             }
1310         }
1311
1312       if (only_slp_in_loop)
1313         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1314       else
1315         vectorization_factor = least_common_multiple (vectorization_factor,
1316                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1317
1318       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1319       if (dump_enabled_p ())
1320         dump_printf_loc (MSG_NOTE, vect_location,
1321                          "Updating vectorization factor to %d ",
1322                          vectorization_factor);
1323     }
1324
1325   for (i = 0; i < nbbs; i++)
1326     {
1327       basic_block bb = bbs[i];
1328
1329       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1330         {
1331           phi = gsi_stmt (si);
1332           ok = true;
1333
1334           stmt_info = vinfo_for_stmt (phi);
1335           if (dump_enabled_p ())
1336             {
1337               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1338               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1339             }
1340
1341           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1342              (i.e., a phi in the tail of the outer-loop).  */
1343           if (! is_loop_header_bb_p (bb))
1344             {
1345               /* FORNOW: we currently don't support the case that these phis
1346                  are not used in the outerloop (unless it is double reduction,
1347                  i.e., this phi is vect_reduction_def), cause this case
1348                  requires to actually do something here.  */
1349               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1350                    || STMT_VINFO_LIVE_P (stmt_info))
1351                   && STMT_VINFO_DEF_TYPE (stmt_info)
1352                      != vect_double_reduction_def)
1353                 {
1354                   if (dump_enabled_p ())
1355                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1356                                      "Unsupported loop-closed phi in "
1357                                      "outer-loop.");
1358                   return false;
1359                 }
1360
1361               /* If PHI is used in the outer loop, we check that its operand
1362                  is defined in the inner loop.  */
1363               if (STMT_VINFO_RELEVANT_P (stmt_info))
1364                 {
1365                   tree phi_op;
1366                   gimple op_def_stmt;
1367
1368                   if (gimple_phi_num_args (phi) != 1)
1369                     return false;
1370
1371                   phi_op = PHI_ARG_DEF (phi, 0);
1372                   if (TREE_CODE (phi_op) != SSA_NAME)
1373                     return false;
1374
1375                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1376                   if (!op_def_stmt
1377                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1378                       || !vinfo_for_stmt (op_def_stmt))
1379                     return false;
1380
1381                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1382                         != vect_used_in_outer
1383                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1384                            != vect_used_in_outer_by_reduction)
1385                     return false;
1386                 }
1387
1388               continue;
1389             }
1390
1391           gcc_assert (stmt_info);
1392
1393           if (STMT_VINFO_LIVE_P (stmt_info))
1394             {
1395               /* FORNOW: not yet supported.  */
1396               if (dump_enabled_p ())
1397                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398                                  "not vectorized: value used after loop.");
1399               return false;
1400             }
1401
1402           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1403               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1404             {
1405               /* A scalar-dependence cycle that we don't support.  */
1406               if (dump_enabled_p ())
1407                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1408                                  "not vectorized: scalar dependence cycle.");
1409               return false;
1410             }
1411
1412           if (STMT_VINFO_RELEVANT_P (stmt_info))
1413             {
1414               need_to_vectorize = true;
1415               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1416                 ok = vectorizable_induction (phi, NULL, NULL);
1417             }
1418
1419           if (!ok)
1420             {
1421               if (dump_enabled_p ())
1422                 {
1423                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1424                                    "not vectorized: relevant phi not "
1425                                    "supported: ");
1426                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1427                 }
1428               return false;
1429             }
1430         }
1431
1432       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1433         {
1434           gimple stmt = gsi_stmt (si);
1435           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1436             return false;
1437         }
1438     } /* bbs */
1439
1440   /* All operations in the loop are either irrelevant (deal with loop
1441      control, or dead), or only used outside the loop and can be moved
1442      out of the loop (e.g. invariants, inductions).  The loop can be
1443      optimized away by scalar optimizations.  We're better off not
1444      touching this loop.  */
1445   if (!need_to_vectorize)
1446     {
1447       if (dump_enabled_p ())
1448         dump_printf_loc (MSG_NOTE, vect_location,
1449                          "All the computation can be taken out of the loop.");
1450       if (dump_enabled_p ())
1451         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452                          "not vectorized: redundant loop. no profit to "
1453                          "vectorize.");
1454       return false;
1455     }
1456
1457   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1458     dump_printf_loc (MSG_NOTE, vect_location,
1459                      "vectorization_factor = %d, niters = "
1460                      HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1461                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1462
1463   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1464        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1465       || ((max_niter = max_stmt_executions_int (loop)) != -1
1466           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1467     {
1468       if (dump_enabled_p ())
1469         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470                          "not vectorized: iteration count too small.");
1471       if (dump_enabled_p ())
1472         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473                          "not vectorized: iteration count smaller than "
1474                          "vectorization factor.");
1475       return false;
1476     }
1477
1478   /* Analyze cost.  Decide if worth while to vectorize.  */
1479
1480   /* Once VF is set, SLP costs should be updated since the number of created
1481      vector stmts depends on VF.  */
1482   vect_update_slp_costs_according_to_vf (loop_vinfo);
1483
1484   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1485                                       &min_profitable_estimate);
1486   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1487
1488   if (min_profitable_iters < 0)
1489     {
1490       if (dump_enabled_p ())
1491         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492                          "not vectorized: vectorization not profitable.");
1493       if (dump_enabled_p ())
1494         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495                          "not vectorized: vector version will never be "
1496                          "profitable.");
1497       return false;
1498     }
1499
1500   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1501                             * vectorization_factor) - 1);
1502
1503
1504   /* Use the cost model only if it is more conservative than user specified
1505      threshold.  */
1506
1507   th = (unsigned) min_scalar_loop_bound;
1508   if (min_profitable_iters
1509       && (!min_scalar_loop_bound
1510           || min_profitable_iters > min_scalar_loop_bound))
1511     th = (unsigned) min_profitable_iters;
1512
1513   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1514       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1515     {
1516       if (dump_enabled_p ())
1517         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518                          "not vectorized: vectorization not profitable.");
1519       if (dump_enabled_p ())
1520         dump_printf_loc (MSG_NOTE, vect_location,
1521                          "not vectorized: iteration count smaller than user "
1522                          "specified loop bound parameter or minimum profitable "
1523                          "iterations (whichever is more conservative).");
1524       return false;
1525     }
1526
1527   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1528       && ((unsigned HOST_WIDE_INT) estimated_niter
1529           <= MAX (th, (unsigned)min_profitable_estimate)))
1530     {
1531       if (dump_enabled_p ())
1532         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                          "not vectorized: estimated iteration count too "
1534                          "small.");
1535       if (dump_enabled_p ())
1536         dump_printf_loc (MSG_NOTE, vect_location,
1537                          "not vectorized: estimated iteration count smaller "
1538                          "than specified loop bound parameter or minimum "
1539                          "profitable iterations (whichever is more "
1540                          "conservative).");
1541       return false;
1542     }
1543
1544   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1546       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1547     {
1548       if (dump_enabled_p ())
1549         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1550       if (!vect_can_advance_ivs_p (loop_vinfo))
1551         {
1552           if (dump_enabled_p ())
1553             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1554                              "not vectorized: can't create epilog loop 1.");
1555           return false;
1556         }
1557       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1558         {
1559           if (dump_enabled_p ())
1560             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561                              "not vectorized: can't create epilog loop 2.");
1562           return false;
1563         }
1564     }
1565
1566   return true;
1567 }
1568
1569
1570 /* Function vect_analyze_loop_2.
1571
1572    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1573    for it.  The different analyses will record information in the
1574    loop_vec_info struct.  */
1575 static bool
1576 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1577 {
1578   bool ok, slp = false;
1579   int max_vf = MAX_VECTORIZATION_FACTOR;
1580   int min_vf = 2;
1581
1582   /* Find all data references in the loop (which correspond to vdefs/vuses)
1583      and analyze their evolution in the loop.  Also adjust the minimal
1584      vectorization factor according to the loads and stores.
1585
1586      FORNOW: Handle only simple, array references, which
1587      alignment can be forced, and aligned pointer-references.  */
1588
1589   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1590   if (!ok)
1591     {
1592       if (dump_enabled_p ())
1593         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1594                          "bad data references.");
1595       return false;
1596     }
1597
1598   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1599      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1600
1601   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1602   if (!ok)
1603     {
1604       if (dump_enabled_p ())
1605         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606                          "bad data access.");
1607       return false;
1608     }
1609
1610   /* Classify all cross-iteration scalar data-flow cycles.
1611      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1612
1613   vect_analyze_scalar_cycles (loop_vinfo);
1614
1615   vect_pattern_recog (loop_vinfo, NULL);
1616
1617   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1618
1619   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1620   if (!ok)
1621     {
1622       if (dump_enabled_p ())
1623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624                          "unexpected pattern.");
1625       return false;
1626     }
1627
1628   /* Analyze data dependences between the data-refs in the loop
1629      and adjust the maximum vectorization factor according to
1630      the dependences.
1631      FORNOW: fail at the first data dependence that we encounter.  */
1632
1633   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1634   if (!ok
1635       || max_vf < min_vf)
1636     {
1637       if (dump_enabled_p ())
1638             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639                              "bad data dependence.");
1640       return false;
1641     }
1642
1643   ok = vect_determine_vectorization_factor (loop_vinfo);
1644   if (!ok)
1645     {
1646       if (dump_enabled_p ())
1647         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648                          "can't determine vectorization factor.");
1649       return false;
1650     }
1651   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1652     {
1653       if (dump_enabled_p ())
1654         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655                          "bad data dependence.");
1656       return false;
1657     }
1658
1659   /* Analyze the alignment of the data-refs in the loop.
1660      Fail if a data reference is found that cannot be vectorized.  */
1661
1662   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1663   if (!ok)
1664     {
1665       if (dump_enabled_p ())
1666         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667                          "bad data alignment.");
1668       return false;
1669     }
1670
1671   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1672      It is important to call pruning after vect_analyze_data_ref_accesses,
1673      since we use grouping information gathered by interleaving analysis.  */
1674   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1675   if (!ok)
1676     {
1677       if (dump_enabled_p ())
1678         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679                          "too long list of versioning for alias "
1680                          "run-time tests.");
1681       return false;
1682     }
1683
1684   /* This pass will decide on using loop versioning and/or loop peeling in
1685      order to enhance the alignment of data references in the loop.  */
1686
1687   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1688   if (!ok)
1689     {
1690       if (dump_enabled_p ())
1691         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692                          "bad data alignment.");
1693       return false;
1694     }
1695
1696   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1697   ok = vect_analyze_slp (loop_vinfo, NULL);
1698   if (ok)
1699     {
1700       /* Decide which possible SLP instances to SLP.  */
1701       slp = vect_make_slp_decision (loop_vinfo);
1702
1703       /* Find stmts that need to be both vectorized and SLPed.  */
1704       vect_detect_hybrid_slp (loop_vinfo);
1705     }
1706   else
1707     return false;
1708
1709   /* Scan all the operations in the loop and make sure they are
1710      vectorizable.  */
1711
1712   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1713   if (!ok)
1714     {
1715       if (dump_enabled_p ())
1716         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717                          "bad operation or unsupported loop bound.");
1718       return false;
1719     }
1720
1721   return true;
1722 }
1723
1724 /* Function vect_analyze_loop.
1725
1726    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1727    for it.  The different analyses will record information in the
1728    loop_vec_info struct.  */
1729 loop_vec_info
1730 vect_analyze_loop (struct loop *loop)
1731 {
1732   loop_vec_info loop_vinfo;
1733   unsigned int vector_sizes;
1734
1735   /* Autodetect first vector size we try.  */
1736   current_vector_size = 0;
1737   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1738
1739   if (dump_enabled_p ())
1740     dump_printf_loc (MSG_NOTE, vect_location,
1741                      "===== analyze_loop_nest =====");
1742
1743   if (loop_outer (loop)
1744       && loop_vec_info_for_loop (loop_outer (loop))
1745       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1746     {
1747       if (dump_enabled_p ())
1748         dump_printf_loc (MSG_NOTE, vect_location,
1749                          "outer-loop already vectorized.");
1750       return NULL;
1751     }
1752
1753   while (1)
1754     {
1755       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1756       loop_vinfo = vect_analyze_loop_form (loop);
1757       if (!loop_vinfo)
1758         {
1759           if (dump_enabled_p ())
1760             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761                              "bad loop form.");
1762           return NULL;
1763         }
1764
1765       if (vect_analyze_loop_2 (loop_vinfo))
1766         {
1767           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1768
1769           return loop_vinfo;
1770         }
1771
1772       destroy_loop_vec_info (loop_vinfo, true);
1773
1774       vector_sizes &= ~current_vector_size;
1775       if (vector_sizes == 0
1776           || current_vector_size == 0)
1777         return NULL;
1778
1779       /* Try the next biggest vector size.  */
1780       current_vector_size = 1 << floor_log2 (vector_sizes);
1781       if (dump_enabled_p ())
1782         dump_printf_loc (MSG_NOTE, vect_location,
1783                          "***** Re-trying analysis with "
1784                          "vector size %d\n", current_vector_size);
1785     }
1786 }
1787
1788
1789 /* Function reduction_code_for_scalar_code
1790
1791    Input:
1792    CODE - tree_code of a reduction operations.
1793
1794    Output:
1795    REDUC_CODE - the corresponding tree-code to be used to reduce the
1796       vector of partial results into a single scalar result (which
1797       will also reside in a vector) or ERROR_MARK if the operation is
1798       a supported reduction operation, but does not have such tree-code.
1799
1800    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1801
1802 static bool
1803 reduction_code_for_scalar_code (enum tree_code code,
1804                                 enum tree_code *reduc_code)
1805 {
1806   switch (code)
1807     {
1808       case MAX_EXPR:
1809         *reduc_code = REDUC_MAX_EXPR;
1810         return true;
1811
1812       case MIN_EXPR:
1813         *reduc_code = REDUC_MIN_EXPR;
1814         return true;
1815
1816       case PLUS_EXPR:
1817         *reduc_code = REDUC_PLUS_EXPR;
1818         return true;
1819
1820       case MULT_EXPR:
1821       case MINUS_EXPR:
1822       case BIT_IOR_EXPR:
1823       case BIT_XOR_EXPR:
1824       case BIT_AND_EXPR:
1825         *reduc_code = ERROR_MARK;
1826         return true;
1827
1828       default:
1829        return false;
1830     }
1831 }
1832
1833
1834 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1835    STMT is printed with a message MSG. */
1836
1837 static void
1838 report_vect_op (int msg_type, gimple stmt, const char *msg)
1839 {
1840   dump_printf_loc (msg_type, vect_location, "%s", msg);
1841   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1842 }
1843
1844
1845 /* Detect SLP reduction of the form:
1846
1847    #a1 = phi <a5, a0>
1848    a2 = operation (a1)
1849    a3 = operation (a2)
1850    a4 = operation (a3)
1851    a5 = operation (a4)
1852
1853    #a = phi <a5>
1854
1855    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1856    FIRST_STMT is the first reduction stmt in the chain
1857    (a2 = operation (a1)).
1858
1859    Return TRUE if a reduction chain was detected.  */
1860
1861 static bool
1862 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1863 {
1864   struct loop *loop = (gimple_bb (phi))->loop_father;
1865   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1866   enum tree_code code;
1867   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1868   stmt_vec_info use_stmt_info, current_stmt_info;
1869   tree lhs;
1870   imm_use_iterator imm_iter;
1871   use_operand_p use_p;
1872   int nloop_uses, size = 0, n_out_of_loop_uses;
1873   bool found = false;
1874
1875   if (loop != vect_loop)
1876     return false;
1877
1878   lhs = PHI_RESULT (phi);
1879   code = gimple_assign_rhs_code (first_stmt);
1880   while (1)
1881     {
1882       nloop_uses = 0;
1883       n_out_of_loop_uses = 0;
1884       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1885         {
1886           gimple use_stmt = USE_STMT (use_p);
1887           if (is_gimple_debug (use_stmt))
1888             continue;
1889
1890           use_stmt = USE_STMT (use_p);
1891
1892           /* Check if we got back to the reduction phi.  */
1893           if (use_stmt == phi)
1894             {
1895               loop_use_stmt = use_stmt;
1896               found = true;
1897               break;
1898             }
1899
1900           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1901             {
1902               if (vinfo_for_stmt (use_stmt)
1903                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1904                 {
1905                   loop_use_stmt = use_stmt;
1906                   nloop_uses++;
1907                 }
1908             }
1909            else
1910              n_out_of_loop_uses++;
1911
1912            /* There are can be either a single use in the loop or two uses in
1913               phi nodes.  */
1914            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1915              return false;
1916         }
1917
1918       if (found)
1919         break;
1920
1921       /* We reached a statement with no loop uses.  */
1922       if (nloop_uses == 0)
1923         return false;
1924
1925       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1926       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1927         return false;
1928
1929       if (!is_gimple_assign (loop_use_stmt)
1930           || code != gimple_assign_rhs_code (loop_use_stmt)
1931           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1932         return false;
1933
1934       /* Insert USE_STMT into reduction chain.  */
1935       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1936       if (current_stmt)
1937         {
1938           current_stmt_info = vinfo_for_stmt (current_stmt);
1939           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1940           GROUP_FIRST_ELEMENT (use_stmt_info)
1941             = GROUP_FIRST_ELEMENT (current_stmt_info);
1942         }
1943       else
1944         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1945
1946       lhs = gimple_assign_lhs (loop_use_stmt);
1947       current_stmt = loop_use_stmt;
1948       size++;
1949    }
1950
1951   if (!found || loop_use_stmt != phi || size < 2)
1952     return false;
1953
1954   /* Swap the operands, if needed, to make the reduction operand be the second
1955      operand.  */
1956   lhs = PHI_RESULT (phi);
1957   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1958   while (next_stmt)
1959     {
1960       if (gimple_assign_rhs2 (next_stmt) == lhs)
1961         {
1962           tree op = gimple_assign_rhs1 (next_stmt);
1963           gimple def_stmt = NULL;
1964
1965           if (TREE_CODE (op) == SSA_NAME)
1966             def_stmt = SSA_NAME_DEF_STMT (op);
1967
1968           /* Check that the other def is either defined in the loop
1969              ("vect_internal_def"), or it's an induction (defined by a
1970              loop-header phi-node).  */
1971           if (def_stmt
1972               && gimple_bb (def_stmt)
1973               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1974               && (is_gimple_assign (def_stmt)
1975                   || is_gimple_call (def_stmt)
1976                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1977                            == vect_induction_def
1978                   || (gimple_code (def_stmt) == GIMPLE_PHI
1979                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1980                                   == vect_internal_def
1981                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1982             {
1983               lhs = gimple_assign_lhs (next_stmt);
1984               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1985               continue;
1986             }
1987
1988           return false;
1989         }
1990       else
1991         {
1992           tree op = gimple_assign_rhs2 (next_stmt);
1993           gimple def_stmt = NULL;
1994
1995           if (TREE_CODE (op) == SSA_NAME)
1996             def_stmt = SSA_NAME_DEF_STMT (op);
1997
1998           /* Check that the other def is either defined in the loop
1999             ("vect_internal_def"), or it's an induction (defined by a
2000             loop-header phi-node).  */
2001           if (def_stmt
2002               && gimple_bb (def_stmt)
2003               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2004               && (is_gimple_assign (def_stmt)
2005                   || is_gimple_call (def_stmt)
2006                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2007                               == vect_induction_def
2008                   || (gimple_code (def_stmt) == GIMPLE_PHI
2009                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2010                                   == vect_internal_def
2011                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2012             {
2013               if (dump_enabled_p ())
2014                 {
2015                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2016                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2017                 }
2018
2019               swap_tree_operands (next_stmt,
2020                                   gimple_assign_rhs1_ptr (next_stmt),
2021                                   gimple_assign_rhs2_ptr (next_stmt));
2022               update_stmt (next_stmt);
2023
2024               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2025                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2026             }
2027           else
2028             return false;
2029         }
2030
2031       lhs = gimple_assign_lhs (next_stmt);
2032       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2033     }
2034
2035   /* Save the chain for further analysis in SLP detection.  */
2036   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2037   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2038   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2039
2040   return true;
2041 }
2042
2043
2044 /* Function vect_is_simple_reduction_1
2045
2046    (1) Detect a cross-iteration def-use cycle that represents a simple
2047    reduction computation.  We look for the following pattern:
2048
2049    loop_header:
2050      a1 = phi < a0, a2 >
2051      a3 = ...
2052      a2 = operation (a3, a1)
2053
2054    such that:
2055    1. operation is commutative and associative and it is safe to
2056       change the order of the computation (if CHECK_REDUCTION is true)
2057    2. no uses for a2 in the loop (a2 is used out of the loop)
2058    3. no uses of a1 in the loop besides the reduction operation
2059    4. no uses of a1 outside the loop.
2060
2061    Conditions 1,4 are tested here.
2062    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2063
2064    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2065    nested cycles, if CHECK_REDUCTION is false.
2066
2067    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2068    reductions:
2069
2070      a1 = phi < a0, a2 >
2071      inner loop (def of a3)
2072      a2 = phi < a3 >
2073
2074    If MODIFY is true it tries also to rework the code in-place to enable
2075    detection of more reduction patterns.  For the time being we rewrite
2076    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2077 */
2078
2079 static gimple
2080 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2081                             bool check_reduction, bool *double_reduc,
2082                             bool modify)
2083 {
2084   struct loop *loop = (gimple_bb (phi))->loop_father;
2085   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2086   edge latch_e = loop_latch_edge (loop);
2087   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2088   gimple def_stmt, def1 = NULL, def2 = NULL;
2089   enum tree_code orig_code, code;
2090   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2091   tree type;
2092   int nloop_uses;
2093   tree name;
2094   imm_use_iterator imm_iter;
2095   use_operand_p use_p;
2096   bool phi_def;
2097
2098   *double_reduc = false;
2099
2100   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2101      otherwise, we assume outer loop vectorization.  */
2102   gcc_assert ((check_reduction && loop == vect_loop)
2103               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2104
2105   name = PHI_RESULT (phi);
2106   nloop_uses = 0;
2107   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2108     {
2109       gimple use_stmt = USE_STMT (use_p);
2110       if (is_gimple_debug (use_stmt))
2111         continue;
2112
2113       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2114         {
2115           if (dump_enabled_p ())
2116             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117                              "intermediate value used outside loop.");
2118
2119           return NULL;
2120         }
2121
2122       if (vinfo_for_stmt (use_stmt)
2123           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2124         nloop_uses++;
2125       if (nloop_uses > 1)
2126         {
2127           if (dump_enabled_p ())
2128             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129                              "reduction used in loop.");
2130           return NULL;
2131         }
2132     }
2133
2134   if (TREE_CODE (loop_arg) != SSA_NAME)
2135     {
2136       if (dump_enabled_p ())
2137         {
2138           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139                            "reduction: not ssa_name: ");
2140           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2141         }
2142       return NULL;
2143     }
2144
2145   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2146   if (!def_stmt)
2147     {
2148       if (dump_enabled_p ())
2149         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150                          "reduction: no def_stmt.");
2151       return NULL;
2152     }
2153
2154   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2155     {
2156       if (dump_enabled_p ())
2157         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2158       return NULL;
2159     }
2160
2161   if (is_gimple_assign (def_stmt))
2162     {
2163       name = gimple_assign_lhs (def_stmt);
2164       phi_def = false;
2165     }
2166   else
2167     {
2168       name = PHI_RESULT (def_stmt);
2169       phi_def = true;
2170     }
2171
2172   nloop_uses = 0;
2173   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2174     {
2175       gimple use_stmt = USE_STMT (use_p);
2176       if (is_gimple_debug (use_stmt))
2177         continue;
2178       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2179           && vinfo_for_stmt (use_stmt)
2180           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2181         nloop_uses++;
2182       if (nloop_uses > 1)
2183         {
2184           if (dump_enabled_p ())
2185             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186                              "reduction used in loop.");
2187           return NULL;
2188         }
2189     }
2190
2191   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2192      defined in the inner loop.  */
2193   if (phi_def)
2194     {
2195       op1 = PHI_ARG_DEF (def_stmt, 0);
2196
2197       if (gimple_phi_num_args (def_stmt) != 1
2198           || TREE_CODE (op1) != SSA_NAME)
2199         {
2200           if (dump_enabled_p ())
2201             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202                              "unsupported phi node definition.");
2203
2204           return NULL;
2205         }
2206
2207       def1 = SSA_NAME_DEF_STMT (op1);
2208       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2209           && loop->inner
2210           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2211           && is_gimple_assign (def1))
2212         {
2213           if (dump_enabled_p ())
2214             report_vect_op (MSG_NOTE, def_stmt,
2215                             "detected double reduction: ");
2216
2217           *double_reduc = true;
2218           return def_stmt;
2219         }
2220
2221       return NULL;
2222     }
2223
2224   code = orig_code = gimple_assign_rhs_code (def_stmt);
2225
2226   /* We can handle "res -= x[i]", which is non-associative by
2227      simply rewriting this into "res += -x[i]".  Avoid changing
2228      gimple instruction for the first simple tests and only do this
2229      if we're allowed to change code at all.  */
2230   if (code == MINUS_EXPR
2231       && modify
2232       && (op1 = gimple_assign_rhs1 (def_stmt))
2233       && TREE_CODE (op1) == SSA_NAME
2234       && SSA_NAME_DEF_STMT (op1) == phi)
2235     code = PLUS_EXPR;
2236
2237   if (check_reduction
2238       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2239     {
2240       if (dump_enabled_p ())
2241         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2242                         "reduction: not commutative/associative: ");
2243       return NULL;
2244     }
2245
2246   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2247     {
2248       if (code != COND_EXPR)
2249         {
2250           if (dump_enabled_p ())
2251             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2252                             "reduction: not binary operation: ");
2253
2254           return NULL;
2255         }
2256
2257       op3 = gimple_assign_rhs1 (def_stmt);
2258       if (COMPARISON_CLASS_P (op3))
2259         {
2260           op4 = TREE_OPERAND (op3, 1);
2261           op3 = TREE_OPERAND (op3, 0);
2262         }
2263
2264       op1 = gimple_assign_rhs2 (def_stmt);
2265       op2 = gimple_assign_rhs3 (def_stmt);
2266
2267       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2268         {
2269           if (dump_enabled_p ())
2270             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2271                             "reduction: uses not ssa_names: ");
2272
2273           return NULL;
2274         }
2275     }
2276   else
2277     {
2278       op1 = gimple_assign_rhs1 (def_stmt);
2279       op2 = gimple_assign_rhs2 (def_stmt);
2280
2281       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2282         {
2283           if (dump_enabled_p ())
2284             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2285                             "reduction: uses not ssa_names: ");
2286
2287           return NULL;
2288         }
2289    }
2290
2291   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2292   if ((TREE_CODE (op1) == SSA_NAME
2293        && !types_compatible_p (type,TREE_TYPE (op1)))
2294       || (TREE_CODE (op2) == SSA_NAME
2295           && !types_compatible_p (type, TREE_TYPE (op2)))
2296       || (op3 && TREE_CODE (op3) == SSA_NAME
2297           && !types_compatible_p (type, TREE_TYPE (op3)))
2298       || (op4 && TREE_CODE (op4) == SSA_NAME
2299           && !types_compatible_p (type, TREE_TYPE (op4))))
2300     {
2301       if (dump_enabled_p ())
2302         {
2303           dump_printf_loc (MSG_NOTE, vect_location,
2304                            "reduction: multiple types: operation type: ");
2305           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2306           dump_printf (MSG_NOTE, ", operands types: ");
2307           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2308                              TREE_TYPE (op1));
2309           dump_printf (MSG_NOTE, ",");
2310           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2311                              TREE_TYPE (op2));
2312           if (op3)
2313             {
2314               dump_printf (MSG_NOTE, ",");
2315               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2316                                  TREE_TYPE (op3));
2317             }
2318
2319           if (op4)
2320             {
2321               dump_printf (MSG_NOTE, ",");
2322               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323                                  TREE_TYPE (op4));
2324             }
2325         }
2326
2327       return NULL;
2328     }
2329
2330   /* Check that it's ok to change the order of the computation.
2331      Generally, when vectorizing a reduction we change the order of the
2332      computation.  This may change the behavior of the program in some
2333      cases, so we need to check that this is ok.  One exception is when
2334      vectorizing an outer-loop: the inner-loop is executed sequentially,
2335      and therefore vectorizing reductions in the inner-loop during
2336      outer-loop vectorization is safe.  */
2337
2338   /* CHECKME: check for !flag_finite_math_only too?  */
2339   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2340       && check_reduction)
2341     {
2342       /* Changing the order of operations changes the semantics.  */
2343       if (dump_enabled_p ())
2344         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2345                         "reduction: unsafe fp math optimization: ");
2346       return NULL;
2347     }
2348   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2349            && check_reduction)
2350     {
2351       /* Changing the order of operations changes the semantics.  */
2352       if (dump_enabled_p ())
2353         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2354                         "reduction: unsafe int math optimization: ");
2355       return NULL;
2356     }
2357   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2358     {
2359       /* Changing the order of operations changes the semantics.  */
2360       if (dump_enabled_p ())
2361         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2362                         "reduction: unsafe fixed-point math optimization: ");
2363       return NULL;
2364     }
2365
2366   /* If we detected "res -= x[i]" earlier, rewrite it into
2367      "res += -x[i]" now.  If this turns out to be useless reassoc
2368      will clean it up again.  */
2369   if (orig_code == MINUS_EXPR)
2370     {
2371       tree rhs = gimple_assign_rhs2 (def_stmt);
2372       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2373       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2374                                                          rhs, NULL);
2375       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2376       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2377                                                           loop_info, NULL));
2378       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2379       gimple_assign_set_rhs2 (def_stmt, negrhs);
2380       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2381       update_stmt (def_stmt);
2382     }
2383
2384   /* Reduction is safe. We're dealing with one of the following:
2385      1) integer arithmetic and no trapv
2386      2) floating point arithmetic, and special flags permit this optimization
2387      3) nested cycle (i.e., outer loop vectorization).  */
2388   if (TREE_CODE (op1) == SSA_NAME)
2389     def1 = SSA_NAME_DEF_STMT (op1);
2390
2391   if (TREE_CODE (op2) == SSA_NAME)
2392     def2 = SSA_NAME_DEF_STMT (op2);
2393
2394   if (code != COND_EXPR
2395       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2396     {
2397       if (dump_enabled_p ())
2398         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2399       return NULL;
2400     }
2401
2402   /* Check that one def is the reduction def, defined by PHI,
2403      the other def is either defined in the loop ("vect_internal_def"),
2404      or it's an induction (defined by a loop-header phi-node).  */
2405
2406   if (def2 && def2 == phi
2407       && (code == COND_EXPR
2408           || !def1 || gimple_nop_p (def1)
2409           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2410               && (is_gimple_assign (def1)
2411                   || is_gimple_call (def1)
2412                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2413                       == vect_induction_def
2414                   || (gimple_code (def1) == GIMPLE_PHI
2415                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2416                           == vect_internal_def
2417                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2418     {
2419       if (dump_enabled_p ())
2420         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2421       return def_stmt;
2422     }
2423
2424   if (def1 && def1 == phi
2425       && (code == COND_EXPR
2426           || !def2 || gimple_nop_p (def2)
2427           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2428               && (is_gimple_assign (def2)
2429                   || is_gimple_call (def2)
2430                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2431                       == vect_induction_def
2432                   || (gimple_code (def2) == GIMPLE_PHI
2433                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2434                           == vect_internal_def
2435                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2436     {
2437       if (check_reduction)
2438         {
2439           /* Swap operands (just for simplicity - so that the rest of the code
2440              can assume that the reduction variable is always the last (second)
2441              argument).  */
2442           if (dump_enabled_p ())
2443             report_vect_op (MSG_NOTE, def_stmt,
2444                             "detected reduction: need to swap operands: ");
2445
2446           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2447                               gimple_assign_rhs2_ptr (def_stmt));
2448
2449           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2450             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2451         }
2452       else
2453         {
2454           if (dump_enabled_p ())
2455             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2456         }
2457
2458       return def_stmt;
2459     }
2460
2461   /* Try to find SLP reduction chain.  */
2462   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2463     {
2464       if (dump_enabled_p ())
2465         report_vect_op (MSG_NOTE, def_stmt,
2466                         "reduction: detected reduction chain: ");
2467
2468       return def_stmt;
2469     }
2470
2471   if (dump_enabled_p ())
2472     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2473                     "reduction: unknown pattern: ");
2474
2475   return NULL;
2476 }
2477
2478 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2479    in-place.  Arguments as there.  */
2480
2481 static gimple
2482 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2483                           bool check_reduction, bool *double_reduc)
2484 {
2485   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2486                                      double_reduc, false);
2487 }
2488
2489 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2490    in-place if it enables detection of more reductions.  Arguments
2491    as there.  */
2492
2493 gimple
2494 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2495                           bool check_reduction, bool *double_reduc)
2496 {
2497   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498                                      double_reduc, true);
2499 }
2500
2501 /* Calculate the cost of one scalar iteration of the loop.  */
2502 int
2503 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2504 {
2505   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2506   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2507   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2508   int innerloop_iters, i, stmt_cost;
2509
2510   /* Count statements in scalar loop.  Using this as scalar cost for a single
2511      iteration for now.
2512
2513      TODO: Add outer loop support.
2514
2515      TODO: Consider assigning different costs to different scalar
2516      statements.  */
2517
2518   /* FORNOW.  */
2519   innerloop_iters = 1;
2520   if (loop->inner)
2521     innerloop_iters = 50; /* FIXME */
2522
2523   for (i = 0; i < nbbs; i++)
2524     {
2525       gimple_stmt_iterator si;
2526       basic_block bb = bbs[i];
2527
2528       if (bb->loop_father == loop->inner)
2529         factor = innerloop_iters;
2530       else
2531         factor = 1;
2532
2533       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2534         {
2535           gimple stmt = gsi_stmt (si);
2536           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537
2538           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2539             continue;
2540
2541           /* Skip stmts that are not vectorized inside the loop.  */
2542           if (stmt_info
2543               && !STMT_VINFO_RELEVANT_P (stmt_info)
2544               && (!STMT_VINFO_LIVE_P (stmt_info)
2545                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2546               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2547             continue;
2548
2549           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2550             {
2551               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2552                stmt_cost = vect_get_stmt_cost (scalar_load);
2553              else
2554                stmt_cost = vect_get_stmt_cost (scalar_store);
2555             }
2556           else
2557             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2558
2559           scalar_single_iter_cost += stmt_cost * factor;
2560         }
2561     }
2562   return scalar_single_iter_cost;
2563 }
2564
2565 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2566 int
2567 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2568                              int *peel_iters_epilogue,
2569                              int scalar_single_iter_cost,
2570                              stmt_vector_for_cost *prologue_cost_vec,
2571                              stmt_vector_for_cost *epilogue_cost_vec)
2572 {
2573   int retval = 0;
2574   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2575
2576   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2577     {
2578       *peel_iters_epilogue = vf/2;
2579       if (dump_enabled_p ())
2580         dump_printf_loc (MSG_NOTE, vect_location,
2581                          "cost model: epilogue peel iters set to vf/2 "
2582                          "because loop iterations are unknown .");
2583
2584       /* If peeled iterations are known but number of scalar loop
2585          iterations are unknown, count a taken branch per peeled loop.  */
2586       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2587                                  NULL, 0, vect_prologue);
2588     }
2589   else
2590     {
2591       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2592       peel_iters_prologue = niters < peel_iters_prologue ?
2593                             niters : peel_iters_prologue;
2594       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2595       /* If we need to peel for gaps, but no peeling is required, we have to
2596          peel VF iterations.  */
2597       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2598         *peel_iters_epilogue = vf;
2599     }
2600
2601   if (peel_iters_prologue)
2602     retval += record_stmt_cost (prologue_cost_vec,
2603                                 peel_iters_prologue * scalar_single_iter_cost,
2604                                 scalar_stmt, NULL, 0, vect_prologue);
2605   if (*peel_iters_epilogue)
2606     retval += record_stmt_cost (epilogue_cost_vec,
2607                                 *peel_iters_epilogue * scalar_single_iter_cost,
2608                                 scalar_stmt, NULL, 0, vect_epilogue);
2609   return retval;
2610 }
2611
2612 /* Function vect_estimate_min_profitable_iters
2613
2614    Return the number of iterations required for the vector version of the
2615    loop to be profitable relative to the cost of the scalar version of the
2616    loop.  */
2617
2618 static void
2619 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2620                                     int *ret_min_profitable_niters,
2621                                     int *ret_min_profitable_estimate)
2622 {
2623   int min_profitable_iters;
2624   int min_profitable_estimate;
2625   int peel_iters_prologue;
2626   int peel_iters_epilogue;
2627   unsigned vec_inside_cost = 0;
2628   int vec_outside_cost = 0;
2629   unsigned vec_prologue_cost = 0;
2630   unsigned vec_epilogue_cost = 0;
2631   int scalar_single_iter_cost = 0;
2632   int scalar_outside_cost = 0;
2633   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2634   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2635   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2636
2637   /* Cost model disabled.  */
2638   if (!flag_vect_cost_model)
2639     {
2640       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2641       *ret_min_profitable_niters = 0;
2642       *ret_min_profitable_estimate = 0;
2643       return;
2644     }
2645
2646   /* Requires loop versioning tests to handle misalignment.  */
2647   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2648     {
2649       /*  FIXME: Make cost depend on complexity of individual check.  */
2650       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2651       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2652                             vect_prologue);
2653       dump_printf (MSG_NOTE,
2654                    "cost model: Adding cost of checks for loop "
2655                    "versioning to treat misalignment.\n");
2656     }
2657
2658   /* Requires loop versioning with alias checks.  */
2659   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2660     {
2661       /*  FIXME: Make cost depend on complexity of individual check.  */
2662       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2663       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2664                             vect_prologue);
2665       dump_printf (MSG_NOTE,
2666                    "cost model: Adding cost of checks for loop "
2667                    "versioning aliasing.\n");
2668     }
2669
2670   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2671       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2672     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2673                           vect_prologue);
2674
2675   /* Count statements in scalar loop.  Using this as scalar cost for a single
2676      iteration for now.
2677
2678      TODO: Add outer loop support.
2679
2680      TODO: Consider assigning different costs to different scalar
2681      statements.  */
2682
2683   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2684
2685   /* Add additional cost for the peeled instructions in prologue and epilogue
2686      loop.
2687
2688      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2689      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2690
2691      TODO: Build an expression that represents peel_iters for prologue and
2692      epilogue to be used in a run-time test.  */
2693
2694   if (npeel  < 0)
2695     {
2696       peel_iters_prologue = vf/2;
2697       dump_printf (MSG_NOTE, "cost model: "
2698                    "prologue peel iters set to vf/2.");
2699
2700       /* If peeling for alignment is unknown, loop bound of main loop becomes
2701          unknown.  */
2702       peel_iters_epilogue = vf/2;
2703       dump_printf (MSG_NOTE, "cost model: "
2704                    "epilogue peel iters set to vf/2 because "
2705                    "peeling for alignment is unknown.");
2706
2707       /* If peeled iterations are unknown, count a taken branch and a not taken
2708          branch per peeled loop. Even if scalar loop iterations are known,
2709          vector iterations are not known since peeled prologue iterations are
2710          not known. Hence guards remain the same.  */
2711       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2712                             NULL, 0, vect_prologue);
2713       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2714                             NULL, 0, vect_prologue);
2715       /* FORNOW: Don't attempt to pass individual scalar instructions to
2716          the model; just assume linear cost for scalar iterations.  */
2717       (void) add_stmt_cost (target_cost_data,
2718                             peel_iters_prologue * scalar_single_iter_cost,
2719                             scalar_stmt, NULL, 0, vect_prologue);
2720       (void) add_stmt_cost (target_cost_data,
2721                             peel_iters_epilogue * scalar_single_iter_cost,
2722                             scalar_stmt, NULL, 0, vect_epilogue);
2723     }
2724   else
2725     {
2726       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2727       stmt_info_for_cost *si;
2728       int j;
2729       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2730
2731       prologue_cost_vec.create (2);
2732       epilogue_cost_vec.create (2);
2733       peel_iters_prologue = npeel;
2734
2735       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2736                                           &peel_iters_epilogue,
2737                                           scalar_single_iter_cost,
2738                                           &prologue_cost_vec,
2739                                           &epilogue_cost_vec);
2740
2741       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2742         {
2743           struct _stmt_vec_info *stmt_info
2744             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2745           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2746                                 si->misalign, vect_prologue);
2747         }
2748
2749       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2750         {
2751           struct _stmt_vec_info *stmt_info
2752             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2753           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2754                                 si->misalign, vect_epilogue);
2755         }
2756
2757       prologue_cost_vec.release ();
2758       epilogue_cost_vec.release ();
2759     }
2760
2761   /* FORNOW: The scalar outside cost is incremented in one of the
2762      following ways:
2763
2764      1. The vectorizer checks for alignment and aliasing and generates
2765      a condition that allows dynamic vectorization.  A cost model
2766      check is ANDED with the versioning condition.  Hence scalar code
2767      path now has the added cost of the versioning check.
2768
2769        if (cost > th & versioning_check)
2770          jmp to vector code
2771
2772      Hence run-time scalar is incremented by not-taken branch cost.
2773
2774      2. The vectorizer then checks if a prologue is required.  If the
2775      cost model check was not done before during versioning, it has to
2776      be done before the prologue check.
2777
2778        if (cost <= th)
2779          prologue = scalar_iters
2780        if (prologue == 0)
2781          jmp to vector code
2782        else
2783          execute prologue
2784        if (prologue == num_iters)
2785          go to exit
2786
2787      Hence the run-time scalar cost is incremented by a taken branch,
2788      plus a not-taken branch, plus a taken branch cost.
2789
2790      3. The vectorizer then checks if an epilogue is required.  If the
2791      cost model check was not done before during prologue check, it
2792      has to be done with the epilogue check.
2793
2794        if (prologue == 0)
2795          jmp to vector code
2796        else
2797          execute prologue
2798        if (prologue == num_iters)
2799          go to exit
2800        vector code:
2801          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2802            jmp to epilogue
2803
2804      Hence the run-time scalar cost should be incremented by 2 taken
2805      branches.
2806
2807      TODO: The back end may reorder the BBS's differently and reverse
2808      conditions/branch directions.  Change the estimates below to
2809      something more reasonable.  */
2810
2811   /* If the number of iterations is known and we do not do versioning, we can
2812      decide whether to vectorize at compile time.  Hence the scalar version
2813      do not carry cost model guard costs.  */
2814   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2815       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2816       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2817     {
2818       /* Cost model check occurs at versioning.  */
2819       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2820           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2821         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2822       else
2823         {
2824           /* Cost model check occurs at prologue generation.  */
2825           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2826             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2827               + vect_get_stmt_cost (cond_branch_not_taken);
2828           /* Cost model check occurs at epilogue generation.  */
2829           else
2830             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2831         }
2832     }
2833
2834   /* Complete the target-specific cost calculations.  */
2835   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2836                &vec_inside_cost, &vec_epilogue_cost);
2837
2838   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2839
2840   /* Calculate number of iterations required to make the vector version
2841      profitable, relative to the loop bodies only.  The following condition
2842      must hold true:
2843      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2844      where
2845      SIC = scalar iteration cost, VIC = vector iteration cost,
2846      VOC = vector outside cost, VF = vectorization factor,
2847      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2848      SOC = scalar outside cost for run time cost model check.  */
2849
2850   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2851     {
2852       if (vec_outside_cost <= 0)
2853         min_profitable_iters = 1;
2854       else
2855         {
2856           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2857                                   - vec_inside_cost * peel_iters_prologue
2858                                   - vec_inside_cost * peel_iters_epilogue)
2859                                  / ((scalar_single_iter_cost * vf)
2860                                     - vec_inside_cost);
2861
2862           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2863               <= (((int) vec_inside_cost * min_profitable_iters)
2864                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2865             min_profitable_iters++;
2866         }
2867     }
2868   /* vector version will never be profitable.  */
2869   else
2870     {
2871       if (dump_enabled_p ())
2872         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2873                          "cost model: the vector iteration cost = %d "
2874                          "divided by the scalar iteration cost = %d "
2875                          "is greater or equal to the vectorization factor = %d.",
2876                          vec_inside_cost, scalar_single_iter_cost, vf);
2877       *ret_min_profitable_niters = -1;
2878       *ret_min_profitable_estimate = -1;
2879       return;
2880     }
2881
2882   if (dump_enabled_p ())
2883     {
2884       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2885       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2886                    vec_inside_cost);
2887       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2888                    vec_prologue_cost);
2889       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2890                    vec_epilogue_cost);
2891       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2892                    scalar_single_iter_cost);
2893       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2894                    scalar_outside_cost);
2895       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2896                    vec_outside_cost);
2897       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2898                    peel_iters_prologue);
2899       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2900                    peel_iters_epilogue);
2901       dump_printf (MSG_NOTE,
2902                    "  Calculated minimum iters for profitability: %d\n",
2903                    min_profitable_iters);
2904     }
2905
2906   min_profitable_iters =
2907         min_profitable_iters < vf ? vf : min_profitable_iters;
2908
2909   /* Because the condition we create is:
2910      if (niters <= min_profitable_iters)
2911        then skip the vectorized loop.  */
2912   min_profitable_iters--;
2913
2914   if (dump_enabled_p ())
2915     dump_printf_loc (MSG_NOTE, vect_location,
2916                      "  Runtime profitability threshold = %d\n", min_profitable_iters);
2917
2918   *ret_min_profitable_niters = min_profitable_iters;
2919
2920   /* Calculate number of iterations required to make the vector version
2921      profitable, relative to the loop bodies only.
2922
2923      Non-vectorized variant is SIC * niters and it must win over vector
2924      variant on the expected loop trip count.  The following condition must hold true:
2925      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2926
2927   if (vec_outside_cost <= 0)
2928     min_profitable_estimate = 1;
2929   else
2930     {
2931       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2932                                  - vec_inside_cost * peel_iters_prologue
2933                                  - vec_inside_cost * peel_iters_epilogue)
2934                                  / ((scalar_single_iter_cost * vf)
2935                                    - vec_inside_cost);
2936     }
2937   min_profitable_estimate --;
2938   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2939   if (dump_enabled_p ())
2940     dump_printf_loc (MSG_NOTE, vect_location,
2941                      "  Static estimate profitability threshold = %d\n",
2942                       min_profitable_iters);
2943
2944   *ret_min_profitable_estimate = min_profitable_estimate;
2945 }
2946
2947
2948 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2949    functions. Design better to avoid maintenance issues.  */
2950
2951 /* Function vect_model_reduction_cost.
2952
2953    Models cost for a reduction operation, including the vector ops
2954    generated within the strip-mine loop, the initial definition before
2955    the loop, and the epilogue code that must be generated.  */
2956
2957 static bool
2958 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2959                            int ncopies)
2960 {
2961   int prologue_cost = 0, epilogue_cost = 0;
2962   enum tree_code code;
2963   optab optab;
2964   tree vectype;
2965   gimple stmt, orig_stmt;
2966   tree reduction_op;
2967   enum machine_mode mode;
2968   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2969   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2970   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2971
2972   /* Cost of reduction op inside loop.  */
2973   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2974                                         stmt_info, 0, vect_body);
2975   stmt = STMT_VINFO_STMT (stmt_info);
2976
2977   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2978     {
2979     case GIMPLE_SINGLE_RHS:
2980       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2981       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2982       break;
2983     case GIMPLE_UNARY_RHS:
2984       reduction_op = gimple_assign_rhs1 (stmt);
2985       break;
2986     case GIMPLE_BINARY_RHS:
2987       reduction_op = gimple_assign_rhs2 (stmt);
2988       break;
2989     case GIMPLE_TERNARY_RHS:
2990       reduction_op = gimple_assign_rhs3 (stmt);
2991       break;
2992     default:
2993       gcc_unreachable ();
2994     }
2995
2996   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2997   if (!vectype)
2998     {
2999       if (dump_enabled_p ())
3000         {
3001           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002                            "unsupported data-type ");
3003           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3004                              TREE_TYPE (reduction_op));
3005         }
3006       return false;
3007    }
3008
3009   mode = TYPE_MODE (vectype);
3010   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3011
3012   if (!orig_stmt)
3013     orig_stmt = STMT_VINFO_STMT (stmt_info);
3014
3015   code = gimple_assign_rhs_code (orig_stmt);
3016
3017   /* Add in cost for initial definition.  */
3018   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3019                                   stmt_info, 0, vect_prologue);
3020
3021   /* Determine cost of epilogue code.
3022
3023      We have a reduction operator that will reduce the vector in one statement.
3024      Also requires scalar extract.  */
3025
3026   if (!nested_in_vect_loop_p (loop, orig_stmt))
3027     {
3028       if (reduc_code != ERROR_MARK)
3029         {
3030           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3031                                           stmt_info, 0, vect_epilogue);
3032           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3033                                           stmt_info, 0, vect_epilogue);
3034         }
3035       else
3036         {
3037           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3038           tree bitsize =
3039             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3040           int element_bitsize = tree_low_cst (bitsize, 1);
3041           int nelements = vec_size_in_bits / element_bitsize;
3042
3043           optab = optab_for_tree_code (code, vectype, optab_default);
3044
3045           /* We have a whole vector shift available.  */
3046           if (VECTOR_MODE_P (mode)
3047               && optab_handler (optab, mode) != CODE_FOR_nothing
3048               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3049             {
3050               /* Final reduction via vector shifts and the reduction operator.
3051                  Also requires scalar extract.  */
3052               epilogue_cost += add_stmt_cost (target_cost_data,
3053                                               exact_log2 (nelements) * 2,
3054                                               vector_stmt, stmt_info, 0,
3055                                               vect_epilogue);
3056               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3057                                               vec_to_scalar, stmt_info, 0,
3058                                               vect_epilogue);
3059             }
3060           else
3061             /* Use extracts and reduction op for final reduction.  For N
3062                elements, we have N extracts and N-1 reduction ops.  */
3063             epilogue_cost += add_stmt_cost (target_cost_data,
3064                                             nelements + nelements - 1,
3065                                             vector_stmt, stmt_info, 0,
3066                                             vect_epilogue);
3067         }
3068     }
3069
3070   if (dump_enabled_p ())
3071     dump_printf (MSG_NOTE,
3072                  "vect_model_reduction_cost: inside_cost = %d, "
3073                  "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3074                  prologue_cost, epilogue_cost);
3075
3076   return true;
3077 }
3078
3079
3080 /* Function vect_model_induction_cost.
3081
3082    Models cost for induction operations.  */
3083
3084 static void
3085 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3086 {
3087   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3088   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3089   unsigned inside_cost, prologue_cost;
3090
3091   /* loop cost for vec_loop.  */
3092   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3093                                stmt_info, 0, vect_body);
3094
3095   /* prologue cost for vec_init and vec_step.  */
3096   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3097                                  stmt_info, 0, vect_prologue);
3098
3099   if (dump_enabled_p ())
3100     dump_printf_loc (MSG_NOTE, vect_location,
3101                      "vect_model_induction_cost: inside_cost = %d, "
3102                      "prologue_cost = %d .", inside_cost, prologue_cost);
3103 }
3104
3105
3106 /* Function get_initial_def_for_induction
3107
3108    Input:
3109    STMT - a stmt that performs an induction operation in the loop.
3110    IV_PHI - the initial value of the induction variable
3111
3112    Output:
3113    Return a vector variable, initialized with the first VF values of
3114    the induction variable.  E.g., for an iv with IV_PHI='X' and
3115    evolution S, for a vector of 4 units, we want to return:
3116    [X, X + S, X + 2*S, X + 3*S].  */
3117
3118 static tree
3119 get_initial_def_for_induction (gimple iv_phi)
3120 {
3121   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3122   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3123   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3124   tree scalar_type;
3125   tree vectype;
3126   int nunits;
3127   edge pe = loop_preheader_edge (loop);
3128   struct loop *iv_loop;
3129   basic_block new_bb;
3130   tree new_vec, vec_init, vec_step, t;
3131   tree access_fn;
3132   tree new_var;
3133   tree new_name;
3134   gimple init_stmt, induction_phi, new_stmt;
3135   tree induc_def, vec_def, vec_dest;
3136   tree init_expr, step_expr;
3137   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3138   int i;
3139   bool ok;
3140   int ncopies;
3141   tree expr;
3142   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3143   bool nested_in_vect_loop = false;
3144   gimple_seq stmts = NULL;
3145   imm_use_iterator imm_iter;
3146   use_operand_p use_p;
3147   gimple exit_phi;
3148   edge latch_e;
3149   tree loop_arg;
3150   gimple_stmt_iterator si;
3151   basic_block bb = gimple_bb (iv_phi);
3152   tree stepvectype;
3153   tree resvectype;
3154
3155   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3156   if (nested_in_vect_loop_p (loop, iv_phi))
3157     {
3158       nested_in_vect_loop = true;
3159       iv_loop = loop->inner;
3160     }
3161   else
3162     iv_loop = loop;
3163   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3164
3165   latch_e = loop_latch_edge (iv_loop);
3166   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3167
3168   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3169   gcc_assert (access_fn);
3170   STRIP_NOPS (access_fn);
3171   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3172                                     &init_expr, &step_expr);
3173   gcc_assert (ok);
3174   pe = loop_preheader_edge (iv_loop);
3175
3176   scalar_type = TREE_TYPE (init_expr);
3177   vectype = get_vectype_for_scalar_type (scalar_type);
3178   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3179   gcc_assert (vectype);
3180   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3181   ncopies = vf / nunits;
3182
3183   gcc_assert (phi_info);
3184   gcc_assert (ncopies >= 1);
3185
3186   /* Find the first insertion point in the BB.  */
3187   si = gsi_after_labels (bb);
3188
3189   /* Create the vector that holds the initial_value of the induction.  */
3190   if (nested_in_vect_loop)
3191     {
3192       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3193          been created during vectorization of previous stmts.  We obtain it
3194          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3195       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3196                                            loop_preheader_edge (iv_loop));
3197       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3198       /* If the initial value is not of proper type, convert it.  */
3199       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3200         {
3201           new_stmt = gimple_build_assign_with_ops
3202               (VIEW_CONVERT_EXPR,
3203                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3204                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3205           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3206           gimple_assign_set_lhs (new_stmt, vec_init);
3207           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3208                                                  new_stmt);
3209           gcc_assert (!new_bb);
3210           set_vinfo_for_stmt (new_stmt,
3211                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3212         }
3213     }
3214   else
3215     {
3216       vec<constructor_elt, va_gc> *v;
3217
3218       /* iv_loop is the loop to be vectorized. Create:
3219          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3220       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3221       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3222       if (stmts)
3223         {
3224           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3225           gcc_assert (!new_bb);
3226         }
3227
3228       vec_alloc (v, nunits);
3229       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3230       for (i = 1; i < nunits; i++)
3231         {
3232           /* Create: new_name_i = new_name + step_expr  */
3233           enum tree_code code = POINTER_TYPE_P (scalar_type)
3234                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3235           init_stmt = gimple_build_assign_with_ops (code, new_var,
3236                                                     new_name, step_expr);
3237           new_name = make_ssa_name (new_var, init_stmt);
3238           gimple_assign_set_lhs (init_stmt, new_name);
3239
3240           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3241           gcc_assert (!new_bb);
3242
3243           if (dump_enabled_p ())
3244             {
3245               dump_printf_loc (MSG_NOTE, vect_location,
3246                                "created new init_stmt: ");
3247               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3248             }
3249           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3250         }
3251       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3252       new_vec = build_constructor (vectype, v);
3253       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3254     }
3255
3256
3257   /* Create the vector that holds the step of the induction.  */
3258   if (nested_in_vect_loop)
3259     /* iv_loop is nested in the loop to be vectorized. Generate:
3260        vec_step = [S, S, S, S]  */
3261     new_name = step_expr;
3262   else
3263     {
3264       /* iv_loop is the loop to be vectorized. Generate:
3265           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3266       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3267       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3268                               expr, step_expr);
3269     }
3270
3271   t = unshare_expr (new_name);
3272   gcc_assert (CONSTANT_CLASS_P (new_name));
3273   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3274   gcc_assert (stepvectype);
3275   new_vec = build_vector_from_val (stepvectype, t);
3276   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3277
3278
3279   /* Create the following def-use cycle:
3280      loop prolog:
3281          vec_init = ...
3282          vec_step = ...
3283      loop:
3284          vec_iv = PHI <vec_init, vec_loop>
3285          ...
3286          STMT
3287          ...
3288          vec_loop = vec_iv + vec_step;  */
3289
3290   /* Create the induction-phi that defines the induction-operand.  */
3291   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3292   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3293   set_vinfo_for_stmt (induction_phi,
3294                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3295   induc_def = PHI_RESULT (induction_phi);
3296
3297   /* Create the iv update inside the loop  */
3298   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3299                                            induc_def, vec_step);
3300   vec_def = make_ssa_name (vec_dest, new_stmt);
3301   gimple_assign_set_lhs (new_stmt, vec_def);
3302   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3303   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3304                                                    NULL));
3305
3306   /* Set the arguments of the phi node:  */
3307   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3308   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3309                UNKNOWN_LOCATION);
3310
3311
3312   /* In case that vectorization factor (VF) is bigger than the number
3313      of elements that we can fit in a vectype (nunits), we have to generate
3314      more than one vector stmt - i.e - we need to "unroll" the
3315      vector stmt by a factor VF/nunits.  For more details see documentation
3316      in vectorizable_operation.  */
3317
3318   if (ncopies > 1)
3319     {
3320       stmt_vec_info prev_stmt_vinfo;
3321       /* FORNOW. This restriction should be relaxed.  */
3322       gcc_assert (!nested_in_vect_loop);
3323
3324       /* Create the vector that holds the step of the induction.  */
3325       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3326       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3327                               expr, step_expr);
3328       t = unshare_expr (new_name);
3329       gcc_assert (CONSTANT_CLASS_P (new_name));
3330       new_vec = build_vector_from_val (stepvectype, t);
3331       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3332
3333       vec_def = induc_def;
3334       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3335       for (i = 1; i < ncopies; i++)
3336         {
3337           /* vec_i = vec_prev + vec_step  */
3338           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3339                                                    vec_def, vec_step);
3340           vec_def = make_ssa_name (vec_dest, new_stmt);
3341           gimple_assign_set_lhs (new_stmt, vec_def);
3342
3343           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3344           if (!useless_type_conversion_p (resvectype, vectype))
3345             {
3346               new_stmt = gimple_build_assign_with_ops
3347                   (VIEW_CONVERT_EXPR,
3348                    vect_get_new_vect_var (resvectype, vect_simple_var,
3349                                           "vec_iv_"),
3350                    build1 (VIEW_CONVERT_EXPR, resvectype,
3351                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3352               gimple_assign_set_lhs (new_stmt,
3353                                      make_ssa_name
3354                                        (gimple_assign_lhs (new_stmt), new_stmt));
3355               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3356             }
3357           set_vinfo_for_stmt (new_stmt,
3358                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3359           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3360           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3361         }
3362     }
3363
3364   if (nested_in_vect_loop)
3365     {
3366       /* Find the loop-closed exit-phi of the induction, and record
3367          the final vector of induction results:  */
3368       exit_phi = NULL;
3369       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3370         {
3371           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3372             {
3373               exit_phi = USE_STMT (use_p);
3374               break;
3375             }
3376         }
3377       if (exit_phi)
3378         {
3379           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3380           /* FORNOW. Currently not supporting the case that an inner-loop induction
3381              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3382           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3383                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3384
3385           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3386           if (dump_enabled_p ())
3387             {
3388               dump_printf_loc (MSG_NOTE, vect_location,
3389                                "vector of inductions after inner-loop:");
3390               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3391             }
3392         }
3393     }
3394
3395
3396   if (dump_enabled_p ())
3397     {
3398       dump_printf_loc (MSG_NOTE, vect_location,
3399                        "transform induction: created def-use cycle: ");
3400       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3401       dump_printf (MSG_NOTE, "\n");
3402       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3403                         SSA_NAME_DEF_STMT (vec_def), 0);
3404     }
3405
3406   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3407   if (!useless_type_conversion_p (resvectype, vectype))
3408     {
3409       new_stmt = gimple_build_assign_with_ops
3410          (VIEW_CONVERT_EXPR,
3411           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3412           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3413       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3414       gimple_assign_set_lhs (new_stmt, induc_def);
3415       si = gsi_after_labels (bb);
3416       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3417       set_vinfo_for_stmt (new_stmt,
3418                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3419       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3420         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3421     }
3422
3423   return induc_def;
3424 }
3425
3426
3427 /* Function get_initial_def_for_reduction
3428
3429    Input:
3430    STMT - a stmt that performs a reduction operation in the loop.
3431    INIT_VAL - the initial value of the reduction variable
3432
3433    Output:
3434    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3435         of the reduction (used for adjusting the epilog - see below).
3436    Return a vector variable, initialized according to the operation that STMT
3437         performs. This vector will be used as the initial value of the
3438         vector of partial results.
3439
3440    Option1 (adjust in epilog): Initialize the vector as follows:
3441      add/bit or/xor:    [0,0,...,0,0]
3442      mult/bit and:      [1,1,...,1,1]
3443      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3444    and when necessary (e.g. add/mult case) let the caller know
3445    that it needs to adjust the result by init_val.
3446
3447    Option2: Initialize the vector as follows:
3448      add/bit or/xor:    [init_val,0,0,...,0]
3449      mult/bit and:      [init_val,1,1,...,1]
3450      min/max/cond_expr: [init_val,init_val,...,init_val]
3451    and no adjustments are needed.
3452
3453    For example, for the following code:
3454
3455    s = init_val;
3456    for (i=0;i<n;i++)
3457      s = s + a[i];
3458
3459    STMT is 's = s + a[i]', and the reduction variable is 's'.
3460    For a vector of 4 units, we want to return either [0,0,0,init_val],
3461    or [0,0,0,0] and let the caller know that it needs to adjust
3462    the result at the end by 'init_val'.
3463
3464    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3465    initialization vector is simpler (same element in all entries), if
3466    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3467
3468    A cost model should help decide between these two schemes.  */
3469
3470 tree
3471 get_initial_def_for_reduction (gimple stmt, tree init_val,
3472                                tree *adjustment_def)
3473 {
3474   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3475   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3476   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3477   tree scalar_type = TREE_TYPE (init_val);
3478   tree vectype = get_vectype_for_scalar_type (scalar_type);
3479   int nunits;
3480   enum tree_code code = gimple_assign_rhs_code (stmt);
3481   tree def_for_init;
3482   tree init_def;
3483   tree *elts;
3484   int i;
3485   bool nested_in_vect_loop = false;
3486   tree init_value;
3487   REAL_VALUE_TYPE real_init_val = dconst0;
3488   int int_init_val = 0;
3489   gimple def_stmt = NULL;
3490
3491   gcc_assert (vectype);
3492   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3493
3494   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3495               || SCALAR_FLOAT_TYPE_P (scalar_type));
3496
3497   if (nested_in_vect_loop_p (loop, stmt))
3498     nested_in_vect_loop = true;
3499   else
3500     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3501
3502   /* In case of double reduction we only create a vector variable to be put
3503      in the reduction phi node.  The actual statement creation is done in
3504      vect_create_epilog_for_reduction.  */
3505   if (adjustment_def && nested_in_vect_loop
3506       && TREE_CODE (init_val) == SSA_NAME
3507       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3508       && gimple_code (def_stmt) == GIMPLE_PHI
3509       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3510       && vinfo_for_stmt (def_stmt)
3511       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3512           == vect_double_reduction_def)
3513     {
3514       *adjustment_def = NULL;
3515       return vect_create_destination_var (init_val, vectype);
3516     }
3517
3518   if (TREE_CONSTANT (init_val))
3519     {
3520       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3521         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3522       else
3523         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3524     }
3525   else
3526     init_value = init_val;
3527
3528   switch (code)
3529     {
3530       case WIDEN_SUM_EXPR:
3531       case DOT_PROD_EXPR:
3532       case PLUS_EXPR:
3533       case MINUS_EXPR:
3534       case BIT_IOR_EXPR:
3535       case BIT_XOR_EXPR:
3536       case MULT_EXPR:
3537       case BIT_AND_EXPR:
3538         /* ADJUSMENT_DEF is NULL when called from
3539            vect_create_epilog_for_reduction to vectorize double reduction.  */
3540         if (adjustment_def)
3541           {
3542             if (nested_in_vect_loop)
3543               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3544                                                               NULL);
3545             else
3546               *adjustment_def = init_val;
3547           }
3548
3549         if (code == MULT_EXPR)
3550           {
3551             real_init_val = dconst1;
3552             int_init_val = 1;
3553           }
3554
3555         if (code == BIT_AND_EXPR)
3556           int_init_val = -1;
3557
3558         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3559           def_for_init = build_real (scalar_type, real_init_val);
3560         else
3561           def_for_init = build_int_cst (scalar_type, int_init_val);
3562
3563         /* Create a vector of '0' or '1' except the first element.  */
3564         elts = XALLOCAVEC (tree, nunits);
3565         for (i = nunits - 2; i >= 0; --i)
3566           elts[i + 1] = def_for_init;
3567
3568         /* Option1: the first element is '0' or '1' as well.  */
3569         if (adjustment_def)
3570           {
3571             elts[0] = def_for_init;
3572             init_def = build_vector (vectype, elts);
3573             break;
3574           }
3575
3576         /* Option2: the first element is INIT_VAL.  */
3577         elts[0] = init_val;
3578         if (TREE_CONSTANT (init_val))
3579           init_def = build_vector (vectype, elts);
3580         else
3581           {
3582             vec<constructor_elt, va_gc> *v;
3583             vec_alloc (v, nunits);
3584             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3585             for (i = 1; i < nunits; ++i)
3586               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3587             init_def = build_constructor (vectype, v);
3588           }
3589
3590         break;
3591
3592       case MIN_EXPR:
3593       case MAX_EXPR:
3594       case COND_EXPR:
3595         if (adjustment_def)
3596           {
3597             *adjustment_def = NULL_TREE;
3598             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3599             break;
3600           }
3601
3602         init_def = build_vector_from_val (vectype, init_value);
3603         break;
3604
3605       default:
3606         gcc_unreachable ();
3607     }
3608
3609   return init_def;
3610 }
3611
3612
3613 /* Function vect_create_epilog_for_reduction
3614
3615    Create code at the loop-epilog to finalize the result of a reduction
3616    computation.
3617
3618    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3619      reduction statements.
3620    STMT is the scalar reduction stmt that is being vectorized.
3621    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3622      number of elements that we can fit in a vectype (nunits).  In this case
3623      we have to generate more than one vector stmt - i.e - we need to "unroll"
3624      the vector stmt by a factor VF/nunits.  For more details see documentation
3625      in vectorizable_operation.
3626    REDUC_CODE is the tree-code for the epilog reduction.
3627    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3628      computation.
3629    REDUC_INDEX is the index of the operand in the right hand side of the
3630      statement that is defined by REDUCTION_PHI.
3631    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3632    SLP_NODE is an SLP node containing a group of reduction statements. The
3633      first one in this group is STMT.
3634
3635    This function:
3636    1. Creates the reduction def-use cycles: sets the arguments for
3637       REDUCTION_PHIS:
3638       The loop-entry argument is the vectorized initial-value of the reduction.
3639       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3640       sums.
3641    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3642       by applying the operation specified by REDUC_CODE if available, or by
3643       other means (whole-vector shifts or a scalar loop).
3644       The function also creates a new phi node at the loop exit to preserve
3645       loop-closed form, as illustrated below.
3646
3647      The flow at the entry to this function:
3648
3649         loop:
3650           vec_def = phi <null, null>            # REDUCTION_PHI
3651           VECT_DEF = vector_stmt                # vectorized form of STMT
3652           s_loop = scalar_stmt                  # (scalar) STMT
3653         loop_exit:
3654           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3655           use <s_out0>
3656           use <s_out0>
3657
3658      The above is transformed by this function into:
3659
3660         loop:
3661           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3662           VECT_DEF = vector_stmt                # vectorized form of STMT
3663           s_loop = scalar_stmt                  # (scalar) STMT
3664         loop_exit:
3665           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3666           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3667           v_out2 = reduce <v_out1>
3668           s_out3 = extract_field <v_out2, 0>
3669           s_out4 = adjust_result <s_out3>
3670           use <s_out4>
3671           use <s_out4>
3672 */
3673
3674 static void
3675 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3676                                   int ncopies, enum tree_code reduc_code,
3677                                   vec<gimple> reduction_phis,
3678                                   int reduc_index, bool double_reduc,
3679                                   slp_tree slp_node)
3680 {
3681   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3682   stmt_vec_info prev_phi_info;
3683   tree vectype;
3684   enum machine_mode mode;
3685   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3686   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3687   basic_block exit_bb;
3688   tree scalar_dest;
3689   tree scalar_type;
3690   gimple new_phi = NULL, phi;
3691   gimple_stmt_iterator exit_gsi;
3692   tree vec_dest;
3693   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3694   gimple epilog_stmt = NULL;
3695   enum tree_code code = gimple_assign_rhs_code (stmt);
3696   gimple exit_phi;
3697   tree bitsize, bitpos;
3698   tree adjustment_def = NULL;
3699   tree vec_initial_def = NULL;
3700   tree reduction_op, expr, def;
3701   tree orig_name, scalar_result;
3702   imm_use_iterator imm_iter, phi_imm_iter;
3703   use_operand_p use_p, phi_use_p;
3704   bool extract_scalar_result = false;
3705   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3706   bool nested_in_vect_loop = false;
3707   vec<gimple> new_phis = vNULL;
3708   vec<gimple> inner_phis = vNULL;
3709   enum vect_def_type dt = vect_unknown_def_type;
3710   int j, i;
3711   vec<tree> scalar_results = vNULL;
3712   unsigned int group_size = 1, k, ratio;
3713   vec<tree> vec_initial_defs = vNULL;
3714   vec<gimple> phis;
3715   bool slp_reduc = false;
3716   tree new_phi_result;
3717   gimple inner_phi = NULL;
3718
3719   if (slp_node)
3720     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3721
3722   if (nested_in_vect_loop_p (loop, stmt))
3723     {
3724       outer_loop = loop;
3725       loop = loop->inner;
3726       nested_in_vect_loop = true;
3727       gcc_assert (!slp_node);
3728     }
3729
3730   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3731     {
3732     case GIMPLE_SINGLE_RHS:
3733       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3734                   == ternary_op);
3735       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3736       break;
3737     case GIMPLE_UNARY_RHS:
3738       reduction_op = gimple_assign_rhs1 (stmt);
3739       break;
3740     case GIMPLE_BINARY_RHS:
3741       reduction_op = reduc_index ?
3742                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3743       break;
3744     case GIMPLE_TERNARY_RHS:
3745       reduction_op = gimple_op (stmt, reduc_index + 1);
3746       break;
3747     default:
3748       gcc_unreachable ();
3749     }
3750
3751   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3752   gcc_assert (vectype);
3753   mode = TYPE_MODE (vectype);
3754
3755   /* 1. Create the reduction def-use cycle:
3756      Set the arguments of REDUCTION_PHIS, i.e., transform
3757
3758         loop:
3759           vec_def = phi <null, null>            # REDUCTION_PHI
3760           VECT_DEF = vector_stmt                # vectorized form of STMT
3761           ...
3762
3763      into:
3764
3765         loop:
3766           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3767           VECT_DEF = vector_stmt                # vectorized form of STMT
3768           ...
3769
3770      (in case of SLP, do it for all the phis). */
3771
3772   /* Get the loop-entry arguments.  */
3773   if (slp_node)
3774     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3775                        NULL, slp_node, reduc_index);
3776   else
3777     {
3778       vec_initial_defs.create (1);
3779      /* For the case of reduction, vect_get_vec_def_for_operand returns
3780         the scalar def before the loop, that defines the initial value
3781         of the reduction variable.  */
3782       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3783                                                       &adjustment_def);
3784       vec_initial_defs.quick_push (vec_initial_def);
3785     }
3786
3787   /* Set phi nodes arguments.  */
3788   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3789     {
3790       tree vec_init_def = vec_initial_defs[i];
3791       tree def = vect_defs[i];
3792       for (j = 0; j < ncopies; j++)
3793         {
3794           /* Set the loop-entry arg of the reduction-phi.  */
3795           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3796                        UNKNOWN_LOCATION);
3797
3798           /* Set the loop-latch arg for the reduction-phi.  */
3799           if (j > 0)
3800             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3801
3802           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3803
3804           if (dump_enabled_p ())
3805             {
3806               dump_printf_loc (MSG_NOTE, vect_location,
3807                                "transform reduction: created def-use cycle: ");
3808               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3809               dump_printf (MSG_NOTE, "\n");
3810               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3811             }
3812
3813           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3814         }
3815     }
3816
3817   vec_initial_defs.release ();
3818
3819   /* 2. Create epilog code.
3820         The reduction epilog code operates across the elements of the vector
3821         of partial results computed by the vectorized loop.
3822         The reduction epilog code consists of:
3823
3824         step 1: compute the scalar result in a vector (v_out2)
3825         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3826         step 3: adjust the scalar result (s_out3) if needed.
3827
3828         Step 1 can be accomplished using one the following three schemes:
3829           (scheme 1) using reduc_code, if available.
3830           (scheme 2) using whole-vector shifts, if available.
3831           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3832                      combined.
3833
3834           The overall epilog code looks like this:
3835
3836           s_out0 = phi <s_loop>         # original EXIT_PHI
3837           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3838           v_out2 = reduce <v_out1>              # step 1
3839           s_out3 = extract_field <v_out2, 0>    # step 2
3840           s_out4 = adjust_result <s_out3>       # step 3
3841
3842           (step 3 is optional, and steps 1 and 2 may be combined).
3843           Lastly, the uses of s_out0 are replaced by s_out4.  */
3844
3845
3846   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3847          v_out1 = phi <VECT_DEF>
3848          Store them in NEW_PHIS.  */
3849
3850   exit_bb = single_exit (loop)->dest;
3851   prev_phi_info = NULL;
3852   new_phis.create (vect_defs.length ());
3853   FOR_EACH_VEC_ELT (vect_defs, i, def)
3854     {
3855       for (j = 0; j < ncopies; j++)
3856         {
3857           tree new_def = copy_ssa_name (def, NULL);
3858           phi = create_phi_node (new_def, exit_bb);
3859           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3860           if (j == 0)
3861             new_phis.quick_push (phi);
3862           else
3863             {
3864               def = vect_get_vec_def_for_stmt_copy (dt, def);
3865               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3866             }
3867
3868           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3869           prev_phi_info = vinfo_for_stmt (phi);
3870         }
3871     }
3872
3873   /* The epilogue is created for the outer-loop, i.e., for the loop being
3874      vectorized.  Create exit phis for the outer loop.  */
3875   if (double_reduc)
3876     {
3877       loop = outer_loop;
3878       exit_bb = single_exit (loop)->dest;
3879       inner_phis.create (vect_defs.length ());
3880       FOR_EACH_VEC_ELT (new_phis, i, phi)
3881         {
3882           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3883           gimple outer_phi = create_phi_node (new_result, exit_bb);
3884           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3885                            PHI_RESULT (phi));
3886           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3887                                                             loop_vinfo, NULL));
3888           inner_phis.quick_push (phi);
3889           new_phis[i] = outer_phi;
3890           prev_phi_info = vinfo_for_stmt (outer_phi);
3891           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3892             {
3893               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3894               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3895               outer_phi = create_phi_node (new_result, exit_bb);
3896               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3897                                PHI_RESULT (phi));
3898               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3899                                                         loop_vinfo, NULL));
3900               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3901               prev_phi_info = vinfo_for_stmt (outer_phi);
3902             }
3903         }
3904     }
3905
3906   exit_gsi = gsi_after_labels (exit_bb);
3907
3908   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3909          (i.e. when reduc_code is not available) and in the final adjustment
3910          code (if needed).  Also get the original scalar reduction variable as
3911          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3912          represents a reduction pattern), the tree-code and scalar-def are
3913          taken from the original stmt that the pattern-stmt (STMT) replaces.
3914          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3915          are taken from STMT.  */
3916
3917   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3918   if (!orig_stmt)
3919     {
3920       /* Regular reduction  */
3921       orig_stmt = stmt;
3922     }
3923   else
3924     {
3925       /* Reduction pattern  */
3926       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3927       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3928       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3929     }
3930
3931   code = gimple_assign_rhs_code (orig_stmt);
3932   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3933      partial results are added and not subtracted.  */
3934   if (code == MINUS_EXPR)
3935     code = PLUS_EXPR;
3936
3937   scalar_dest = gimple_assign_lhs (orig_stmt);
3938   scalar_type = TREE_TYPE (scalar_dest);
3939   scalar_results.create (group_size);
3940   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3941   bitsize = TYPE_SIZE (scalar_type);
3942
3943   /* In case this is a reduction in an inner-loop while vectorizing an outer
3944      loop - we don't need to extract a single scalar result at the end of the
3945      inner-loop (unless it is double reduction, i.e., the use of reduction is
3946      outside the outer-loop).  The final vector of partial results will be used
3947      in the vectorized outer-loop, or reduced to a scalar result at the end of
3948      the outer-loop.  */
3949   if (nested_in_vect_loop && !double_reduc)
3950     goto vect_finalize_reduction;
3951
3952   /* SLP reduction without reduction chain, e.g.,
3953      # a1 = phi <a2, a0>
3954      # b1 = phi <b2, b0>
3955      a2 = operation (a1)
3956      b2 = operation (b1)  */
3957   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3958
3959   /* In case of reduction chain, e.g.,
3960      # a1 = phi <a3, a0>
3961      a2 = operation (a1)
3962      a3 = operation (a2),
3963
3964      we may end up with more than one vector result.  Here we reduce them to
3965      one vector.  */
3966   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3967     {
3968       tree first_vect = PHI_RESULT (new_phis[0]);
3969       tree tmp;
3970       gimple new_vec_stmt = NULL;
3971
3972       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3973       for (k = 1; k < new_phis.length (); k++)
3974         {
3975           gimple next_phi = new_phis[k];
3976           tree second_vect = PHI_RESULT (next_phi);
3977
3978           tmp = build2 (code, vectype,  first_vect, second_vect);
3979           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3980           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3981           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3982           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3983         }
3984
3985       new_phi_result = first_vect;
3986       if (new_vec_stmt)
3987         {
3988           new_phis.truncate (0);
3989           new_phis.safe_push (new_vec_stmt);
3990         }
3991     }
3992   else
3993     new_phi_result = PHI_RESULT (new_phis[0]);
3994
3995   /* 2.3 Create the reduction code, using one of the three schemes described
3996          above. In SLP we simply need to extract all the elements from the
3997          vector (without reducing them), so we use scalar shifts.  */
3998   if (reduc_code != ERROR_MARK && !slp_reduc)
3999     {
4000       tree tmp;
4001
4002       /*** Case 1:  Create:
4003            v_out2 = reduc_expr <v_out1>  */
4004
4005       if (dump_enabled_p ())
4006         dump_printf_loc (MSG_NOTE, vect_location,
4007                          "Reduce using direct vector reduction.");
4008
4009       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4010       tmp = build1 (reduc_code, vectype, new_phi_result);
4011       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4012       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4013       gimple_assign_set_lhs (epilog_stmt, new_temp);
4014       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4015
4016       extract_scalar_result = true;
4017     }
4018   else
4019     {
4020       enum tree_code shift_code = ERROR_MARK;
4021       bool have_whole_vector_shift = true;
4022       int bit_offset;
4023       int element_bitsize = tree_low_cst (bitsize, 1);
4024       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4025       tree vec_temp;
4026
4027       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4028         shift_code = VEC_RSHIFT_EXPR;
4029       else
4030         have_whole_vector_shift = false;
4031
4032       /* Regardless of whether we have a whole vector shift, if we're
4033          emulating the operation via tree-vect-generic, we don't want
4034          to use it.  Only the first round of the reduction is likely
4035          to still be profitable via emulation.  */
4036       /* ??? It might be better to emit a reduction tree code here, so that
4037          tree-vect-generic can expand the first round via bit tricks.  */
4038       if (!VECTOR_MODE_P (mode))
4039         have_whole_vector_shift = false;
4040       else
4041         {
4042           optab optab = optab_for_tree_code (code, vectype, optab_default);
4043           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4044             have_whole_vector_shift = false;
4045         }
4046
4047       if (have_whole_vector_shift && !slp_reduc)
4048         {
4049           /*** Case 2: Create:
4050              for (offset = VS/2; offset >= element_size; offset/=2)
4051                 {
4052                   Create:  va' = vec_shift <va, offset>
4053                   Create:  va = vop <va, va'>
4054                 }  */
4055
4056           if (dump_enabled_p ())
4057             dump_printf_loc (MSG_NOTE, vect_location,
4058                              "Reduce using vector shifts");
4059
4060           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4061           new_temp = new_phi_result;
4062           for (bit_offset = vec_size_in_bits/2;
4063                bit_offset >= element_bitsize;
4064                bit_offset /= 2)
4065             {
4066               tree bitpos = size_int (bit_offset);
4067
4068               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4069                                                vec_dest, new_temp, bitpos);
4070               new_name = make_ssa_name (vec_dest, epilog_stmt);
4071               gimple_assign_set_lhs (epilog_stmt, new_name);
4072               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4073
4074               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4075                                                           new_name, new_temp);
4076               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4077               gimple_assign_set_lhs (epilog_stmt, new_temp);
4078               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079             }
4080
4081           extract_scalar_result = true;
4082         }
4083       else
4084         {
4085           tree rhs;
4086
4087           /*** Case 3: Create:
4088              s = extract_field <v_out2, 0>
4089              for (offset = element_size;
4090                   offset < vector_size;
4091                   offset += element_size;)
4092                {
4093                  Create:  s' = extract_field <v_out2, offset>
4094                  Create:  s = op <s, s'>  // For non SLP cases
4095                }  */
4096
4097           if (dump_enabled_p ())
4098             dump_printf_loc (MSG_NOTE, vect_location,
4099                              "Reduce using scalar code. ");
4100
4101           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4102           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4103             {
4104               if (gimple_code (new_phi) == GIMPLE_PHI)
4105                 vec_temp = PHI_RESULT (new_phi);
4106               else
4107                 vec_temp = gimple_assign_lhs (new_phi);
4108               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4109                             bitsize_zero_node);
4110               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4111               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4112               gimple_assign_set_lhs (epilog_stmt, new_temp);
4113               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4114
4115               /* In SLP we don't need to apply reduction operation, so we just
4116                  collect s' values in SCALAR_RESULTS.  */
4117               if (slp_reduc)
4118                 scalar_results.safe_push (new_temp);
4119
4120               for (bit_offset = element_bitsize;
4121                    bit_offset < vec_size_in_bits;
4122                    bit_offset += element_bitsize)
4123                 {
4124                   tree bitpos = bitsize_int (bit_offset);
4125                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4126                                      bitsize, bitpos);
4127
4128                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4129                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4130                   gimple_assign_set_lhs (epilog_stmt, new_name);
4131                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4132
4133                   if (slp_reduc)
4134                     {
4135                       /* In SLP we don't need to apply reduction operation, so
4136                          we just collect s' values in SCALAR_RESULTS.  */
4137                       new_temp = new_name;
4138                       scalar_results.safe_push (new_name);
4139                     }
4140                   else
4141                     {
4142                       epilog_stmt = gimple_build_assign_with_ops (code,
4143                                           new_scalar_dest, new_name, new_temp);
4144                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4145                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4146                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4147                     }
4148                 }
4149             }
4150
4151           /* The only case where we need to reduce scalar results in SLP, is
4152              unrolling.  If the size of SCALAR_RESULTS is greater than
4153              GROUP_SIZE, we reduce them combining elements modulo
4154              GROUP_SIZE.  */
4155           if (slp_reduc)
4156             {
4157               tree res, first_res, new_res;
4158               gimple new_stmt;
4159
4160               /* Reduce multiple scalar results in case of SLP unrolling.  */
4161               for (j = group_size; scalar_results.iterate (j, &res);
4162                    j++)
4163                 {
4164                   first_res = scalar_results[j % group_size];
4165                   new_stmt = gimple_build_assign_with_ops (code,
4166                                               new_scalar_dest, first_res, res);
4167                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4168                   gimple_assign_set_lhs (new_stmt, new_res);
4169                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4170                   scalar_results[j % group_size] = new_res;
4171                 }
4172             }
4173           else
4174             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4175             scalar_results.safe_push (new_temp);
4176
4177           extract_scalar_result = false;
4178         }
4179     }
4180
4181   /* 2.4  Extract the final scalar result.  Create:
4182           s_out3 = extract_field <v_out2, bitpos>  */
4183
4184   if (extract_scalar_result)
4185     {
4186       tree rhs;
4187
4188       if (dump_enabled_p ())
4189         dump_printf_loc (MSG_NOTE, vect_location,
4190                          "extract scalar result");
4191
4192       if (BYTES_BIG_ENDIAN)
4193         bitpos = size_binop (MULT_EXPR,
4194                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4195                              TYPE_SIZE (scalar_type));
4196       else
4197         bitpos = bitsize_zero_node;
4198
4199       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4200       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4201       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4202       gimple_assign_set_lhs (epilog_stmt, new_temp);
4203       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4204       scalar_results.safe_push (new_temp);
4205     }
4206
4207 vect_finalize_reduction:
4208
4209   if (double_reduc)
4210     loop = loop->inner;
4211
4212   /* 2.5 Adjust the final result by the initial value of the reduction
4213          variable. (When such adjustment is not needed, then
4214          'adjustment_def' is zero).  For example, if code is PLUS we create:
4215          new_temp = loop_exit_def + adjustment_def  */
4216
4217   if (adjustment_def)
4218     {
4219       gcc_assert (!slp_reduc);
4220       if (nested_in_vect_loop)
4221         {
4222           new_phi = new_phis[0];
4223           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4224           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4225           new_dest = vect_create_destination_var (scalar_dest, vectype);
4226         }
4227       else
4228         {
4229           new_temp = scalar_results[0];
4230           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4231           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4232           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4233         }
4234
4235       epilog_stmt = gimple_build_assign (new_dest, expr);
4236       new_temp = make_ssa_name (new_dest, epilog_stmt);
4237       gimple_assign_set_lhs (epilog_stmt, new_temp);
4238       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4239       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4240       if (nested_in_vect_loop)
4241         {
4242           set_vinfo_for_stmt (epilog_stmt,
4243                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4244                                                  NULL));
4245           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4246                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4247
4248           if (!double_reduc)
4249             scalar_results.quick_push (new_temp);
4250           else
4251             scalar_results[0] = new_temp;
4252         }
4253       else
4254         scalar_results[0] = new_temp;
4255
4256       new_phis[0] = epilog_stmt;
4257     }
4258
4259   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4260           phis with new adjusted scalar results, i.e., replace use <s_out0>
4261           with use <s_out4>.
4262
4263      Transform:
4264         loop_exit:
4265           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4266           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4267           v_out2 = reduce <v_out1>
4268           s_out3 = extract_field <v_out2, 0>
4269           s_out4 = adjust_result <s_out3>
4270           use <s_out0>
4271           use <s_out0>
4272
4273      into:
4274
4275         loop_exit:
4276           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4277           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4278           v_out2 = reduce <v_out1>
4279           s_out3 = extract_field <v_out2, 0>
4280           s_out4 = adjust_result <s_out3>
4281           use <s_out4>
4282           use <s_out4> */
4283
4284
4285   /* In SLP reduction chain we reduce vector results into one vector if
4286      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4287      the last stmt in the reduction chain, since we are looking for the loop
4288      exit phi node.  */
4289   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4290     {
4291       scalar_dest = gimple_assign_lhs (
4292                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4293       group_size = 1;
4294     }
4295
4296   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4297      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4298      need to match SCALAR_RESULTS with corresponding statements.  The first
4299      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4300      the first vector stmt, etc.
4301      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4302   if (group_size > new_phis.length ())
4303     {
4304       ratio = group_size / new_phis.length ();
4305       gcc_assert (!(group_size % new_phis.length ()));
4306     }
4307   else
4308     ratio = 1;
4309
4310   for (k = 0; k < group_size; k++)
4311     {
4312       if (k % ratio == 0)
4313         {
4314           epilog_stmt = new_phis[k / ratio];
4315           reduction_phi = reduction_phis[k / ratio];
4316           if (double_reduc)
4317             inner_phi = inner_phis[k / ratio];
4318         }
4319
4320       if (slp_reduc)
4321         {
4322           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4323
4324           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4325           /* SLP statements can't participate in patterns.  */
4326           gcc_assert (!orig_stmt);
4327           scalar_dest = gimple_assign_lhs (current_stmt);
4328         }
4329
4330       phis.create (3);
4331       /* Find the loop-closed-use at the loop exit of the original scalar
4332          result.  (The reduction result is expected to have two immediate uses -
4333          one at the latch block, and one at the loop exit).  */
4334       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4335         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4336           phis.safe_push (USE_STMT (use_p));
4337
4338       /* We expect to have found an exit_phi because of loop-closed-ssa
4339          form.  */
4340       gcc_assert (!phis.is_empty ());
4341
4342       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4343         {
4344           if (outer_loop)
4345             {
4346               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4347               gimple vect_phi;
4348
4349               /* FORNOW. Currently not supporting the case that an inner-loop
4350                  reduction is not used in the outer-loop (but only outside the
4351                  outer-loop), unless it is double reduction.  */
4352               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4353                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4354                           || double_reduc);
4355
4356               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4357               if (!double_reduc
4358                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4359                       != vect_double_reduction_def)
4360                 continue;
4361
4362               /* Handle double reduction:
4363
4364                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4365                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4366                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4367                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4368
4369                  At that point the regular reduction (stmt2 and stmt3) is
4370                  already vectorized, as well as the exit phi node, stmt4.
4371                  Here we vectorize the phi node of double reduction, stmt1, and
4372                  update all relevant statements.  */
4373
4374               /* Go through all the uses of s2 to find double reduction phi
4375                  node, i.e., stmt1 above.  */
4376               orig_name = PHI_RESULT (exit_phi);
4377               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4378                 {
4379                   stmt_vec_info use_stmt_vinfo;
4380                   stmt_vec_info new_phi_vinfo;
4381                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4382                   basic_block bb = gimple_bb (use_stmt);
4383                   gimple use;
4384
4385                   /* Check that USE_STMT is really double reduction phi
4386                      node.  */
4387                   if (gimple_code (use_stmt) != GIMPLE_PHI
4388                       || gimple_phi_num_args (use_stmt) != 2
4389                       || bb->loop_father != outer_loop)
4390                     continue;
4391                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4392                   if (!use_stmt_vinfo
4393                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4394                           != vect_double_reduction_def)
4395                     continue;
4396
4397                   /* Create vector phi node for double reduction:
4398                      vs1 = phi <vs0, vs2>
4399                      vs1 was created previously in this function by a call to
4400                        vect_get_vec_def_for_operand and is stored in
4401                        vec_initial_def;
4402                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4403                      vs0 is created here.  */
4404
4405                   /* Create vector phi node.  */
4406                   vect_phi = create_phi_node (vec_initial_def, bb);
4407                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4408                                     loop_vec_info_for_loop (outer_loop), NULL);
4409                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4410
4411                   /* Create vs0 - initial def of the double reduction phi.  */
4412                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4413                                              loop_preheader_edge (outer_loop));
4414                   init_def = get_initial_def_for_reduction (stmt,
4415                                                           preheader_arg, NULL);
4416                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4417                                                     vectype, NULL);
4418
4419                   /* Update phi node arguments with vs0 and vs2.  */
4420                   add_phi_arg (vect_phi, vect_phi_init,
4421                                loop_preheader_edge (outer_loop),
4422                                UNKNOWN_LOCATION);
4423                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4424                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4425                   if (dump_enabled_p ())
4426                     {
4427                       dump_printf_loc (MSG_NOTE, vect_location,
4428                                        "created double reduction phi node: ");
4429                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4430                     }
4431
4432                   vect_phi_res = PHI_RESULT (vect_phi);
4433
4434                   /* Replace the use, i.e., set the correct vs1 in the regular
4435                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4436                      loop is redundant.  */
4437                   use = reduction_phi;
4438                   for (j = 0; j < ncopies; j++)
4439                     {
4440                       edge pr_edge = loop_preheader_edge (loop);
4441                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4442                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4443                     }
4444                 }
4445             }
4446         }
4447
4448       phis.release ();
4449       if (nested_in_vect_loop)
4450         {
4451           if (double_reduc)
4452             loop = outer_loop;
4453           else
4454             continue;
4455         }
4456
4457       phis.create (3);
4458       /* Find the loop-closed-use at the loop exit of the original scalar
4459          result.  (The reduction result is expected to have two immediate uses,
4460          one at the latch block, and one at the loop exit).  For double
4461          reductions we are looking for exit phis of the outer loop.  */
4462       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4463         {
4464           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4465             phis.safe_push (USE_STMT (use_p));
4466           else
4467             {
4468               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4469                 {
4470                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4471
4472                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4473                     {
4474                       if (!flow_bb_inside_loop_p (loop,
4475                                              gimple_bb (USE_STMT (phi_use_p))))
4476                         phis.safe_push (USE_STMT (phi_use_p));
4477                     }
4478                 }
4479             }
4480         }
4481
4482       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4483         {
4484           /* Replace the uses:  */
4485           orig_name = PHI_RESULT (exit_phi);
4486           scalar_result = scalar_results[k];
4487           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4488             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4489               SET_USE (use_p, scalar_result);
4490         }
4491
4492       phis.release ();
4493     }
4494
4495   scalar_results.release ();
4496   inner_phis.release ();
4497   new_phis.release ();
4498 }
4499
4500
4501 /* Function vectorizable_reduction.
4502
4503    Check if STMT performs a reduction operation that can be vectorized.
4504    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4505    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4506    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4507
4508    This function also handles reduction idioms (patterns) that have been
4509    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4510    of this form:
4511      X = pattern_expr (arg0, arg1, ..., X)
4512    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4513    sequence that had been detected and replaced by the pattern-stmt (STMT).
4514
4515    In some cases of reduction patterns, the type of the reduction variable X is
4516    different than the type of the other arguments of STMT.
4517    In such cases, the vectype that is used when transforming STMT into a vector
4518    stmt is different than the vectype that is used to determine the
4519    vectorization factor, because it consists of a different number of elements
4520    than the actual number of elements that are being operated upon in parallel.
4521
4522    For example, consider an accumulation of shorts into an int accumulator.
4523    On some targets it's possible to vectorize this pattern operating on 8
4524    shorts at a time (hence, the vectype for purposes of determining the
4525    vectorization factor should be V8HI); on the other hand, the vectype that
4526    is used to create the vector form is actually V4SI (the type of the result).
4527
4528    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4529    indicates what is the actual level of parallelism (V8HI in the example), so
4530    that the right vectorization factor would be derived.  This vectype
4531    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4532    be used to create the vectorized stmt.  The right vectype for the vectorized
4533    stmt is obtained from the type of the result X:
4534         get_vectype_for_scalar_type (TREE_TYPE (X))
4535
4536    This means that, contrary to "regular" reductions (or "regular" stmts in
4537    general), the following equation:
4538       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4539    does *NOT* necessarily hold for reduction patterns.  */
4540
4541 bool
4542 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4543                         gimple *vec_stmt, slp_tree slp_node)
4544 {
4545   tree vec_dest;
4546   tree scalar_dest;
4547   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4548   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4549   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4550   tree vectype_in = NULL_TREE;
4551   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4553   enum tree_code code, orig_code, epilog_reduc_code;
4554   enum machine_mode vec_mode;
4555   int op_type;
4556   optab optab, reduc_optab;
4557   tree new_temp = NULL_TREE;
4558   tree def;
4559   gimple def_stmt;
4560   enum vect_def_type dt;
4561   gimple new_phi = NULL;
4562   tree scalar_type;
4563   bool is_simple_use;
4564   gimple orig_stmt;
4565   stmt_vec_info orig_stmt_info;
4566   tree expr = NULL_TREE;
4567   int i;
4568   int ncopies;
4569   int epilog_copies;
4570   stmt_vec_info prev_stmt_info, prev_phi_info;
4571   bool single_defuse_cycle = false;
4572   tree reduc_def = NULL_TREE;
4573   gimple new_stmt = NULL;
4574   int j;
4575   tree ops[3];
4576   bool nested_cycle = false, found_nested_cycle_def = false;
4577   gimple reduc_def_stmt = NULL;
4578   /* The default is that the reduction variable is the last in statement.  */
4579   int reduc_index = 2;
4580   bool double_reduc = false, dummy;
4581   basic_block def_bb;
4582   struct loop * def_stmt_loop, *outer_loop = NULL;
4583   tree def_arg;
4584   gimple def_arg_stmt;
4585   vec<tree> vec_oprnds0 = vNULL;
4586   vec<tree> vec_oprnds1 = vNULL;
4587   vec<tree> vect_defs = vNULL;
4588   vec<gimple> phis = vNULL;
4589   int vec_num;
4590   tree def0, def1, tem, op0, op1 = NULL_TREE;
4591
4592   /* In case of reduction chain we switch to the first stmt in the chain, but
4593      we don't update STMT_INFO, since only the last stmt is marked as reduction
4594      and has reduction properties.  */
4595   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4596     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4597
4598   if (nested_in_vect_loop_p (loop, stmt))
4599     {
4600       outer_loop = loop;
4601       loop = loop->inner;
4602       nested_cycle = true;
4603     }
4604
4605   /* 1. Is vectorizable reduction?  */
4606   /* Not supportable if the reduction variable is used in the loop, unless
4607      it's a reduction chain.  */
4608   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4609       && !GROUP_FIRST_ELEMENT (stmt_info))
4610     return false;
4611
4612   /* Reductions that are not used even in an enclosing outer-loop,
4613      are expected to be "live" (used out of the loop).  */
4614   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4615       && !STMT_VINFO_LIVE_P (stmt_info))
4616     return false;
4617
4618   /* Make sure it was already recognized as a reduction computation.  */
4619   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4620       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4621     return false;
4622
4623   /* 2. Has this been recognized as a reduction pattern?
4624
4625      Check if STMT represents a pattern that has been recognized
4626      in earlier analysis stages.  For stmts that represent a pattern,
4627      the STMT_VINFO_RELATED_STMT field records the last stmt in
4628      the original sequence that constitutes the pattern.  */
4629
4630   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4631   if (orig_stmt)
4632     {
4633       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4634       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4635       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4636     }
4637
4638   /* 3. Check the operands of the operation.  The first operands are defined
4639         inside the loop body. The last operand is the reduction variable,
4640         which is defined by the loop-header-phi.  */
4641
4642   gcc_assert (is_gimple_assign (stmt));
4643
4644   /* Flatten RHS.  */
4645   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4646     {
4647     case GIMPLE_SINGLE_RHS:
4648       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4649       if (op_type == ternary_op)
4650         {
4651           tree rhs = gimple_assign_rhs1 (stmt);
4652           ops[0] = TREE_OPERAND (rhs, 0);
4653           ops[1] = TREE_OPERAND (rhs, 1);
4654           ops[2] = TREE_OPERAND (rhs, 2);
4655           code = TREE_CODE (rhs);
4656         }
4657       else
4658         return false;
4659       break;
4660
4661     case GIMPLE_BINARY_RHS:
4662       code = gimple_assign_rhs_code (stmt);
4663       op_type = TREE_CODE_LENGTH (code);
4664       gcc_assert (op_type == binary_op);
4665       ops[0] = gimple_assign_rhs1 (stmt);
4666       ops[1] = gimple_assign_rhs2 (stmt);
4667       break;
4668
4669     case GIMPLE_TERNARY_RHS:
4670       code = gimple_assign_rhs_code (stmt);
4671       op_type = TREE_CODE_LENGTH (code);
4672       gcc_assert (op_type == ternary_op);
4673       ops[0] = gimple_assign_rhs1 (stmt);
4674       ops[1] = gimple_assign_rhs2 (stmt);
4675       ops[2] = gimple_assign_rhs3 (stmt);
4676       break;
4677
4678     case GIMPLE_UNARY_RHS:
4679       return false;
4680
4681     default:
4682       gcc_unreachable ();
4683     }
4684
4685   if (code == COND_EXPR && slp_node)
4686     return false;
4687
4688   scalar_dest = gimple_assign_lhs (stmt);
4689   scalar_type = TREE_TYPE (scalar_dest);
4690   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4691       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4692     return false;
4693
4694   /* Do not try to vectorize bit-precision reductions.  */
4695   if ((TYPE_PRECISION (scalar_type)
4696        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4697     return false;
4698
4699   /* All uses but the last are expected to be defined in the loop.
4700      The last use is the reduction variable.  In case of nested cycle this
4701      assumption is not true: we use reduc_index to record the index of the
4702      reduction variable.  */
4703   for (i = 0; i < op_type - 1; i++)
4704     {
4705       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4706       if (i == 0 && code == COND_EXPR)
4707         continue;
4708
4709       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4710                                             &def_stmt, &def, &dt, &tem);
4711       if (!vectype_in)
4712         vectype_in = tem;
4713       gcc_assert (is_simple_use);
4714
4715       if (dt != vect_internal_def
4716           && dt != vect_external_def
4717           && dt != vect_constant_def
4718           && dt != vect_induction_def
4719           && !(dt == vect_nested_cycle && nested_cycle))
4720         return false;
4721
4722       if (dt == vect_nested_cycle)
4723         {
4724           found_nested_cycle_def = true;
4725           reduc_def_stmt = def_stmt;
4726           reduc_index = i;
4727         }
4728     }
4729
4730   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4731                                         &def_stmt, &def, &dt, &tem);
4732   if (!vectype_in)
4733     vectype_in = tem;
4734   gcc_assert (is_simple_use);
4735   if (!(dt == vect_reduction_def
4736         || dt == vect_nested_cycle
4737         || ((dt == vect_internal_def || dt == vect_external_def
4738              || dt == vect_constant_def || dt == vect_induction_def)
4739             && nested_cycle && found_nested_cycle_def)))
4740     {
4741       /* For pattern recognized stmts, orig_stmt might be a reduction,
4742          but some helper statements for the pattern might not, or
4743          might be COND_EXPRs with reduction uses in the condition.  */
4744       gcc_assert (orig_stmt);
4745       return false;
4746     }
4747   if (!found_nested_cycle_def)
4748     reduc_def_stmt = def_stmt;
4749
4750   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4751   if (orig_stmt)
4752     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4753                                                        reduc_def_stmt,
4754                                                        !nested_cycle,
4755                                                        &dummy));
4756   else
4757     {
4758       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4759                                              !nested_cycle, &dummy);
4760       /* We changed STMT to be the first stmt in reduction chain, hence we
4761          check that in this case the first element in the chain is STMT.  */
4762       gcc_assert (stmt == tmp
4763                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4764     }
4765
4766   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4767     return false;
4768
4769   if (slp_node || PURE_SLP_STMT (stmt_info))
4770     ncopies = 1;
4771   else
4772     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4773                / TYPE_VECTOR_SUBPARTS (vectype_in));
4774
4775   gcc_assert (ncopies >= 1);
4776
4777   vec_mode = TYPE_MODE (vectype_in);
4778
4779   if (code == COND_EXPR)
4780     {
4781       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4782         {
4783           if (dump_enabled_p ())
4784             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4785                              "unsupported condition in reduction");
4786
4787             return false;
4788         }
4789     }
4790   else
4791     {
4792       /* 4. Supportable by target?  */
4793
4794       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4795           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4796         {
4797           /* Shifts and rotates are only supported by vectorizable_shifts,
4798              not vectorizable_reduction.  */
4799           if (dump_enabled_p ())
4800             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4801                              "unsupported shift or rotation.");
4802           return false;
4803         }
4804
4805       /* 4.1. check support for the operation in the loop  */
4806       optab = optab_for_tree_code (code, vectype_in, optab_default);
4807       if (!optab)
4808         {
4809           if (dump_enabled_p ())
4810             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4811                              "no optab.");
4812
4813           return false;
4814         }
4815
4816       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4817         {
4818           if (dump_enabled_p ())
4819             dump_printf (MSG_NOTE, "op not supported by target.");
4820
4821           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4822               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4823                   < vect_min_worthwhile_factor (code))
4824             return false;
4825
4826           if (dump_enabled_p ())
4827             dump_printf (MSG_NOTE, "proceeding using word mode.");
4828         }
4829
4830       /* Worthwhile without SIMD support?  */
4831       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4832           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4833              < vect_min_worthwhile_factor (code))
4834         {
4835           if (dump_enabled_p ())
4836             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4837                              "not worthwhile without SIMD support.");
4838
4839           return false;
4840         }
4841     }
4842
4843   /* 4.2. Check support for the epilog operation.
4844
4845           If STMT represents a reduction pattern, then the type of the
4846           reduction variable may be different than the type of the rest
4847           of the arguments.  For example, consider the case of accumulation
4848           of shorts into an int accumulator; The original code:
4849                         S1: int_a = (int) short_a;
4850           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4851
4852           was replaced with:
4853                         STMT: int_acc = widen_sum <short_a, int_acc>
4854
4855           This means that:
4856           1. The tree-code that is used to create the vector operation in the
4857              epilog code (that reduces the partial results) is not the
4858              tree-code of STMT, but is rather the tree-code of the original
4859              stmt from the pattern that STMT is replacing.  I.e, in the example
4860              above we want to use 'widen_sum' in the loop, but 'plus' in the
4861              epilog.
4862           2. The type (mode) we use to check available target support
4863              for the vector operation to be created in the *epilog*, is
4864              determined by the type of the reduction variable (in the example
4865              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4866              However the type (mode) we use to check available target support
4867              for the vector operation to be created *inside the loop*, is
4868              determined by the type of the other arguments to STMT (in the
4869              example we'd check this: optab_handler (widen_sum_optab,
4870              vect_short_mode)).
4871
4872           This is contrary to "regular" reductions, in which the types of all
4873           the arguments are the same as the type of the reduction variable.
4874           For "regular" reductions we can therefore use the same vector type
4875           (and also the same tree-code) when generating the epilog code and
4876           when generating the code inside the loop.  */
4877
4878   if (orig_stmt)
4879     {
4880       /* This is a reduction pattern: get the vectype from the type of the
4881          reduction variable, and get the tree-code from orig_stmt.  */
4882       orig_code = gimple_assign_rhs_code (orig_stmt);
4883       gcc_assert (vectype_out);
4884       vec_mode = TYPE_MODE (vectype_out);
4885     }
4886   else
4887     {
4888       /* Regular reduction: use the same vectype and tree-code as used for
4889          the vector code inside the loop can be used for the epilog code. */
4890       orig_code = code;
4891     }
4892
4893   if (nested_cycle)
4894     {
4895       def_bb = gimple_bb (reduc_def_stmt);
4896       def_stmt_loop = def_bb->loop_father;
4897       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4898                                        loop_preheader_edge (def_stmt_loop));
4899       if (TREE_CODE (def_arg) == SSA_NAME
4900           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4901           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4902           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4903           && vinfo_for_stmt (def_arg_stmt)
4904           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4905               == vect_double_reduction_def)
4906         double_reduc = true;
4907     }
4908
4909   epilog_reduc_code = ERROR_MARK;
4910   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4911     {
4912       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4913                                          optab_default);
4914       if (!reduc_optab)
4915         {
4916           if (dump_enabled_p ())
4917             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4918                              "no optab for reduction.");
4919
4920           epilog_reduc_code = ERROR_MARK;
4921         }
4922
4923       if (reduc_optab
4924           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4925         {
4926           if (dump_enabled_p ())
4927             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4928                              "reduc op not supported by target.");
4929
4930           epilog_reduc_code = ERROR_MARK;
4931         }
4932     }
4933   else
4934     {
4935       if (!nested_cycle || double_reduc)
4936         {
4937           if (dump_enabled_p ())
4938             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4939                              "no reduc code for scalar code.");
4940
4941           return false;
4942         }
4943     }
4944
4945   if (double_reduc && ncopies > 1)
4946     {
4947       if (dump_enabled_p ())
4948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4949                          "multiple types in double reduction");
4950
4951       return false;
4952     }
4953
4954   /* In case of widenning multiplication by a constant, we update the type
4955      of the constant to be the type of the other operand.  We check that the
4956      constant fits the type in the pattern recognition pass.  */
4957   if (code == DOT_PROD_EXPR
4958       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4959     {
4960       if (TREE_CODE (ops[0]) == INTEGER_CST)
4961         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4962       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4963         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4964       else
4965         {
4966           if (dump_enabled_p ())
4967             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4968                              "invalid types in dot-prod");
4969
4970           return false;
4971         }
4972     }
4973
4974   if (!vec_stmt) /* transformation not required.  */
4975     {
4976       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4977         return false;
4978       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4979       return true;
4980     }
4981
4982   /** Transform.  **/
4983
4984   if (dump_enabled_p ())
4985     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4986
4987   /* FORNOW: Multiple types are not supported for condition.  */
4988   if (code == COND_EXPR)
4989     gcc_assert (ncopies == 1);
4990
4991   /* Create the destination vector  */
4992   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4993
4994   /* In case the vectorization factor (VF) is bigger than the number
4995      of elements that we can fit in a vectype (nunits), we have to generate
4996      more than one vector stmt - i.e - we need to "unroll" the
4997      vector stmt by a factor VF/nunits.  For more details see documentation
4998      in vectorizable_operation.  */
4999
5000   /* If the reduction is used in an outer loop we need to generate
5001      VF intermediate results, like so (e.g. for ncopies=2):
5002         r0 = phi (init, r0)
5003         r1 = phi (init, r1)
5004         r0 = x0 + r0;
5005         r1 = x1 + r1;
5006     (i.e. we generate VF results in 2 registers).
5007     In this case we have a separate def-use cycle for each copy, and therefore
5008     for each copy we get the vector def for the reduction variable from the
5009     respective phi node created for this copy.
5010
5011     Otherwise (the reduction is unused in the loop nest), we can combine
5012     together intermediate results, like so (e.g. for ncopies=2):
5013         r = phi (init, r)
5014         r = x0 + r;
5015         r = x1 + r;
5016    (i.e. we generate VF/2 results in a single register).
5017    In this case for each copy we get the vector def for the reduction variable
5018    from the vectorized reduction operation generated in the previous iteration.
5019   */
5020
5021   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5022     {
5023       single_defuse_cycle = true;
5024       epilog_copies = 1;
5025     }
5026   else
5027     epilog_copies = ncopies;
5028
5029   prev_stmt_info = NULL;
5030   prev_phi_info = NULL;
5031   if (slp_node)
5032     {
5033       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5034       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5035                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5036     }
5037   else
5038     {
5039       vec_num = 1;
5040       vec_oprnds0.create (1);
5041       if (op_type == ternary_op)
5042         vec_oprnds1.create (1);
5043     }
5044
5045   phis.create (vec_num);
5046   vect_defs.create (vec_num);
5047   if (!slp_node)
5048     vect_defs.quick_push (NULL_TREE);
5049
5050   for (j = 0; j < ncopies; j++)
5051     {
5052       if (j == 0 || !single_defuse_cycle)
5053         {
5054           for (i = 0; i < vec_num; i++)
5055             {
5056               /* Create the reduction-phi that defines the reduction
5057                  operand.  */
5058               new_phi = create_phi_node (vec_dest, loop->header);
5059               set_vinfo_for_stmt (new_phi,
5060                                   new_stmt_vec_info (new_phi, loop_vinfo,
5061                                                      NULL));
5062                if (j == 0 || slp_node)
5063                  phis.quick_push (new_phi);
5064             }
5065         }
5066
5067       if (code == COND_EXPR)
5068         {
5069           gcc_assert (!slp_node);
5070           vectorizable_condition (stmt, gsi, vec_stmt,
5071                                   PHI_RESULT (phis[0]),
5072                                   reduc_index, NULL);
5073           /* Multiple types are not supported for condition.  */
5074           break;
5075         }
5076
5077       /* Handle uses.  */
5078       if (j == 0)
5079         {
5080           op0 = ops[!reduc_index];
5081           if (op_type == ternary_op)
5082             {
5083               if (reduc_index == 0)
5084                 op1 = ops[2];
5085               else
5086                 op1 = ops[1];
5087             }
5088
5089           if (slp_node)
5090             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5091                                slp_node, -1);
5092           else
5093             {
5094               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5095                                                             stmt, NULL);
5096               vec_oprnds0.quick_push (loop_vec_def0);
5097               if (op_type == ternary_op)
5098                {
5099                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5100                                                                NULL);
5101                  vec_oprnds1.quick_push (loop_vec_def1);
5102                }
5103             }
5104         }
5105       else
5106         {
5107           if (!slp_node)
5108             {
5109               enum vect_def_type dt;
5110               gimple dummy_stmt;
5111               tree dummy;
5112
5113               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5114                                   &dummy_stmt, &dummy, &dt);
5115               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5116                                                               loop_vec_def0);
5117               vec_oprnds0[0] = loop_vec_def0;
5118               if (op_type == ternary_op)
5119                 {
5120                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5121                                       &dummy, &dt);
5122                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5123                                                                 loop_vec_def1);
5124                   vec_oprnds1[0] = loop_vec_def1;
5125                 }
5126             }
5127
5128           if (single_defuse_cycle)
5129             reduc_def = gimple_assign_lhs (new_stmt);
5130
5131           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5132         }
5133
5134       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5135         {
5136           if (slp_node)
5137             reduc_def = PHI_RESULT (phis[i]);
5138           else
5139             {
5140               if (!single_defuse_cycle || j == 0)
5141                 reduc_def = PHI_RESULT (new_phi);
5142             }
5143
5144           def1 = ((op_type == ternary_op)
5145                   ? vec_oprnds1[i] : NULL);
5146           if (op_type == binary_op)
5147             {
5148               if (reduc_index == 0)
5149                 expr = build2 (code, vectype_out, reduc_def, def0);
5150               else
5151                 expr = build2 (code, vectype_out, def0, reduc_def);
5152             }
5153           else
5154             {
5155               if (reduc_index == 0)
5156                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5157               else
5158                 {
5159                   if (reduc_index == 1)
5160                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5161                   else
5162                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5163                 }
5164             }
5165
5166           new_stmt = gimple_build_assign (vec_dest, expr);
5167           new_temp = make_ssa_name (vec_dest, new_stmt);
5168           gimple_assign_set_lhs (new_stmt, new_temp);
5169           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5170
5171           if (slp_node)
5172             {
5173               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5174               vect_defs.quick_push (new_temp);
5175             }
5176           else
5177             vect_defs[0] = new_temp;
5178         }
5179
5180       if (slp_node)
5181         continue;
5182
5183       if (j == 0)
5184         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5185       else
5186         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5187
5188       prev_stmt_info = vinfo_for_stmt (new_stmt);
5189       prev_phi_info = vinfo_for_stmt (new_phi);
5190     }
5191
5192   /* Finalize the reduction-phi (set its arguments) and create the
5193      epilog reduction code.  */
5194   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5195     {
5196       new_temp = gimple_assign_lhs (*vec_stmt);
5197       vect_defs[0] = new_temp;
5198     }
5199
5200   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5201                                     epilog_reduc_code, phis, reduc_index,
5202                                     double_reduc, slp_node);
5203
5204   phis.release ();
5205   vect_defs.release ();
5206   vec_oprnds0.release ();
5207   vec_oprnds1.release ();
5208
5209   return true;
5210 }
5211
5212 /* Function vect_min_worthwhile_factor.
5213
5214    For a loop where we could vectorize the operation indicated by CODE,
5215    return the minimum vectorization factor that makes it worthwhile
5216    to use generic vectors.  */
5217 int
5218 vect_min_worthwhile_factor (enum tree_code code)
5219 {
5220   switch (code)
5221     {
5222     case PLUS_EXPR:
5223     case MINUS_EXPR:
5224     case NEGATE_EXPR:
5225       return 4;
5226
5227     case BIT_AND_EXPR:
5228     case BIT_IOR_EXPR:
5229     case BIT_XOR_EXPR:
5230     case BIT_NOT_EXPR:
5231       return 2;
5232
5233     default:
5234       return INT_MAX;
5235     }
5236 }
5237
5238
5239 /* Function vectorizable_induction
5240
5241    Check if PHI performs an induction computation that can be vectorized.
5242    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5243    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5244    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5245
5246 bool
5247 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5248                         gimple *vec_stmt)
5249 {
5250   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5251   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5252   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5253   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5254   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5255   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5256   tree vec_def;
5257
5258   gcc_assert (ncopies >= 1);
5259   /* FORNOW. These restrictions should be relaxed.  */
5260   if (nested_in_vect_loop_p (loop, phi))
5261     {
5262       imm_use_iterator imm_iter;
5263       use_operand_p use_p;
5264       gimple exit_phi;
5265       edge latch_e;
5266       tree loop_arg;
5267
5268       if (ncopies > 1)
5269         {
5270           if (dump_enabled_p ())
5271             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5272                              "multiple types in nested loop.");
5273           return false;
5274         }
5275
5276       exit_phi = NULL;
5277       latch_e = loop_latch_edge (loop->inner);
5278       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5279       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5280         {
5281           if (!flow_bb_inside_loop_p (loop->inner,
5282                                       gimple_bb (USE_STMT (use_p))))
5283             {
5284               exit_phi = USE_STMT (use_p);
5285               break;
5286             }
5287         }
5288       if (exit_phi)
5289         {
5290           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5291           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5292                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5293             {
5294               if (dump_enabled_p ())
5295                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5296                                  "inner-loop induction only used outside "
5297                                  "of the outer vectorized loop.");
5298               return false;
5299             }
5300         }
5301     }
5302
5303   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5304     return false;
5305
5306   /* FORNOW: SLP not supported.  */
5307   if (STMT_SLP_TYPE (stmt_info))
5308     return false;
5309
5310   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5311
5312   if (gimple_code (phi) != GIMPLE_PHI)
5313     return false;
5314
5315   if (!vec_stmt) /* transformation not required.  */
5316     {
5317       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5318       if (dump_enabled_p ())
5319         dump_printf_loc (MSG_NOTE, vect_location,
5320                          "=== vectorizable_induction ===");
5321       vect_model_induction_cost (stmt_info, ncopies);
5322       return true;
5323     }
5324
5325   /** Transform.  **/
5326
5327   if (dump_enabled_p ())
5328     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5329
5330   vec_def = get_initial_def_for_induction (phi);
5331   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5332   return true;
5333 }
5334
5335 /* Function vectorizable_live_operation.
5336
5337    STMT computes a value that is used outside the loop.  Check if
5338    it can be supported.  */
5339
5340 bool
5341 vectorizable_live_operation (gimple stmt,
5342                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5343                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5344 {
5345   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5346   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5347   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5348   int i;
5349   int op_type;
5350   tree op;
5351   tree def;
5352   gimple def_stmt;
5353   enum vect_def_type dt;
5354   enum tree_code code;
5355   enum gimple_rhs_class rhs_class;
5356
5357   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5358
5359   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5360     return false;
5361
5362   if (!is_gimple_assign (stmt))
5363     return false;
5364
5365   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5366     return false;
5367
5368   /* FORNOW. CHECKME. */
5369   if (nested_in_vect_loop_p (loop, stmt))
5370     return false;
5371
5372   code = gimple_assign_rhs_code (stmt);
5373   op_type = TREE_CODE_LENGTH (code);
5374   rhs_class = get_gimple_rhs_class (code);
5375   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5376   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5377
5378   /* FORNOW: support only if all uses are invariant.  This means
5379      that the scalar operations can remain in place, unvectorized.
5380      The original last scalar value that they compute will be used.  */
5381
5382   for (i = 0; i < op_type; i++)
5383     {
5384       if (rhs_class == GIMPLE_SINGLE_RHS)
5385         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5386       else
5387         op = gimple_op (stmt, i + 1);
5388       if (op
5389           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5390                                   &dt))
5391         {
5392           if (dump_enabled_p ())
5393             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5394                              "use not simple.");
5395           return false;
5396         }
5397
5398       if (dt != vect_external_def && dt != vect_constant_def)
5399         return false;
5400     }
5401
5402   /* No transformation is required for the cases we currently support.  */
5403   return true;
5404 }
5405
5406 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5407
5408 static void
5409 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5410 {
5411   ssa_op_iter op_iter;
5412   imm_use_iterator imm_iter;
5413   def_operand_p def_p;
5414   gimple ustmt;
5415
5416   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5417     {
5418       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5419         {
5420           basic_block bb;
5421
5422           if (!is_gimple_debug (ustmt))
5423             continue;
5424
5425           bb = gimple_bb (ustmt);
5426
5427           if (!flow_bb_inside_loop_p (loop, bb))
5428             {
5429               if (gimple_debug_bind_p (ustmt))
5430                 {
5431                   if (dump_enabled_p ())
5432                     dump_printf_loc (MSG_NOTE, vect_location,
5433                                      "killing debug use");
5434
5435                   gimple_debug_bind_reset_value (ustmt);
5436                   update_stmt (ustmt);
5437                 }
5438               else
5439                 gcc_unreachable ();
5440             }
5441         }
5442     }
5443 }
5444
5445 /* Function vect_transform_loop.
5446
5447    The analysis phase has determined that the loop is vectorizable.
5448    Vectorize the loop - created vectorized stmts to replace the scalar
5449    stmts in the loop, and update the loop exit condition.  */
5450
5451 void
5452 vect_transform_loop (loop_vec_info loop_vinfo)
5453 {
5454   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5455   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5456   int nbbs = loop->num_nodes;
5457   gimple_stmt_iterator si;
5458   int i;
5459   tree ratio = NULL;
5460   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5461   bool grouped_store;
5462   bool slp_scheduled = false;
5463   unsigned int nunits;
5464   gimple stmt, pattern_stmt;
5465   gimple_seq pattern_def_seq = NULL;
5466   gimple_stmt_iterator pattern_def_si = gsi_none ();
5467   bool transform_pattern_stmt = false;
5468   bool check_profitability = false;
5469   int th;
5470   /* Record number of iterations before we started tampering with the profile. */
5471   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5472
5473   if (dump_enabled_p ())
5474     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5475
5476   /* If profile is inprecise, we have chance to fix it up.  */
5477   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5478     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5479
5480   /* Use the more conservative vectorization threshold.  If the number
5481      of iterations is constant assume the cost check has been performed
5482      by our caller.  If the threshold makes all loops profitable that
5483      run at least the vectorization factor number of times checking
5484      is pointless, too.  */
5485   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5486          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5487   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5488   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5489       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5490     {
5491       if (dump_enabled_p ())
5492         dump_printf_loc (MSG_NOTE, vect_location,
5493                          "Profitability threshold is %d loop iterations.", th);
5494       check_profitability = true;
5495     }
5496
5497   /* Peel the loop if there are data refs with unknown alignment.
5498      Only one data ref with unknown store is allowed.  */
5499
5500   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5501     {
5502       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5503       check_profitability = false;
5504     }
5505
5506   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5507       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5508     {
5509       vect_loop_versioning (loop_vinfo, th, check_profitability);
5510       check_profitability = false;
5511     }
5512
5513   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5514      compile time constant), or it is a constant that doesn't divide by the
5515      vectorization factor, then an epilog loop needs to be created.
5516      We therefore duplicate the loop: the original loop will be vectorized,
5517      and will compute the first (n/VF) iterations.  The second copy of the loop
5518      will remain scalar and will compute the remaining (n%VF) iterations.
5519      (VF is the vectorization factor).  */
5520
5521   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5522        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5523            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5524        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5525     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5526                                     th, check_profitability);
5527   else
5528     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5529                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5530
5531   /* 1) Make sure the loop header has exactly two entries
5532      2) Make sure we have a preheader basic block.  */
5533
5534   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5535
5536   split_edge (loop_preheader_edge (loop));
5537
5538   /* FORNOW: the vectorizer supports only loops which body consist
5539      of one basic block (header + empty latch). When the vectorizer will
5540      support more involved loop forms, the order by which the BBs are
5541      traversed need to be reconsidered.  */
5542
5543   for (i = 0; i < nbbs; i++)
5544     {
5545       basic_block bb = bbs[i];
5546       stmt_vec_info stmt_info;
5547       gimple phi;
5548
5549       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5550         {
5551           phi = gsi_stmt (si);
5552           if (dump_enabled_p ())
5553             {
5554               dump_printf_loc (MSG_NOTE, vect_location,
5555                                "------>vectorizing phi: ");
5556               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5557             }
5558           stmt_info = vinfo_for_stmt (phi);
5559           if (!stmt_info)
5560             continue;
5561
5562           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5563             vect_loop_kill_debug_uses (loop, phi);
5564
5565           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5566               && !STMT_VINFO_LIVE_P (stmt_info))
5567             continue;
5568
5569           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5570                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5571               && dump_enabled_p ())
5572             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5573
5574           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5575             {
5576               if (dump_enabled_p ())
5577                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5578               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5579             }
5580         }
5581
5582       pattern_stmt = NULL;
5583       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5584         {
5585           bool is_store;
5586
5587           if (transform_pattern_stmt)
5588             stmt = pattern_stmt;
5589           else
5590             stmt = gsi_stmt (si);
5591
5592           if (dump_enabled_p ())
5593             {
5594               dump_printf_loc (MSG_NOTE, vect_location,
5595                                "------>vectorizing statement: ");
5596               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5597             }
5598
5599           stmt_info = vinfo_for_stmt (stmt);
5600
5601           /* vector stmts created in the outer-loop during vectorization of
5602              stmts in an inner-loop may not have a stmt_info, and do not
5603              need to be vectorized.  */
5604           if (!stmt_info)
5605             {
5606               gsi_next (&si);
5607               continue;
5608             }
5609
5610           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5611             vect_loop_kill_debug_uses (loop, stmt);
5612
5613           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5614               && !STMT_VINFO_LIVE_P (stmt_info))
5615             {
5616               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5617                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5618                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5619                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5620                 {
5621                   stmt = pattern_stmt;
5622                   stmt_info = vinfo_for_stmt (stmt);
5623                 }
5624               else
5625                 {
5626                   gsi_next (&si);
5627                   continue;
5628                 }
5629             }
5630           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5631                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5632                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5633                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5634             transform_pattern_stmt = true;
5635
5636           /* If pattern statement has def stmts, vectorize them too.  */
5637           if (is_pattern_stmt_p (stmt_info))
5638             {
5639               if (pattern_def_seq == NULL)
5640                 {
5641                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5642                   pattern_def_si = gsi_start (pattern_def_seq);
5643                 }
5644               else if (!gsi_end_p (pattern_def_si))
5645                 gsi_next (&pattern_def_si);
5646               if (pattern_def_seq != NULL)
5647                 {
5648                   gimple pattern_def_stmt = NULL;
5649                   stmt_vec_info pattern_def_stmt_info = NULL;
5650
5651                   while (!gsi_end_p (pattern_def_si))
5652                     {
5653                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5654                       pattern_def_stmt_info
5655                         = vinfo_for_stmt (pattern_def_stmt);
5656                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5657                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5658                         break;
5659                       gsi_next (&pattern_def_si);
5660                     }
5661
5662                   if (!gsi_end_p (pattern_def_si))
5663                     {
5664                       if (dump_enabled_p ())
5665                         {
5666                           dump_printf_loc (MSG_NOTE, vect_location,
5667                                            "==> vectorizing pattern def "
5668                                            "stmt: ");
5669                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5670                                             pattern_def_stmt, 0);
5671                         }
5672
5673                       stmt = pattern_def_stmt;
5674                       stmt_info = pattern_def_stmt_info;
5675                     }
5676                   else
5677                     {
5678                       pattern_def_si = gsi_none ();
5679                       transform_pattern_stmt = false;
5680                     }
5681                 }
5682               else
5683                 transform_pattern_stmt = false;
5684             }
5685
5686           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5687           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5688                                                STMT_VINFO_VECTYPE (stmt_info));
5689           if (!STMT_SLP_TYPE (stmt_info)
5690               && nunits != (unsigned int) vectorization_factor
5691               && dump_enabled_p ())
5692             /* For SLP VF is set according to unrolling factor, and not to
5693                vector size, hence for SLP this print is not valid.  */
5694             dump_printf_loc (MSG_NOTE, vect_location,
5695                              "multiple-types.");
5696
5697           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5698              reached.  */
5699           if (STMT_SLP_TYPE (stmt_info))
5700             {
5701               if (!slp_scheduled)
5702                 {
5703                   slp_scheduled = true;
5704
5705                   if (dump_enabled_p ())
5706                     dump_printf_loc (MSG_NOTE, vect_location,
5707                                      "=== scheduling SLP instances ===");
5708
5709                   vect_schedule_slp (loop_vinfo, NULL);
5710                 }
5711
5712               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5713               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5714                 {
5715                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5716                     {
5717                       pattern_def_seq = NULL;
5718                       gsi_next (&si);
5719                     }
5720                   continue;
5721                 }
5722             }
5723
5724           /* -------- vectorize statement ------------ */
5725           if (dump_enabled_p ())
5726             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5727
5728           grouped_store = false;
5729           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5730           if (is_store)
5731             {
5732               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5733                 {
5734                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5735                      interleaving chain was completed - free all the stores in
5736                      the chain.  */
5737                   gsi_next (&si);
5738                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5739                   continue;
5740                 }
5741               else
5742                 {
5743                   /* Free the attached stmt_vec_info and remove the stmt.  */
5744                   gimple store = gsi_stmt (si);
5745                   free_stmt_vec_info (store);
5746                   unlink_stmt_vdef (store);
5747                   gsi_remove (&si, true);
5748                   release_defs (store);
5749                   continue;
5750                 }
5751             }
5752
5753           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5754             {
5755               pattern_def_seq = NULL;
5756               gsi_next (&si);
5757             }
5758         }                       /* stmts in BB */
5759     }                           /* BBs in loop */
5760
5761   slpeel_make_loop_iterate_ntimes (loop, ratio);
5762
5763   /* Reduce loop iterations by the vectorization factor.  */
5764   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5765                       expected_iterations / vectorization_factor);
5766   loop->nb_iterations_upper_bound
5767     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5768                                             FLOOR_DIV_EXPR);
5769   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5770       && loop->nb_iterations_upper_bound != double_int_zero)
5771     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5772   if (loop->any_estimate)
5773     {
5774       loop->nb_iterations_estimate
5775         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5776                                              FLOOR_DIV_EXPR);
5777        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5778            && loop->nb_iterations_estimate != double_int_zero)
5779          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5780     }
5781
5782   if (dump_enabled_p ())
5783     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5784   if (loop->inner && dump_enabled_p ())
5785     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5786                      "OUTER LOOP VECTORIZED.");
5787 }