gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "input.h"
  28 #include "alias.h"
  29 #include "symtab.h"
  30 #include "tree.h"
  31 #include "fold-const.h"
  32 #include "stor-layout.h"
  33 #include "predict.h"
  34 #include "hard-reg-set.h"
  35 #include "function.h"
  36 #include "dominance.h"
  37 #include "cfg.h"
  38 #include "cfganal.h"
  39 #include "basic-block.h"
  40 #include "gimple-pretty-print.h"
  41 #include "tree-ssa-alias.h"
  42 #include "internal-fn.h"
  43 #include "gimple-expr.h"
  44 #include "is-a.h"
  45 #include "gimple.h"
  46 #include "gimplify.h"
  47 #include "gimple-iterator.h"
  48 #include "gimplify-me.h"
  49 #include "gimple-ssa.h"
  50 #include "tree-phinodes.h"
  51 #include "ssa-iterators.h"
  52 #include "stringpool.h"
  53 #include "tree-ssanames.h"
  54 #include "tree-ssa-loop-ivopts.h"
  55 #include "tree-ssa-loop-manip.h"
  56 #include "tree-ssa-loop-niter.h"
  57 #include "tree-pass.h"
  58 #include "cfgloop.h"
  59 #include "rtl.h"
  60 #include "flags.h"
  61 #include "insn-config.h"
  62 #include "expmed.h"
  63 #include "dojump.h"
  64 #include "explow.h"
  65 #include "calls.h"
  66 #include "emit-rtl.h"
  67 #include "varasm.h"
  68 #include "stmt.h"
  69 #include "expr.h"
  70 #include "recog.h"
  71 #include "insn-codes.h"
  72 #include "optabs.h"
  73 #include "params.h"
  74 #include "diagnostic-core.h"
  75 #include "tree-chrec.h"
  76 #include "tree-scalar-evolution.h"
  77 #include "tree-vectorizer.h"
  78 #include "target.h"
  79
  80 /* Loop Vectorization Pass.
  81
  82    This pass tries to vectorize loops.
  83
  84    For example, the vectorizer transforms the following simple loop:
  85
  86         short a[N]; short b[N]; short c[N]; int i;
  87
  88         for (i=0; i<N; i++){
  89           a[i] = b[i] + c[i];
  90         }
  91
  92    as if it was manually vectorized by rewriting the source code into:
  93
  94         typedef int __attribute__((mode(V8HI))) v8hi;
  95         short a[N];  short b[N]; short c[N];   int i;
  96         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  97         v8hi va, vb, vc;
  98
  99         for (i=0; i<N/8; i++){
 100           vb = pb[i];
 101           vc = pc[i];
 102           va = vb + vc;
 103           pa[i] = va;
 104         }
 105
 106         The main entry to this pass is vectorize_loops(), in which
 107    the vectorizer applies a set of analyses on a given set of loops,
 108    followed by the actual vectorization transformation for the loops that
 109    had successfully passed the analysis phase.
 110         Throughout this pass we make a distinction between two types of
 111    data: scalars (which are represented by SSA_NAMES), and memory references
 112    ("data-refs").  These two types of data require different handling both
 113    during analysis and transformation. The types of data-refs that the
  114    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
 115    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 116    accesses are required to have a simple (consecutive) access pattern.
 117
 118    Analysis phase:
 119    ===============
 120         The driver for the analysis phase is vect_analyze_loop().
 121    It applies a set of analyses, some of which rely on the scalar evolution
 122    analyzer (scev) developed by Sebastian Pop.
 123
 124         During the analysis phase the vectorizer records some information
 125    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 126    loop, as well as general information about the loop as a whole, which is
 127    recorded in a "loop_vec_info" struct attached to each loop.
 128
 129    Transformation phase:
 130    =====================
 131         The loop transformation phase scans all the stmts in the loop, and
 132    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 133    the loop that needs to be vectorized.  It inserts the vector code sequence
 134    just before the scalar stmt S, and records a pointer to the vector code
 135    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 136    attached to S).  This pointer will be used for the vectorization of following
 137    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 138    otherwise, we rely on dead code elimination for removing it.
 139
 140         For example, say stmt S1 was vectorized into stmt VS1:
 141
 142    VS1: vb = px[i];
 143    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 144    S2:  a = b;
 145
 146    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 147    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 148    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 149    resulting sequence would be:
 150
 151    VS1: vb = px[i];
 152    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 153    VS2: va = vb;
 154    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 155
  156         Operands that are not SSA_NAMEs are data-refs that appear in
 157    load/store operations (like 'x[i]' in S1), and are handled differently.
 158
 159    Target modeling:
 160    =================
 161         Currently the only target specific information that is used is the
 162    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 163    Targets that can support different sizes of vectors, for now will need
 164    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 165    flexibility will be added in the future.
 166
  167         Since we only vectorize operations whose vector form can be
  168    expressed using existing tree codes, to verify that an operation is
  169    supported, the vectorizer checks the relevant optab at the relevant
  170    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 171    the value found is CODE_FOR_nothing, then there's no target support, and
 172    we can't vectorize the stmt.
 173
 174    For additional information on this project see:
 175    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 176 */
 177
 178 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 179
  180 /* Function vect_determine_vectorization_factor
  181
  182    Determine the vectorization factor (VF).  VF is the number of data elements
  183    that are operated upon in parallel in a single iteration of the vectorized
  184    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  185    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  186    elements can fit in a single vector register.
  187
  188    VF is the largest number of vector elements (nunits) seen among the relevant
  189    PHIs and the relevant or live statements of the loop.  Within one statement,
  190    the statement's vector type and the vector type obtained for its smallest
  191    scalar type must have the same mode size; otherwise the function fails.
  192
  193    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  194    original loop:
  195         for (i=0; i<N; i++){
  196           a[i] = b[i] + c[i];
  197         }
  198
  199    vectorized loop:
  200         for (i=0; i<N; i+=VF){
  201           a[i:VF] = b[i:VF] + c[i:VF];
  202         }

         Returns true and stores VF in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO) on
         success.  As a side effect, STMT_VINFO_VECTYPE is set for every analyzed
         PHI or statement that did not have a vectype yet.  Returns false (after a
         missed-optimization dump message when dumping is enabled) if no vector
         type can be found for a scalar type, if a non-call statement has no lhs,
         if a statement already has a vector mode, if the two vector types of a
         statement differ in mode size, or if the final VF is <= 1.
  203 */
  204
  205 static bool
  206 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  207 {
  208   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  209   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  210   int nbbs = loop->num_nodes;
  211   unsigned int vectorization_factor = 0;
  212   tree scalar_type;
  213   gphi *phi;
  214   tree vectype;
  215   unsigned int nunits;
  216   stmt_vec_info stmt_info;
  217   int i;
  218   HOST_WIDE_INT dummy;
        /* State for the statement walk below.  PATTERN_STMT is the pattern
           statement related to the statement under SI; PATTERN_DEF_SEQ and
           PATTERN_DEF_SI walk that pattern statement's def sequence; and
           ANALYZE_PATTERN_STMT asks the next iteration to examine PATTERN_STMT
           instead of the statement under SI.  */
  219   gimple stmt, pattern_stmt = NULL;
  220   gimple_seq pattern_def_seq = NULL;
  221   gimple_stmt_iterator pattern_def_si = gsi_none ();
  222   bool analyze_pattern_stmt = false;
  223
  224   if (dump_enabled_p ())
  225     dump_printf_loc (MSG_NOTE, vect_location,
  226                      "=== vect_determine_vectorization_factor ===\n");
  227
        /* Visit every basic block of the loop: first its PHIs, then its
           statements.  */
  228   for (i = 0; i < nbbs; i++)
  229     {
  230       basic_block bb = bbs[i];
  231
            /* PHIs.  A relevant PHI must not have a vectype yet; one is derived
               from the type of the PHI result and contributes its nunits to the
               vectorization factor.  */
  232       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
  233            gsi_next (&si))
  234         {
  235           phi = si.phi ();
  236           stmt_info = vinfo_for_stmt (phi);
  237           if (dump_enabled_p ())
  238             {
  239               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  240               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  241               dump_printf (MSG_NOTE, "\n");
  242             }
  243
  244           gcc_assert (stmt_info);
  245
  246           if (STMT_VINFO_RELEVANT_P (stmt_info))
  247             {
  248               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  249               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  250
  251               if (dump_enabled_p ())
  252                 {
  253                   dump_printf_loc (MSG_NOTE, vect_location,
  254                                    "get vectype for scalar type:  ");
  255                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  256                   dump_printf (MSG_NOTE, "\n");
  257                 }
  258
  259               vectype = get_vectype_for_scalar_type (scalar_type);
  260               if (!vectype)
  261                 {
  262                   if (dump_enabled_p ())
  263                     {
  264                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  265                                        "not vectorized: unsupported "
  266                                        "data-type ");
  267                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  268                                          scalar_type);
  269                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  270                     }
  271                   return false;
  272                 }
  273               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  274
  275               if (dump_enabled_p ())
  276                 {
  277                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  278                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  279                   dump_printf (MSG_NOTE, "\n");
  280                 }
  281
  282               nunits = TYPE_VECTOR_SUBPARTS (vectype);
  283               if (dump_enabled_p ())
  284                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
  285                                  nunits);
  286
  287               if (!vectorization_factor
  288                   || (nunits > vectorization_factor))
  289                 vectorization_factor = nunits;
  290             }
  291         }
  292
            /* Statements.  The for-header has no increment: SI is advanced only
               at the gsi_next calls in the body, and the loop keeps running
               while a pattern statement is still pending.  One original
               statement may therefore be processed by several iterations (the
               statement itself, its pattern def statements, and its pattern
               statement).  */
  293       for (gimple_stmt_iterator si = gsi_start_bb (bb);
  294            !gsi_end_p (si) || analyze_pattern_stmt;)
  295         {
  296           tree vf_vectype;
  297
  298           if (analyze_pattern_stmt)
  299             stmt = pattern_stmt;
  300           else
  301             stmt = gsi_stmt (si);
  302
  303           stmt_info = vinfo_for_stmt (stmt);
  304
  305           if (dump_enabled_p ())
  306             {
  307               dump_printf_loc (MSG_NOTE, vect_location,
  308                                "==> examining statement: ");
  309               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  310               dump_printf (MSG_NOTE, "\n");
  311             }
  312
  313           gcc_assert (stmt_info);
  314
  315           /* Skip stmts which do not need to be vectorized.  */
                /* If STMT is irrelevant (or a clobber) but its pattern statement
                   is relevant or live, switch to the pattern statement right
                   away.  If STMT and its pattern statement are both relevant or
                   live, analyze STMT now and flag the pattern statement for the
                   next iteration.  */
  316           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  317                && !STMT_VINFO_LIVE_P (stmt_info))
  318               || gimple_clobber_p (stmt))
  319             {
  320               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  321                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  322                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  323                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  324                 {
  325                   stmt = pattern_stmt;
  326                   stmt_info = vinfo_for_stmt (pattern_stmt);
  327                   if (dump_enabled_p ())
  328                     {
  329                       dump_printf_loc (MSG_NOTE, vect_location,
  330                                        "==> examining pattern statement: ");
  331                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  332                       dump_printf (MSG_NOTE, "\n");
  333                     }
  334                 }
  335               else
  336                 {
  337                   if (dump_enabled_p ())
  338                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  339                   gsi_next (&si);
  340                   continue;
  341                 }
  342             }
  343           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  344                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  345                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  346                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  347             analyze_pattern_stmt = true;
  348
  349           /* If a pattern statement has def stmts, analyze them too.  */
  350           if (is_pattern_stmt_p (stmt_info))
  351             {
                    /* First visit of this pattern statement: start walking its
                       def sequence.  Later visits: step past the def statement
                       that was analyzed by the previous iteration.  */
  352               if (pattern_def_seq == NULL)
  353                 {
  354                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  355                   pattern_def_si = gsi_start (pattern_def_seq);
  356                 }
  357               else if (!gsi_end_p (pattern_def_si))
  358                 gsi_next (&pattern_def_si);
  359               if (pattern_def_seq != NULL)
  360                 {
  361                   gimple pattern_def_stmt = NULL;
  362                   stmt_vec_info pattern_def_stmt_info = NULL;
  363
                        /* Skip def statements that are neither relevant nor
                           live.  */
  364                   while (!gsi_end_p (pattern_def_si))
  365                     {
  366                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  367                       pattern_def_stmt_info
  368                         = vinfo_for_stmt (pattern_def_stmt);
  369                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  370                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  371                         break;
  372                       gsi_next (&pattern_def_si);
  373                     }
  374
                        /* A def statement remains: analyze it in this iteration.
                           SI is not advanced at the end of the loop body while
                           PATTERN_DEF_SI is not at its end.  */
  375                   if (!gsi_end_p (pattern_def_si))
  376                     {
  377                       if (dump_enabled_p ())
  378                         {
  379                           dump_printf_loc (MSG_NOTE, vect_location,
  380                                            "==> examining pattern def stmt: ");
  381                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  382                                             pattern_def_stmt, 0);
  383                           dump_printf (MSG_NOTE, "\n");
  384                         }
  385
  386                       stmt = pattern_def_stmt;
  387                       stmt_info = pattern_def_stmt_info;
  388                     }
  389                   else
  390                     {
                            /* Def sequence exhausted: STMT (the pattern statement
                               itself) is analyzed below.  Resetting both pieces
                               of state lets the end of the loop body advance SI
                               (assuming gsi_end_p holds for gsi_none () -- TODO
                               confirm).  */
  391                       pattern_def_si = gsi_none ();
  392                       analyze_pattern_stmt = false;
  393                     }
  394                 }
  395               else
  396                 analyze_pattern_stmt = false;
  397             }
  398
  399           if (gimple_get_lhs (stmt) == NULL_TREE
  400               /* MASK_STORE has no lhs, but is ok.  */
  401               && (!is_gimple_call (stmt)
  402                   || !gimple_call_internal_p (stmt)
  403                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
  404             {
  405               if (is_gimple_call (stmt))
  406                 {
  407                   /* Ignore calls with no lhs.  These must be calls to
  408                      #pragma omp simd functions, and what vectorization factor
  409                      it really needs can't be determined until
  410                      vectorizable_simd_clone_call.  */
  411                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  412                     {
  413                       pattern_def_seq = NULL;
  414                       gsi_next (&si);
  415                     }
  416                   continue;
  417                 }
  418               if (dump_enabled_p ())
  419                 {
  420                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  421                                    "not vectorized: irregular stmt.");
  422                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  423                                     0);
  424                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  425                 }
  426               return false;
  427             }
  428
                /* A statement whose type already has a vector mode is rejected.  */
  429           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  430             {
  431               if (dump_enabled_p ())
  432                 {
  433                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  434                                    "not vectorized: vector stmt in loop:");
  435                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  436                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  437                 }
  438               return false;
  439             }
  440
  441           if (STMT_VINFO_VECTYPE (stmt_info))
  442             {
  443               /* The only case when a vectype had been already set is for stmts
  444                  that contain a dataref, or for "pattern-stmts" (stmts
  445                  generated by the vectorizer to represent/replace a certain
  446                  idiom).  */
  447               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  448                           || is_pattern_stmt_p (stmt_info)
  449                           || !gsi_end_p (pattern_def_si));
  450               vectype = STMT_VINFO_VECTYPE (stmt_info);
  451             }
  452           else
  453             {
  454               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
                    /* IFN_MASK_STORE has no lhs, so its scalar type is taken
                       from call argument 3 (presumably the stored value -- TODO
                       confirm); every other statement uses the type of its
                       lhs.  */
  455               if (is_gimple_call (stmt)
  456                   && gimple_call_internal_p (stmt)
  457                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
  458                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
  459               else
  460                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  461               if (dump_enabled_p ())
  462                 {
  463                   dump_printf_loc (MSG_NOTE, vect_location,
  464                                    "get vectype for scalar type:  ");
  465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  466                   dump_printf (MSG_NOTE, "\n");
  467                 }
  468               vectype = get_vectype_for_scalar_type (scalar_type);
  469               if (!vectype)
  470                 {
  471                   if (dump_enabled_p ())
  472                     {
  473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  474                                        "not vectorized: unsupported "
  475                                        "data-type ");
  476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  477                                          scalar_type);
  478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  479                     }
  480                   return false;
  481                 }
  482
  483               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  484
  485               if (dump_enabled_p ())
  486                 {
  487                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  488                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  489                   dump_printf (MSG_NOTE, "\n");
  490                 }
  491             }
  492
  493           /* The vectorization factor is according to the smallest
  494              scalar type (or the largest vector size, but we only
  495              support one vector size per loop).  */
  496           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  497                                                        &dummy);
  498           if (dump_enabled_p ())
  499             {
  500               dump_printf_loc (MSG_NOTE, vect_location,
  501                                "get vectype for scalar type:  ");
  502               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  503               dump_printf (MSG_NOTE, "\n");
  504             }
  505           vf_vectype = get_vectype_for_scalar_type (scalar_type);
  506           if (!vf_vectype)
  507             {
  508               if (dump_enabled_p ())
  509                 {
  510                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  511                                    "not vectorized: unsupported data-type ");
  512                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  513                                      scalar_type);
  514                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  515                 }
  516               return false;
  517             }
  518
                /* VECTYPE (for the statement) and VF_VECTYPE (for its smallest
                   scalar type) must occupy the same number of bytes.  */
  519           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
  520                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  521             {
  522               if (dump_enabled_p ())
  523                 {
  524                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  525                                    "not vectorized: different sized vector "
  526                                    "types in statement, ");
  527                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  528                                      vectype);
  529                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  530                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  531                                      vf_vectype);
  532                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  533                 }
  534               return false;
  535             }
  536
  537           if (dump_enabled_p ())
  538             {
  539               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  540               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  541               dump_printf (MSG_NOTE, "\n");
  542             }
  543
                /* Keep the largest nunits seen so far.  */
  544           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
  545           if (dump_enabled_p ())
  546             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
  547           if (!vectorization_factor
  548               || (nunits > vectorization_factor))
  549             vectorization_factor = nunits;
  550
                /* Move on to the next statement only when neither a pattern
                   statement nor a pattern def statement is still pending, and
                   forget the finished def sequence.  */
  551           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  552             {
  553               pattern_def_seq = NULL;
  554               gsi_next (&si);
  555             }
  556         }
  557     }
  558
  559   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  560   if (dump_enabled_p ())
  561     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
  562                      vectorization_factor);
        /* 0 means that no PHI or statement contributed a vector type; 1 means
           that only single-element vector types were found.  Neither is
           accepted.  */
  563   if (vectorization_factor <= 1)
  564     {
  565       if (dump_enabled_p ())
  566         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  567                          "not vectorized: unsupported data-type\n");
  568       return false;
  569     }
  570   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  571
  572   return true;
  573 }
 574
 575
 576 /* Function vect_is_simple_iv_evolution.
 577
 578    FORNOW: A simple evolution of an induction variables in the loop is
 579    considered a polynomial evolution.  */
 580
 581 static bool
 582 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 583                              tree * step)
 584 {
 585   tree init_expr;
 586   tree step_expr;
 587   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 588   basic_block bb;
 589
 590   /* When there is no evolution in this loop, the evolution function
 591      is not "simple".  */
 592   if (evolution_part == NULL_TREE)
 593     return false;
 594
 595   /* When the evolution is a polynomial of degree >= 2
 596      the evolution function is not "simple".  */
 597   if (tree_is_chrec (evolution_part))
 598     return false;
 599
 600   step_expr = evolution_part;
 601   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 602
 603   if (dump_enabled_p ())
 604     {
 605       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 606       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 607       dump_printf (MSG_NOTE, ",  init: ");
 608       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 609       dump_printf (MSG_NOTE, "\n");
 610     }
 611
 612   *init = init_expr;
 613   *step = step_expr;
 614
 615   if (TREE_CODE (step_expr) != INTEGER_CST
 616       && (TREE_CODE (step_expr) != SSA_NAME
 617           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 618               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 619           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 620               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 621                   || !flag_associative_math)))
 622       && (TREE_CODE (step_expr) != REAL_CST
 623           || !flag_associative_math))
 624     {
 625       if (dump_enabled_p ())
 626         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 627                          "step unknown.\n");
 628       return false;
 629     }
 630
 631   return true;
 632 }
 633
 634 /* Function vect_analyze_scalar_cycles_1.
 635
 636    Examine the cross iteration def-use cycles of scalar variables
 637    in LOOP.  LOOP_VINFO represents the loop that is now being
 638    considered for vectorization (can be LOOP, or an outer-loop
 639    enclosing LOOP).  */
 640
 641 static void
 642 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 643 {
 644   basic_block bb = loop->header;
 645   tree init, step;
 646   auto_vec<gimple, 64> worklist;
 647   gphi_iterator gsi;
 648   bool double_reduc;
 649
 650   if (dump_enabled_p ())
 651     dump_printf_loc (MSG_NOTE, vect_location,
 652                      "=== vect_analyze_scalar_cycles ===\n");
 653
 654   /* First - identify all inductions.  Reduction detection assumes that all the
 655      inductions have been identified, therefore, this order must not be
 656      changed.  */
 657   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 658     {
 659       gphi *phi = gsi.phi ();
 660       tree access_fn = NULL;
 661       tree def = PHI_RESULT (phi);
 662       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 663
 664       if (dump_enabled_p ())
 665         {
 666           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 667           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 668           dump_printf (MSG_NOTE, "\n");
 669         }
 670
 671       /* Skip virtual phi's.  The data dependences that are associated with
 672          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 673       if (virtual_operand_p (def))
 674         continue;
 675
 676       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 677
 678       /* Analyze the evolution function.  */
 679       access_fn = analyze_scalar_evolution (loop, def);
 680       if (access_fn)
 681         {
 682           STRIP_NOPS (access_fn);
 683           if (dump_enabled_p ())
 684             {
 685               dump_printf_loc (MSG_NOTE, vect_location,
 686                                "Access function of PHI: ");
 687               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 688               dump_printf (MSG_NOTE, "\n");
 689             }
 690           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 691             = evolution_part_in_loop_num (access_fn, loop->num);
 692         }
 693
 694       if (!access_fn
 695           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 696           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 697               && TREE_CODE (step) != INTEGER_CST))
 698         {
 699           worklist.safe_push (phi);
 700           continue;
 701         }
 702
 703       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 704
 705       if (dump_enabled_p ())
 706         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 707       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 708     }
 709
 710
 711   /* Second - identify all reductions and nested cycles.  */
 712   while (worklist.length () > 0)
 713     {
 714       gimple phi = worklist.pop ();
 715       tree def = PHI_RESULT (phi);
 716       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 717       gimple reduc_stmt;
 718       bool nested_cycle;
 719
 720       if (dump_enabled_p ())
 721         {
 722           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 723           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 724           dump_printf (MSG_NOTE, "\n");
 725         }
 726
 727       gcc_assert (!virtual_operand_p (def)
 728                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 729
 730       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 731       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 732                                                 &double_reduc);
 733       if (reduc_stmt)
 734         {
 735           if (double_reduc)
 736             {
 737               if (dump_enabled_p ())
 738                 dump_printf_loc (MSG_NOTE, vect_location,
 739                                  "Detected double reduction.\n");
 740
 741               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 742               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 743                                                     vect_double_reduction_def;
 744             }
 745           else
 746             {
 747               if (nested_cycle)
 748                 {
 749                   if (dump_enabled_p ())
 750                     dump_printf_loc (MSG_NOTE, vect_location,
 751                                      "Detected vectorizable nested cycle.\n");
 752
 753                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 754                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 755                                                              vect_nested_cycle;
 756                 }
 757               else
 758                 {
 759                   if (dump_enabled_p ())
 760                     dump_printf_loc (MSG_NOTE, vect_location,
 761                                      "Detected reduction.\n");
 762
 763                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 764                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 765                                                            vect_reduction_def;
 766                   /* Store the reduction cycles for possible vectorization in
 767                      loop-aware SLP.  */
 768                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 769                 }
 770             }
 771         }
 772       else
 773         if (dump_enabled_p ())
 774           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 775                            "Unknown def-use cycle pattern.\n");
 776     }
 777 }
 778
 779
 780 /* Function vect_analyze_scalar_cycles.
 781
 782    Examine the cross iteration def-use cycles of scalar variables, by
 783    analyzing the loop-header PHIs of scalar variables.  Classify each
 784    cycle as one of the following: invariant, induction, reduction, unknown.
 785    We do that for the loop represented by LOOP_VINFO, and also to its
 786    inner-loop, if exists.
 787    Examples for scalar cycles:
 788
 789    Example1: reduction:
 790
 791               loop1:
 792               for (i=0; i<N; i++)
 793                  sum += a[i];
 794
 795    Example2: induction:
 796
 797               loop2:
 798               for (i=0; i<N; i++)
 799                  a[i] = i;  */
 800
 801 static void
 802 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 803 {
 804   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 805
 806   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 807
 808   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 809      Reductions in such inner-loop therefore have different properties than
 810      the reductions in the nest that gets vectorized:
 811      1. When vectorized, they are executed in the same order as in the original
 812         scalar loop, so we can't change the order of computation when
 813         vectorizing them.
 814      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 815         current checks are too strict.  */
 816
 817   if (loop->inner)
 818     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 819 }
 820
 821 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 822
 823 static void
 824 vect_fixup_reduc_chain (gimple stmt)
 825 {
 826   gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 827   gimple stmtp;
 828   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 829               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 830   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 831   do
 832     {
 833       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 834       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 835       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 836       if (stmt)
 837         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 838           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 839     }
 840   while (stmt);
 841   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 842 }
 843
 844 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 845
 846 static void
 847 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 848 {
 849   gimple first;
 850   unsigned i;
 851
 852   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 853     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 854       {
 855         vect_fixup_reduc_chain (first);
 856         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 857           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 858       }
 859 }
 860
 861 /* Function vect_get_loop_niters.
 862
 863    Determine how many iterations the loop is executed and place it
 864    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 865    in NUMBER_OF_ITERATIONSM1.
 866
 867    Return the loop exit condition.  */
 868
 869
 870 static gcond *
 871 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 872                       tree *number_of_iterationsm1)
 873 {
 874   tree niters;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location,
 878                      "=== get_loop_niters ===\n");
 879
 880   niters = number_of_latch_executions (loop);
 881   *number_of_iterationsm1 = niters;
 882
 883   /* We want the number of loop header executions which is the number
 884      of latch executions plus one.
 885      ???  For UINT_MAX latch executions this number overflows to zero
 886      for loops like do { n++; } while (n != 0);  */
 887   if (niters && !chrec_contains_undetermined (niters))
 888     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 889                           build_int_cst (TREE_TYPE (niters), 1));
 890   *number_of_iterations = niters;
 891
 892   return get_loop_exit_condition (loop);
 893 }
 894
 895
 896 /* Function bb_in_loop_p
 897
 898    Used as predicate for dfs order traversal of the loop bbs.  */
 899
 900 static bool
 901 bb_in_loop_p (const_basic_block bb, const void *data)
 902 {
 903   const struct loop *const loop = (const struct loop *)data;
 904   if (flow_bb_inside_loop_p (loop, bb))
 905     return true;
 906   return false;
 907 }
 908
 909
 910 /* Function new_loop_vec_info.
 911
 912    Create and initialize a new loop_vec_info struct for LOOP, as well as
 913    stmt_vec_info structs for all the stmts in LOOP.  */
 914
 915 static loop_vec_info
 916 new_loop_vec_info (struct loop *loop)
 917 {
 918   loop_vec_info res;
 919   basic_block *bbs;
 920   gimple_stmt_iterator si;
 921   unsigned int i, nbbs;
 922
 923   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 924   LOOP_VINFO_LOOP (res) = loop;
 925
 926   bbs = get_loop_body (loop);
 927
 928   /* Create/Update stmt_info for all stmts in the loop.  */
 929   for (i = 0; i < loop->num_nodes; i++)
 930     {
 931       basic_block bb = bbs[i];
 932
 933       /* BBs in a nested inner-loop will have been already processed (because
 934          we will have called vect_analyze_loop_form for any nested inner-loop).
 935          Therefore, for stmts in an inner-loop we just want to update the
 936          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 937          loop_info of the outer-loop we are currently considering to vectorize
 938          (instead of the loop_info of the inner-loop).
 939          For stmts in other BBs we need to create a stmt_info from scratch.  */
 940       if (bb->loop_father != loop)
 941         {
 942           /* Inner-loop bb.  */
 943           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 944           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 945             {
 946               gimple phi = gsi_stmt (si);
 947               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 948               loop_vec_info inner_loop_vinfo =
 949                 STMT_VINFO_LOOP_VINFO (stmt_info);
 950               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 951               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 952             }
 953           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 954            {
 955               gimple stmt = gsi_stmt (si);
 956               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 957               loop_vec_info inner_loop_vinfo =
 958                  STMT_VINFO_LOOP_VINFO (stmt_info);
 959               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 960               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 961            }
 962         }
 963       else
 964         {
 965           /* bb in current nest.  */
 966           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 967             {
 968               gimple phi = gsi_stmt (si);
 969               gimple_set_uid (phi, 0);
 970               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 971             }
 972
 973           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 974             {
 975               gimple stmt = gsi_stmt (si);
 976               gimple_set_uid (stmt, 0);
 977               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 978             }
 979         }
 980     }
 981
 982   /* CHECKME: We want to visit all BBs before their successors (except for
 983      latch blocks, for which this assertion wouldn't hold).  In the simple
 984      case of the loop forms we allow, a dfs order of the BBs would the same
 985      as reversed postorder traversal, so we are safe.  */
 986
 987    free (bbs);
 988    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 989    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 990                               bbs, loop->num_nodes, loop);
 991    gcc_assert (nbbs == loop->num_nodes);
 992
 993   LOOP_VINFO_BBS (res) = bbs;
 994   LOOP_VINFO_NITERSM1 (res) = NULL;
 995   LOOP_VINFO_NITERS (res) = NULL;
 996   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 997   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 998   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 999   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1000   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1001   LOOP_VINFO_VECT_FACTOR (res) = 0;
1002   LOOP_VINFO_LOOP_NEST (res).create (3);
1003   LOOP_VINFO_DATAREFS (res).create (10);
1004   LOOP_VINFO_DDRS (res).create (10 * 10);
1005   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1006   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
1007              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1008   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
1009              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1010   LOOP_VINFO_GROUPED_STORES (res).create (10);
1011   LOOP_VINFO_REDUCTIONS (res).create (10);
1012   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
1013   LOOP_VINFO_SLP_INSTANCES (res).create (10);
1014   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1015   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1016   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1017   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1018   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1019
1020   return res;
1021 }
1022
1023
1024 /* Function destroy_loop_vec_info.
1025
1026    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1027    stmts in the loop.  */
1028
1029 void
1030 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1031 {
1032   struct loop *loop;
1033   basic_block *bbs;
1034   int nbbs;
1035   gimple_stmt_iterator si;
1036   int j;
1037   vec<slp_instance> slp_instances;
1038   slp_instance instance;
1039   bool swapped;
1040
1041   if (!loop_vinfo)
1042     return;
1043
1044   loop = LOOP_VINFO_LOOP (loop_vinfo);
1045
1046   bbs = LOOP_VINFO_BBS (loop_vinfo);
1047   nbbs = clean_stmts ? loop->num_nodes : 0;
1048   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1049
1050   for (j = 0; j < nbbs; j++)
1051     {
1052       basic_block bb = bbs[j];
1053       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1054         free_stmt_vec_info (gsi_stmt (si));
1055
1056       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1057         {
1058           gimple stmt = gsi_stmt (si);
1059
1060           /* We may have broken canonical form by moving a constant
1061              into RHS1 of a commutative op.  Fix such occurrences.  */
1062           if (swapped && is_gimple_assign (stmt))
1063             {
1064               enum tree_code code = gimple_assign_rhs_code (stmt);
1065
1066               if ((code == PLUS_EXPR
1067                    || code == POINTER_PLUS_EXPR
1068                    || code == MULT_EXPR)
1069                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1070                 swap_ssa_operands (stmt,
1071                                    gimple_assign_rhs1_ptr (stmt),
1072                                    gimple_assign_rhs2_ptr (stmt));
1073             }
1074
1075           /* Free stmt_vec_info.  */
1076           free_stmt_vec_info (stmt);
1077           gsi_next (&si);
1078         }
1079     }
1080
1081   free (LOOP_VINFO_BBS (loop_vinfo));
1082   vect_destroy_datarefs (loop_vinfo, NULL);
1083   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1084   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1085   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1086   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1087   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1088   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1089     vect_free_slp_instance (instance);
1090
1091   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1092   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1093   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1094   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1095
1096   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1097   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1098
1099   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1100
1101   free (loop_vinfo);
1102   loop->aux = NULL;
1103 }
1104
1105
1106 /* Function vect_analyze_loop_1.
1107
1108    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1109    for it. The different analyses will record information in the
1110    loop_vec_info struct.  This is a subset of the analyses applied in
1111    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1112    that is now considered for (outer-loop) vectorization.  */
1113
1114 static loop_vec_info
1115 vect_analyze_loop_1 (struct loop *loop)
1116 {
1117   loop_vec_info loop_vinfo;
1118
1119   if (dump_enabled_p ())
1120     dump_printf_loc (MSG_NOTE, vect_location,
1121                      "===== analyze_loop_nest_1 =====\n");
1122
1123   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1124
1125   loop_vinfo = vect_analyze_loop_form (loop);
1126   if (!loop_vinfo)
1127     {
1128       if (dump_enabled_p ())
1129         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130                          "bad inner-loop form.\n");
1131       return NULL;
1132     }
1133
1134   return loop_vinfo;
1135 }
1136
1137
1138 /* Function vect_analyze_loop_form.
1139
1140    Verify that certain CFG restrictions hold, including:
1141    - the loop has a pre-header
1142    - the loop has a single entry and exit
1143    - the loop exit condition is simple enough, and the number of iterations
1144      can be analyzed (a countable loop).  */
1145
1146 loop_vec_info
1147 vect_analyze_loop_form (struct loop *loop)
1148 {
1149   loop_vec_info loop_vinfo;
1150   gcond *loop_cond;
1151   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1152   loop_vec_info inner_loop_vinfo = NULL;
1153
1154   if (dump_enabled_p ())
1155     dump_printf_loc (MSG_NOTE, vect_location,
1156                      "=== vect_analyze_loop_form ===\n");
1157
1158   /* Different restrictions apply when we are considering an inner-most loop,
1159      vs. an outer (nested) loop.
1160      (FORNOW. May want to relax some of these restrictions in the future).  */
1161
1162   if (!loop->inner)
1163     {
1164       /* Inner-most loop.  We currently require that the number of BBs is
1165          exactly 2 (the header and latch).  Vectorizable inner-most loops
1166          look like this:
1167
1168                         (pre-header)
1169                            |
1170                           header <--------+
1171                            | |            |
1172                            | +--> latch --+
1173                            |
1174                         (exit-bb)  */
1175
1176       if (loop->num_nodes != 2)
1177         {
1178           if (dump_enabled_p ())
1179             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180                              "not vectorized: control flow in loop.\n");
1181           return NULL;
1182         }
1183
1184       if (empty_block_p (loop->header))
1185         {
1186           if (dump_enabled_p ())
1187             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188                              "not vectorized: empty loop.\n");
1189           return NULL;
1190         }
1191     }
1192   else
1193     {
1194       struct loop *innerloop = loop->inner;
1195       edge entryedge;
1196
1197       /* Nested loop. We currently require that the loop is doubly-nested,
1198          contains a single inner loop, and the number of BBs is exactly 5.
1199          Vectorizable outer-loops look like this:
1200
1201                         (pre-header)
1202                            |
1203                           header <---+
1204                            |         |
1205                           inner-loop |
1206                            |         |
1207                           tail ------+
1208                            |
1209                         (exit-bb)
1210
1211          The inner-loop has the properties expected of inner-most loops
1212          as described above.  */
1213
1214       if ((loop->inner)->inner || (loop->inner)->next)
1215         {
1216           if (dump_enabled_p ())
1217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218                              "not vectorized: multiple nested loops.\n");
1219           return NULL;
1220         }
1221
1222       /* Analyze the inner-loop.  */
1223       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1224       if (!inner_loop_vinfo)
1225         {
1226           if (dump_enabled_p ())
1227             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228                              "not vectorized: Bad inner loop.\n");
1229           return NULL;
1230         }
1231
1232       if (!expr_invariant_in_loop_p (loop,
1233                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1234         {
1235           if (dump_enabled_p ())
1236             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237                              "not vectorized: inner-loop count not"
1238                              " invariant.\n");
1239           destroy_loop_vec_info (inner_loop_vinfo, true);
1240           return NULL;
1241         }
1242
1243       if (loop->num_nodes != 5)
1244         {
1245           if (dump_enabled_p ())
1246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247                              "not vectorized: control flow in loop.\n");
1248           destroy_loop_vec_info (inner_loop_vinfo, true);
1249           return NULL;
1250         }
1251
1252       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1253       entryedge = EDGE_PRED (innerloop->header, 0);
1254       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1255         entryedge = EDGE_PRED (innerloop->header, 1);
1256
1257       if (entryedge->src != loop->header
1258           || !single_exit (innerloop)
1259           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1260         {
1261           if (dump_enabled_p ())
1262             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263                              "not vectorized: unsupported outerloop form.\n");
1264           destroy_loop_vec_info (inner_loop_vinfo, true);
1265           return NULL;
1266         }
1267
1268       if (dump_enabled_p ())
1269         dump_printf_loc (MSG_NOTE, vect_location,
1270                          "Considering outer-loop vectorization.\n");
1271     }
1272
1273   if (!single_exit (loop)
1274       || EDGE_COUNT (loop->header->preds) != 2)
1275     {
1276       if (dump_enabled_p ())
1277         {
1278           if (!single_exit (loop))
1279             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1280                              "not vectorized: multiple exits.\n");
1281           else if (EDGE_COUNT (loop->header->preds) != 2)
1282             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1283                              "not vectorized: too many incoming edges.\n");
1284         }
1285       if (inner_loop_vinfo)
1286         destroy_loop_vec_info (inner_loop_vinfo, true);
1287       return NULL;
1288     }
1289
1290   /* We assume that the loop exit condition is at the end of the loop. i.e,
1291      that the loop is represented as a do-while (with a proper if-guard
1292      before the loop if needed), where the loop header contains all the
1293      executable statements, and the latch is empty.  */
1294   if (!empty_block_p (loop->latch)
1295       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1296     {
1297       if (dump_enabled_p ())
1298         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1299                          "not vectorized: latch block not empty.\n");
1300       if (inner_loop_vinfo)
1301         destroy_loop_vec_info (inner_loop_vinfo, true);
1302       return NULL;
1303     }
1304
1305   /* Make sure there exists a single-predecessor exit bb:  */
1306   if (!single_pred_p (single_exit (loop)->dest))
1307     {
1308       edge e = single_exit (loop);
1309       if (!(e->flags & EDGE_ABNORMAL))
1310         {
1311           split_loop_exit_edge (e);
1312           if (dump_enabled_p ())
1313             dump_printf (MSG_NOTE, "split exit edge.\n");
1314         }
1315       else
1316         {
1317           if (dump_enabled_p ())
1318             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1319                              "not vectorized: abnormal loop exit edge.\n");
1320           if (inner_loop_vinfo)
1321             destroy_loop_vec_info (inner_loop_vinfo, true);
1322           return NULL;
1323         }
1324     }
1325
1326   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1327                                     &number_of_iterationsm1);
1328   if (!loop_cond)
1329     {
1330       if (dump_enabled_p ())
1331         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1332                          "not vectorized: complicated exit condition.\n");
1333       if (inner_loop_vinfo)
1334         destroy_loop_vec_info (inner_loop_vinfo, true);
1335       return NULL;
1336     }
1337
1338   if (!number_of_iterations
1339       || chrec_contains_undetermined (number_of_iterations))
1340     {
1341       if (dump_enabled_p ())
1342         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343                          "not vectorized: number of iterations cannot be "
1344                          "computed.\n");
1345       if (inner_loop_vinfo)
1346         destroy_loop_vec_info (inner_loop_vinfo, true);
1347       return NULL;
1348     }
1349
1350   if (integer_zerop (number_of_iterations))
1351     {
1352       if (dump_enabled_p ())
1353         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354                          "not vectorized: number of iterations = 0.\n");
1355       if (inner_loop_vinfo)
1356         destroy_loop_vec_info (inner_loop_vinfo, true);
1357       return NULL;
1358     }
1359
1360   loop_vinfo = new_loop_vec_info (loop);
1361   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1362   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1363   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1364
1365   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1366     {
1367       if (dump_enabled_p ())
1368         {
1369           dump_printf_loc (MSG_NOTE, vect_location,
1370                            "Symbolic number of iterations is ");
1371           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1372           dump_printf (MSG_NOTE, "\n");
1373         }
1374     }
1375
1376   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1377
1378   /* CHECKME: May want to keep it around it in the future.  */
1379   if (inner_loop_vinfo)
1380     destroy_loop_vec_info (inner_loop_vinfo, false);
1381
1382   gcc_assert (!loop->aux);
1383   loop->aux = loop_vinfo;
1384   return loop_vinfo;
1385 }
1386
1387 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1388    statements update the vectorization factor.  */
1389
1390 static void
1391 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1392 {
1393   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1394   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1395   int nbbs = loop->num_nodes;
1396   unsigned int vectorization_factor;
1397   int i;
1398
1399   if (dump_enabled_p ())
1400     dump_printf_loc (MSG_NOTE, vect_location,
1401                      "=== vect_update_vf_for_slp ===\n");
1402
1403   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1404   gcc_assert (vectorization_factor != 0);
1405
1406   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1407      vectorization factor of the loop is the unrolling factor required by
1408      the SLP instances.  If that unrolling factor is 1, we say, that we
1409      perform pure SLP on loop - cross iteration parallelism is not
1410      exploited.  */
1411   bool only_slp_in_loop = true;
1412   for (i = 0; i < nbbs; i++)
1413     {
1414       basic_block bb = bbs[i];
1415       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1416            gsi_next (&si))
1417         {
1418           gimple stmt = gsi_stmt (si);
1419           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1420           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1421               && STMT_VINFO_RELATED_STMT (stmt_info))
1422             {
1423               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1424               stmt_info = vinfo_for_stmt (stmt);
1425             }
1426           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1427                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1428               && !PURE_SLP_STMT (stmt_info))
1429             /* STMT needs both SLP and loop-based vectorization.  */
1430             only_slp_in_loop = false;
1431         }
1432     }
1433
1434   if (only_slp_in_loop)
1435     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1436   else
1437     vectorization_factor
1438       = least_common_multiple (vectorization_factor,
1439                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1440
1441   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1442   if (dump_enabled_p ())
1443     dump_printf_loc (MSG_NOTE, vect_location,
1444                      "Updating vectorization factor to %d\n",
1445                      vectorization_factor);
1446 }
1447
1448 /* Function vect_analyze_loop_operations.
1449
1450    Scan the loop stmts and make sure they are all vectorizable.  */
1451
1452 static bool
1453 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1454 {
1455   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1456   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1457   int nbbs = loop->num_nodes;
1458   unsigned int vectorization_factor;
1459   int i;
1460   stmt_vec_info stmt_info;
1461   bool need_to_vectorize = false;
1462   int min_profitable_iters;
1463   int min_scalar_loop_bound;
1464   unsigned int th;
1465   bool ok;
1466   HOST_WIDE_INT max_niter;
1467   HOST_WIDE_INT estimated_niter;
1468   int min_profitable_estimate;
1469
1470   if (dump_enabled_p ())
1471     dump_printf_loc (MSG_NOTE, vect_location,
1472                      "=== vect_analyze_loop_operations ===\n");
1473
1474   for (i = 0; i < nbbs; i++)
1475     {
1476       basic_block bb = bbs[i];
1477
1478       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1479            gsi_next (&si))
1480         {
1481           gphi *phi = si.phi ();
1482           ok = true;
1483
1484           stmt_info = vinfo_for_stmt (phi);
1485           if (dump_enabled_p ())
1486             {
1487               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1488               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1489               dump_printf (MSG_NOTE, "\n");
1490             }
1491
1492           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1493              (i.e., a phi in the tail of the outer-loop).  */
1494           if (! is_loop_header_bb_p (bb))
1495             {
1496               /* FORNOW: we currently don't support the case that these phis
1497                  are not used in the outerloop (unless it is double reduction,
1498                  i.e., this phi is vect_reduction_def), cause this case
1499                  requires to actually do something here.  */
1500               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1501                    || STMT_VINFO_LIVE_P (stmt_info))
1502                   && STMT_VINFO_DEF_TYPE (stmt_info)
1503                      != vect_double_reduction_def)
1504                 {
1505                   if (dump_enabled_p ())
1506                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                                      "Unsupported loop-closed phi in "
1508                                      "outer-loop.\n");
1509                   return false;
1510                 }
1511
1512               /* If PHI is used in the outer loop, we check that its operand
1513                  is defined in the inner loop.  */
1514               if (STMT_VINFO_RELEVANT_P (stmt_info))
1515                 {
1516                   tree phi_op;
1517                   gimple op_def_stmt;
1518
1519                   if (gimple_phi_num_args (phi) != 1)
1520                     return false;
1521
1522                   phi_op = PHI_ARG_DEF (phi, 0);
1523                   if (TREE_CODE (phi_op) != SSA_NAME)
1524                     return false;
1525
1526                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1527                   if (gimple_nop_p (op_def_stmt)
1528                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1529                       || !vinfo_for_stmt (op_def_stmt))
1530                     return false;
1531
1532                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1533                         != vect_used_in_outer
1534                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1535                            != vect_used_in_outer_by_reduction)
1536                     return false;
1537                 }
1538
1539               continue;
1540             }
1541
1542           gcc_assert (stmt_info);
1543
1544           if (STMT_VINFO_LIVE_P (stmt_info))
1545             {
1546               /* FORNOW: not yet supported.  */
1547               if (dump_enabled_p ())
1548                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1549                                  "not vectorized: value used after loop.\n");
1550               return false;
1551             }
1552
1553           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1554               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1555             {
1556               /* A scalar-dependence cycle that we don't support.  */
1557               if (dump_enabled_p ())
1558                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1559                                  "not vectorized: scalar dependence cycle.\n");
1560               return false;
1561             }
1562
1563           if (STMT_VINFO_RELEVANT_P (stmt_info))
1564             {
1565               need_to_vectorize = true;
1566               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1567                 ok = vectorizable_induction (phi, NULL, NULL);
1568             }
1569
1570           if (!ok)
1571             {
1572               if (dump_enabled_p ())
1573                 {
1574                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1575                                    "not vectorized: relevant phi not "
1576                                    "supported: ");
1577                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1578                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1579                 }
1580               return false;
1581             }
1582         }
1583
1584       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1585            gsi_next (&si))
1586         {
1587           gimple stmt = gsi_stmt (si);
1588           if (!gimple_clobber_p (stmt)
1589               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1590             return false;
1591         }
1592     } /* bbs */
1593
1594   /* All operations in the loop are either irrelevant (deal with loop
1595      control, or dead), or only used outside the loop and can be moved
1596      out of the loop (e.g. invariants, inductions).  The loop can be
1597      optimized away by scalar optimizations.  We're better off not
1598      touching this loop.  */
1599   if (!need_to_vectorize)
1600     {
1601       if (dump_enabled_p ())
1602         dump_printf_loc (MSG_NOTE, vect_location,
1603                          "All the computation can be taken out of the loop.\n");
1604       if (dump_enabled_p ())
1605         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606                          "not vectorized: redundant loop. no profit to "
1607                          "vectorize.\n");
1608       return false;
1609     }
1610
1611   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1612   gcc_assert (vectorization_factor != 0);
1613
1614   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1615     dump_printf_loc (MSG_NOTE, vect_location,
1616                      "vectorization_factor = %d, niters = "
1617                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1618                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1619
1620   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1621        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1622       || ((max_niter = max_stmt_executions_int (loop)) != -1
1623           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1624     {
1625       if (dump_enabled_p ())
1626         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1627                          "not vectorized: iteration count too small.\n");
1628       if (dump_enabled_p ())
1629         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630                          "not vectorized: iteration count smaller than "
1631                          "vectorization factor.\n");
1632       return false;
1633     }
1634
1635   /* Analyze cost.  Decide if worth while to vectorize.  */
1636
1637   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1638                                       &min_profitable_estimate);
1639   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1640
1641   if (min_profitable_iters < 0)
1642     {
1643       if (dump_enabled_p ())
1644         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645                          "not vectorized: vectorization not profitable.\n");
1646       if (dump_enabled_p ())
1647         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648                          "not vectorized: vector version will never be "
1649                          "profitable.\n");
1650       return false;
1651     }
1652
1653   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1654                             * vectorization_factor) - 1);
1655
1656
1657   /* Use the cost model only if it is more conservative than user specified
1658      threshold.  */
1659
1660   th = (unsigned) min_scalar_loop_bound;
1661   if (min_profitable_iters
1662       && (!min_scalar_loop_bound
1663           || min_profitable_iters > min_scalar_loop_bound))
1664     th = (unsigned) min_profitable_iters;
1665
1666   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1667
1668   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1669       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1670     {
1671       if (dump_enabled_p ())
1672         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673                          "not vectorized: vectorization not profitable.\n");
1674       if (dump_enabled_p ())
1675         dump_printf_loc (MSG_NOTE, vect_location,
1676                          "not vectorized: iteration count smaller than user "
1677                          "specified loop bound parameter or minimum profitable "
1678                          "iterations (whichever is more conservative).\n");
1679       return false;
1680     }
1681
1682   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1683       && ((unsigned HOST_WIDE_INT) estimated_niter
1684           <= MAX (th, (unsigned)min_profitable_estimate)))
1685     {
1686       if (dump_enabled_p ())
1687         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1688                          "not vectorized: estimated iteration count too "
1689                          "small.\n");
1690       if (dump_enabled_p ())
1691         dump_printf_loc (MSG_NOTE, vect_location,
1692                          "not vectorized: estimated iteration count smaller "
1693                          "than specified loop bound parameter or minimum "
1694                          "profitable iterations (whichever is more "
1695                          "conservative).\n");
1696       return false;
1697     }
1698
1699   return true;
1700 }
1701
1702
1703 /* Function vect_analyze_loop_2.
1704
1705    Run the vectorization analyses on the loop described by LOOP_VINFO,
1706    which the caller has already created; the analyses record their
1707    information in LOOP_VINFO.  Return true if all succeed, else false.  */
1708 static bool
1709 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1710 {
1711   bool ok;
1712   int max_vf = MAX_VECTORIZATION_FACTOR;  /* Upper bound on the VF.  */
1713   int min_vf = 2;  /* Lower bound on the VF.  */
1714   unsigned int th;
1715   unsigned int n_stmts = 0;  /* Handed from data-ref analysis to SLP.  */
1716
1717   /* Find all data references in the loop (which correspond to vdefs/vuses)
1718      and analyze their evolution in the loop.  Also adjust the minimal
1719      vectorization factor according to the loads and stores.
1720
1721      FORNOW: Handle only simple, array references, which
1722      alignment can be forced, and aligned pointer-references.  */
1723
1724   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1725   if (!ok)
1726     {
1727       if (dump_enabled_p ())
1728         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729                          "bad data references.\n");
1730       return false;
1731     }
1732
1733   /* Classify all cross-iteration scalar data-flow cycles.
1734      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1735
1736   vect_analyze_scalar_cycles (loop_vinfo);
1737
1738   vect_pattern_recog (loop_vinfo, NULL);
1739
1740   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1741
1742   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1743      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1744
1745   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1746   if (!ok)
1747     {
1748       if (dump_enabled_p ())
1749         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1750                          "bad data access.\n");
1751       return false;
1752     }
1753
1754   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1755
1756   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1757   if (!ok)
1758     {
1759       if (dump_enabled_p ())
1760         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761                          "unexpected pattern.\n");
1762       return false;
1763     }
1764
1765   /* Analyze data dependences between the data-refs in the loop
1766      and adjust the maximum vectorization factor according to
1767      the dependences.
1768      FORNOW: fail at the first data dependence that we encounter.  */
1769
1770   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1771   if (!ok
1772       || max_vf < min_vf)  /* The two VF bounds exclude each other.  */
1773     {
1774       if (dump_enabled_p ())
1775             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776                              "bad data dependence.\n");
1777       return false;
1778     }
1779
1780   ok = vect_determine_vectorization_factor (loop_vinfo);
1781   if (!ok)
1782     {
1783       if (dump_enabled_p ())
1784         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785                          "can't determine vectorization factor.\n");
1786       return false;
1787     }
1788   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))  /* VF above MAX_VF.  */
1789     {
1790       if (dump_enabled_p ())
1791         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1792                          "bad data dependence.\n");
1793       return false;
1794     }
1795
1796   /* Analyze the alignment of the data-refs in the loop.
1797      Fail if a data reference is found that cannot be vectorized.  */
1798
1799   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1800   if (!ok)
1801     {
1802       if (dump_enabled_p ())
1803         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1804                          "bad data alignment.\n");
1805       return false;
1806     }
1807
1808   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1809      It is important to call pruning after vect_analyze_data_ref_accesses,
1810      since we use grouping information gathered by interleaving analysis.  */
1811   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1812   if (!ok)
1813     {
1814       if (dump_enabled_p ())
1815         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1816                          "number of versioning for alias "
1817                          "run-time tests exceeds %d "
1818                          "(--param vect-max-version-for-alias-checks)\n",
1819                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1820       return false;
1821     }
1822
1823   /* This pass will decide on using loop versioning and/or loop peeling in
1824      order to enhance the alignment of data references in the loop.  */
1825
1826   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1827   if (!ok)
1828     {
1829       if (dump_enabled_p ())
1830         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831                          "bad data alignment.\n");
1832       return false;
1833     }
1834
1835   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1836   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1837   if (ok)
1838     {
1839       /* If there are any SLP instances mark them as pure_slp.  */
1840       if (vect_make_slp_decision (loop_vinfo))
1841         {
1842           /* Find stmts that need to be both vectorized and SLPed.  */
1843           vect_detect_hybrid_slp (loop_vinfo);
1844
1845           /* Update the vectorization factor based on the SLP decision.  */
1846           vect_update_vf_for_slp (loop_vinfo);
1847
1848           /* Analyze operations in the SLP instances.  Note this may
1849              remove unsupported SLP instances which makes the above
1850              SLP kind detection invalid.  */
1851           unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1852           vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1853                                        LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1854           if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1855             return false;  /* The instance count changed; give up.  */
1856         }
1857     }
1858   else
1859     return false;  /* vect_analyze_slp failed.  */
1860
1861   /* Scan all the remaining operations in the loop that are not subject
1862      to SLP and make sure they are vectorizable.  */
1863   ok = vect_analyze_loop_operations (loop_vinfo);
1864   if (!ok)
1865     {
1866       if (dump_enabled_p ())
1867         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868                          "bad operation or unsupported loop bound.\n");
1869       return false;
1870     }
1871
1872   /* Decide whether we need to create an epilogue loop to handle
1873      remaining scalar iterations.  */
1874   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1875         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1876        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);  /* Rounded down to k * VF.  */
1877
1878   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1879       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1880     {
1881       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1882                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1883           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1884         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1885     }
1886   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1887            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1888                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1889                /* In case of versioning, check if the maximum number of
1890                   iterations is greater than th.  If they are identical,
1891                   the epilogue is unnecessary.  */
1892                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1893                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1894                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1895                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1896     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1897
1898   /* If an epilogue loop is required make sure we can create one.  */
1899   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1900       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1901     {
1902       if (dump_enabled_p ())
1903         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1904       if (!vect_can_advance_ivs_p (loop_vinfo)
1905           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1906                                            single_exit (LOOP_VINFO_LOOP
1907                                                          (loop_vinfo))))
1908         {
1909           if (dump_enabled_p ())
1910             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1911                              "not vectorized: can't create required "
1912                              "epilog loop\n");
1913           return false;
1914         }
1915     }
1916
1917   return true;  /* Every analysis above succeeded.  */
1918 }
1919
1920 /* Function vect_analyze_loop.
1921
1922    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1923    for it in which the analyses record their information.  Return that
1924    struct if the analyses succeed for some vector size, else NULL.  */
1925 loop_vec_info
1926 vect_analyze_loop (struct loop *loop)
1927 {
1928   loop_vec_info loop_vinfo;
1929   unsigned int vector_sizes;  /* Bit mask of sizes not yet tried.  */
1930
1931   /* Autodetect first vector size we try.  */
1932   current_vector_size = 0;
1933   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1934
1935   if (dump_enabled_p ())
1936     dump_printf_loc (MSG_NOTE, vect_location,
1937                      "===== analyze_loop_nest =====\n");
1938
1939   if (loop_outer (loop)
1940       && loop_vec_info_for_loop (loop_outer (loop))
1941       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1942     {
1943       if (dump_enabled_p ())
1944         dump_printf_loc (MSG_NOTE, vect_location,
1945                          "outer-loop already vectorized.\n");
1946       return NULL;
1947     }
1948
1949   while (1)  /* One iteration per vector size attempted.  */
1950     {
1951       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1952       loop_vinfo = vect_analyze_loop_form (loop);
1953       if (!loop_vinfo)
1954         {
1955           if (dump_enabled_p ())
1956             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957                              "bad loop form.\n");
1958           return NULL;
1959         }
1960
1961       if (vect_analyze_loop_2 (loop_vinfo))
1962         {
1963           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1964
1965           return loop_vinfo;
1966         }
1967
1968       destroy_loop_vec_info (loop_vinfo, true);  /* Analysis failed.  */
1969
1970       vector_sizes &= ~current_vector_size;  /* Drop the size just tried.  */
1971       if (vector_sizes == 0
1972           || current_vector_size == 0)
1973         return NULL;  /* No untried size is left, or no size was set.  */
1974
1975       /* Try the next biggest vector size.  */
1976       current_vector_size = 1 << floor_log2 (vector_sizes);
1977       if (dump_enabled_p ())
1978         dump_printf_loc (MSG_NOTE, vect_location,
1979                          "***** Re-trying analysis with "
1980                          "vector size %d\n", current_vector_size);
1981     }
1982 }
1983
1984
1985 /* Function reduction_code_for_scalar_code
1986
1987    Input:
1988    CODE - tree_code of a reduction operation.
1989
1990    Output:
1991    REDUC_CODE - the corresponding tree-code to be used to reduce the
1992       vector of partial results into a single scalar result, or ERROR_MARK
1993       if the operation is a supported reduction operation, but does not have
1994       such a tree-code.  It is left untouched when FALSE is returned.
1995
1996    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1997
1998 static bool
1999 reduction_code_for_scalar_code (enum tree_code code,
2000                                 enum tree_code *reduc_code)
2001 {
2002   switch (code)
2003     {
2004       case MAX_EXPR:
2005         *reduc_code = REDUC_MAX_EXPR;
2006         return true;
2007
2008       case MIN_EXPR:
2009         *reduc_code = REDUC_MIN_EXPR;
2010         return true;
2011
2012       case PLUS_EXPR:
2013         *reduc_code = REDUC_PLUS_EXPR;
2014         return true;
2015
2016       case MULT_EXPR:
2017       case MINUS_EXPR:
2018       case BIT_IOR_EXPR:
2019       case BIT_XOR_EXPR:
2020       case BIT_AND_EXPR:
2021         *reduc_code = ERROR_MARK;  /* Supported, but no reduction code.  */
2022         return true;
2023
2024       default:
2025        return false;  /* Any other CODE is not handled as a reduction.  */
2026     }
2027 }
2028
2029
2030 /* Error reporting helper for vect_is_simple_reduction below.  Print MSG at
2031    vect_location, then GIMPLE statement STMT, then a newline; MSG_TYPE is
2032    passed as the first argument of each of the three dump calls.  */
2033 static void
2034 report_vect_op (int msg_type, gimple stmt, const char *msg)
2035 {
2036   dump_printf_loc (msg_type, vect_location, "%s", msg);
2037   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2038   dump_printf (msg_type, "\n");
2039 }
2040
2041
2042 /* Detect SLP reduction of the form:
2043
2044    #a1 = phi <a5, a0>
2045    a2 = operation (a1)
2046    a3 = operation (a2)
2047    a4 = operation (a3)
2048    a5 = operation (a4)
2049
2050    #a = phi <a5>
2051
2052    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2053    FIRST_STMT is the first reduction stmt in the chain
2054    (a2 = operation (a1)).
2055    LOOP_INFO is the loop_vec_info in which a detected chain is recorded.
2056    Return TRUE if a reduction chain was detected.  */
2057
2058 static bool
2059 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2060 {
2061   struct loop *loop = (gimple_bb (phi))->loop_father;
2062   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2063   enum tree_code code;
2064   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2065   stmt_vec_info use_stmt_info, current_stmt_info;
2066   tree lhs;
2067   imm_use_iterator imm_iter;
2068   use_operand_p use_p;
2069   int nloop_uses, size = 0, n_out_of_loop_uses;
2070   bool found = false;
2071
2072   if (loop != vect_loop)
2073     return false;  /* PHI's loop is not the loop of LOOP_INFO.  */
2074
2075   lhs = PHI_RESULT (phi);
2076   code = gimple_assign_rhs_code (first_stmt);
2077   while (1)  /* Follow the uses, starting from the result of PHI.  */
2078     {
2079       nloop_uses = 0;
2080       n_out_of_loop_uses = 0;
2081       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2082         {
2083           gimple use_stmt = USE_STMT (use_p);
2084           if (is_gimple_debug (use_stmt))
2085             continue;
2086
2087           /* Check if we got back to the reduction phi.  */
2088           if (use_stmt == phi)
2089             {
2090               loop_use_stmt = use_stmt;
2091               found = true;
2092               break;
2093             }
2094
2095           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2096             {
2097               loop_use_stmt = use_stmt;
2098               nloop_uses++;
2099             }
2100            else
2101              n_out_of_loop_uses++;
2102
2103            /* There can be either a single use in the loop or two uses in
2104               phi nodes.  */
2105            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2106              return false;
2107         }
2108
2109       if (found)
2110         break;
2111
2112       /* We reached a statement with no loop uses.  */
2113       if (nloop_uses == 0)
2114         return false;
2115
2116       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2117       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2118         return false;
2119
2120       if (!is_gimple_assign (loop_use_stmt)
2121           || code != gimple_assign_rhs_code (loop_use_stmt)
2122           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2123         return false;
2124
2125       /* Insert USE_STMT into reduction chain.  NOTE(review): not undone on failure.  */
2126       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2127       if (current_stmt)
2128         {
2129           current_stmt_info = vinfo_for_stmt (current_stmt);
2130           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2131           GROUP_FIRST_ELEMENT (use_stmt_info)
2132             = GROUP_FIRST_ELEMENT (current_stmt_info);
2133         }
2134       else
2135         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2136
2137       lhs = gimple_assign_lhs (loop_use_stmt);  /* Next value to follow.  */
2138       current_stmt = loop_use_stmt;
2139       size++;
2140    }
2141
2142   if (!found || loop_use_stmt != phi || size < 2)
2143     return false;
2144
2145   /* Swap the operands, if needed, to make the reduction operand be the second
2146      operand.  */
2147   lhs = PHI_RESULT (phi);
2148   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2149   while (next_stmt)
2150     {
2151       if (gimple_assign_rhs2 (next_stmt) == lhs)  /* No swap needed.  */
2152         {
2153           tree op = gimple_assign_rhs1 (next_stmt);
2154           gimple def_stmt = NULL;
2155
2156           if (TREE_CODE (op) == SSA_NAME)
2157             def_stmt = SSA_NAME_DEF_STMT (op);
2158
2159           /* Check that the other def is either defined in the loop
2160              ("vect_internal_def"), or it's an induction (defined by a
2161              loop-header phi-node).  */
2162           if (def_stmt
2163               && gimple_bb (def_stmt)
2164               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2165               && (is_gimple_assign (def_stmt)
2166                   || is_gimple_call (def_stmt)
2167                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2168                            == vect_induction_def
2169                   || (gimple_code (def_stmt) == GIMPLE_PHI
2170                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2171                                   == vect_internal_def
2172                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2173             {
2174               lhs = gimple_assign_lhs (next_stmt);
2175               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2176               continue;
2177             }
2178
2179           return false;
2180         }
2181       else
2182         {
2183           tree op = gimple_assign_rhs2 (next_stmt);
2184           gimple def_stmt = NULL;
2185
2186           if (TREE_CODE (op) == SSA_NAME)
2187             def_stmt = SSA_NAME_DEF_STMT (op);
2188
2189           /* Check that the other def is either defined in the loop
2190             ("vect_internal_def"), or it's an induction (defined by a
2191             loop-header phi-node).  */
2192           if (def_stmt
2193               && gimple_bb (def_stmt)
2194               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2195               && (is_gimple_assign (def_stmt)
2196                   || is_gimple_call (def_stmt)
2197                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2198                               == vect_induction_def
2199                   || (gimple_code (def_stmt) == GIMPLE_PHI
2200                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2201                                   == vect_internal_def
2202                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2203             {
2204               if (dump_enabled_p ())
2205                 {
2206                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2207                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2208                   dump_printf (MSG_NOTE, "\n");
2209                 }
2210
2211               swap_ssa_operands (next_stmt,
2212                                  gimple_assign_rhs1_ptr (next_stmt),
2213                                  gimple_assign_rhs2_ptr (next_stmt));
2214               update_stmt (next_stmt);
2215
2216               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2217                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2218             }
2219           else
2220             return false;
2221         }
2222
2223       lhs = gimple_assign_lhs (next_stmt);
2224       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2225     }
2226
2227   /* Save the chain for further analysis in SLP detection.  */
2228   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2229   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2230   GROUP_SIZE (vinfo_for_stmt (first)) = size;  /* Number of chained stmts.  */
2231
2232   return true;
2233 }
2234
2235
2236 /* Function vect_is_simple_reduction_1
2237
2238    (1) Detect a cross-iteration def-use cycle that represents a simple
2239    reduction computation.  We look for the following pattern:
2240
2241    loop_header:
2242      a1 = phi < a0, a2 >
2243      a3 = ...
2244      a2 = operation (a3, a1)
2245
2246    or
2247
2248    a3 = ...
2249    loop_header:
2250      a1 = phi < a0, a2 >
2251      a2 = operation (a3, a1)
2252
2253    such that:
2254    1. operation is commutative and associative and it is safe to
2255       change the order of the computation (if CHECK_REDUCTION is true)
2256    2. no uses for a2 in the loop (a2 is used out of the loop)
2257    3. no uses of a1 in the loop besides the reduction operation
2258    4. no uses of a1 outside the loop.
2259
2260    Conditions 1,4 are tested here.
2261    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2262
2263    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2264    nested cycles, if CHECK_REDUCTION is false.
2265
2266    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2267    reductions:
2268
2269      a1 = phi < a0, a2 >
2270      inner loop (def of a3)
2271      a2 = phi < a3 >
2272
2273    If MODIFY is true it also tries to rework the code in-place to enable
2274    detection of more reduction patterns.  For the time being we rewrite
2275    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2276 */
2277
2278 static gimple
2279 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2280                             bool check_reduction, bool *double_reduc,
2281                             bool modify)
2282 {
2283   struct loop *loop = (gimple_bb (phi))->loop_father;
2284   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2285   edge latch_e = loop_latch_edge (loop);
2286   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2287   gimple def_stmt, def1 = NULL, def2 = NULL;
2288   enum tree_code orig_code, code;
2289   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2290   tree type;
2291   int nloop_uses;
2292   tree name;
2293   imm_use_iterator imm_iter;
2294   use_operand_p use_p;
2295   bool phi_def;
2296
2297   *double_reduc = false;
2298
2299   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2300      otherwise, we assume outer loop vectorization.  */
2301   gcc_assert ((check_reduction && loop == vect_loop)
2302               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2303
2304   name = PHI_RESULT (phi);
2305   /* ???  If there are no uses of the PHI result the inner loop reduction
2306      won't be detected as possibly double-reduction by vectorizable_reduction
2307      because that tries to walk the PHI arg from the preheader edge which
2308      can be constant.  See PR60382.  */
2309   if (has_zero_uses (name))
2310     return NULL;
2311   nloop_uses = 0;
2312   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2313     {
2314       gimple use_stmt = USE_STMT (use_p);
2315       if (is_gimple_debug (use_stmt))
2316         continue;
2317
2318       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2319         {
2320           if (dump_enabled_p ())
2321             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2322                              "intermediate value used outside loop.\n");
2323
2324           return NULL;
2325         }
2326
2327       nloop_uses++;
2328       if (nloop_uses > 1)
2329         {
2330           if (dump_enabled_p ())
2331             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332                              "reduction used in loop.\n");
2333           return NULL;
2334         }
2335     }
2336
2337   if (TREE_CODE (loop_arg) != SSA_NAME)
2338     {
2339       if (dump_enabled_p ())
2340         {
2341           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2342                            "reduction: not ssa_name: ");
2343           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2344           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2345         }
2346       return NULL;
2347     }
2348
2349   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2350   if (!def_stmt)
2351     {
2352       if (dump_enabled_p ())
2353         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354                          "reduction: no def_stmt.\n");
2355       return NULL;
2356     }
2357
2358   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2359     {
2360       if (dump_enabled_p ())
2361         {
2362           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2363           dump_printf (MSG_NOTE, "\n");
2364         }
2365       return NULL;
2366     }
2367
2368   if (is_gimple_assign (def_stmt))
2369     {
2370       name = gimple_assign_lhs (def_stmt);
2371       phi_def = false;
2372     }
2373   else
2374     {
2375       name = PHI_RESULT (def_stmt);
2376       phi_def = true;
2377     }
2378
2379   nloop_uses = 0;
2380   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2381     {
2382       gimple use_stmt = USE_STMT (use_p);
2383       if (is_gimple_debug (use_stmt))
2384         continue;
2385       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2386         nloop_uses++;
2387       if (nloop_uses > 1)
2388         {
2389           if (dump_enabled_p ())
2390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2391                              "reduction used in loop.\n");
2392           return NULL;
2393         }
2394     }
2395
2396   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2397      defined in the inner loop.  */
2398   if (phi_def)
2399     {
2400       op1 = PHI_ARG_DEF (def_stmt, 0);
2401
2402       if (gimple_phi_num_args (def_stmt) != 1
2403           || TREE_CODE (op1) != SSA_NAME)
2404         {
2405           if (dump_enabled_p ())
2406             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2407                              "unsupported phi node definition.\n");
2408
2409           return NULL;
2410         }
2411
2412       def1 = SSA_NAME_DEF_STMT (op1);
2413       if (gimple_bb (def1)
2414           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2415           && loop->inner
2416           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2417           && is_gimple_assign (def1))
2418         {
2419           if (dump_enabled_p ())
2420             report_vect_op (MSG_NOTE, def_stmt,
2421                             "detected double reduction: ");
2422
2423           *double_reduc = true;
2424           return def_stmt;
2425         }
2426
2427       return NULL;
2428     }
2429
2430   code = orig_code = gimple_assign_rhs_code (def_stmt);
2431
2432   /* We can handle "res -= x[i]", which is non-associative by
2433      simply rewriting this into "res += -x[i]".  Avoid changing
2434      gimple instruction for the first simple tests and only do this
2435      if we're allowed to change code at all.  */
2436   if (code == MINUS_EXPR
2437       && modify
2438       && (op1 = gimple_assign_rhs1 (def_stmt))
2439       && TREE_CODE (op1) == SSA_NAME
2440       && SSA_NAME_DEF_STMT (op1) == phi)
2441     code = PLUS_EXPR;
2442
2443   if (check_reduction
2444       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2445     {
2446       if (dump_enabled_p ())
2447         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2448                         "reduction: not commutative/associative: ");
2449       return NULL;
2450     }
2451
2452   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2453     {
2454       if (code != COND_EXPR)
2455         {
2456           if (dump_enabled_p ())
2457             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2458                             "reduction: not binary operation: ");
2459
2460           return NULL;
2461         }
2462
2463       op3 = gimple_assign_rhs1 (def_stmt);
2464       if (COMPARISON_CLASS_P (op3))
2465         {
2466           op4 = TREE_OPERAND (op3, 1);
2467           op3 = TREE_OPERAND (op3, 0);
2468         }
2469
2470       op1 = gimple_assign_rhs2 (def_stmt);
2471       op2 = gimple_assign_rhs3 (def_stmt);
2472
2473       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2474         {
2475           if (dump_enabled_p ())
2476             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2477                             "reduction: uses not ssa_names: ");
2478
2479           return NULL;
2480         }
2481     }
2482   else
2483     {
2484       op1 = gimple_assign_rhs1 (def_stmt);
2485       op2 = gimple_assign_rhs2 (def_stmt);
2486
2487       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2488         {
2489           if (dump_enabled_p ())
2490             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2491                             "reduction: uses not ssa_names: ");
2492
2493           return NULL;
2494         }
2495    }
2496
2497   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2498   if ((TREE_CODE (op1) == SSA_NAME
2499        && !types_compatible_p (type,TREE_TYPE (op1)))
2500       || (TREE_CODE (op2) == SSA_NAME
2501           && !types_compatible_p (type, TREE_TYPE (op2)))
2502       || (op3 && TREE_CODE (op3) == SSA_NAME
2503           && !types_compatible_p (type, TREE_TYPE (op3)))
2504       || (op4 && TREE_CODE (op4) == SSA_NAME
2505           && !types_compatible_p (type, TREE_TYPE (op4))))
2506     {
2507       if (dump_enabled_p ())
2508         {
2509           dump_printf_loc (MSG_NOTE, vect_location,
2510                            "reduction: multiple types: operation type: ");
2511           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2512           dump_printf (MSG_NOTE, ", operands types: ");
2513           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2514                              TREE_TYPE (op1));
2515           dump_printf (MSG_NOTE, ",");
2516           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2517                              TREE_TYPE (op2));
2518           if (op3)
2519             {
2520               dump_printf (MSG_NOTE, ",");
2521               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2522                                  TREE_TYPE (op3));
2523             }
2524
2525           if (op4)
2526             {
2527               dump_printf (MSG_NOTE, ",");
2528               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2529                                  TREE_TYPE (op4));
2530             }
2531           dump_printf (MSG_NOTE, "\n");
2532         }
2533
2534       return NULL;
2535     }
2536
2537   /* Check that it's ok to change the order of the computation.
2538      Generally, when vectorizing a reduction we change the order of the
2539      computation.  This may change the behavior of the program in some
2540      cases, so we need to check that this is ok.  One exception is when
2541      vectorizing an outer-loop: the inner-loop is executed sequentially,
2542      and therefore vectorizing reductions in the inner-loop during
2543      outer-loop vectorization is safe.  */
2544
2545   /* CHECKME: check for !flag_finite_math_only too?  */
2546   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2547       && check_reduction)
2548     {
2549       /* Changing the order of operations changes the semantics.  */
2550       if (dump_enabled_p ())
2551         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2552                         "reduction: unsafe fp math optimization: ");
2553       return NULL;
2554     }
2555   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2556            && check_reduction)
2557     {
2558       /* Changing the order of operations changes the semantics.  */
2559       if (dump_enabled_p ())
2560         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2561                         "reduction: unsafe int math optimization: ");
2562       return NULL;
2563     }
2564   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2565     {
2566       /* Changing the order of operations changes the semantics.  */
2567       if (dump_enabled_p ())
2568         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2569                         "reduction: unsafe fixed-point math optimization: ");
2570       return NULL;
2571     }
2572
2573   /* If we detected "res -= x[i]" earlier, rewrite it into
2574      "res += -x[i]" now.  If this turns out to be useless reassoc
2575      will clean it up again.  */
2576   if (orig_code == MINUS_EXPR)
2577     {
2578       tree rhs = gimple_assign_rhs2 (def_stmt);
2579       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2580       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2581       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2582       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2583                                                           loop_info, NULL));
2584       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2585       gimple_assign_set_rhs2 (def_stmt, negrhs);
2586       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2587       update_stmt (def_stmt);
2588     }
2589
2590   /* Reduction is safe. We're dealing with one of the following:
2591      1) integer arithmetic and no trapv
2592      2) floating point arithmetic, and special flags permit this optimization
2593      3) nested cycle (i.e., outer loop vectorization).  */
2594   if (TREE_CODE (op1) == SSA_NAME)
2595     def1 = SSA_NAME_DEF_STMT (op1);
2596
2597   if (TREE_CODE (op2) == SSA_NAME)
2598     def2 = SSA_NAME_DEF_STMT (op2);
2599
2600   if (code != COND_EXPR
2601       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2602     {
2603       if (dump_enabled_p ())
2604         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2605       return NULL;
2606     }
2607
2608   /* Check that one def is the reduction def, defined by PHI,
2609      the other def is either defined in the loop ("vect_internal_def"),
2610      or it's an induction (defined by a loop-header phi-node).  */
2611
2612   if (def2 && def2 == phi
2613       && (code == COND_EXPR
2614           || !def1 || gimple_nop_p (def1)
2615           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2616           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2617               && (is_gimple_assign (def1)
2618                   || is_gimple_call (def1)
2619                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2620                       == vect_induction_def
2621                   || (gimple_code (def1) == GIMPLE_PHI
2622                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2623                           == vect_internal_def
2624                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2625     {
2626       if (dump_enabled_p ())
2627         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2628       return def_stmt;
2629     }
2630
2631   if (def1 && def1 == phi
2632       && (code == COND_EXPR
2633           || !def2 || gimple_nop_p (def2)
2634           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2635           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2636               && (is_gimple_assign (def2)
2637                   || is_gimple_call (def2)
2638                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2639                       == vect_induction_def
2640                   || (gimple_code (def2) == GIMPLE_PHI
2641                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2642                           == vect_internal_def
2643                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2644     {
2645       if (check_reduction)
2646         {
2647           /* Swap operands (just for simplicity - so that the rest of the code
2648              can assume that the reduction variable is always the last (second)
2649              argument).  */
2650           if (dump_enabled_p ())
2651             report_vect_op (MSG_NOTE, def_stmt,
2652                             "detected reduction: need to swap operands: ");
2653
2654           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2655                              gimple_assign_rhs2_ptr (def_stmt));
2656
2657           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2658             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2659         }
2660       else
2661         {
2662           if (dump_enabled_p ())
2663             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2664         }
2665
2666       return def_stmt;
2667     }
2668
2669   /* Try to find SLP reduction chain.  */
2670   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2671     {
2672       if (dump_enabled_p ())
2673         report_vect_op (MSG_NOTE, def_stmt,
2674                         "reduction: detected reduction chain: ");
2675
2676       return def_stmt;
2677     }
2678
2679   if (dump_enabled_p ())
2680     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2681                     "reduction: unknown pattern: ");
2682
2683   return NULL;
2684 }
2685
2686 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2687    in-place.  Arguments as there.  */
2688
2689 static gimple
2690 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2691                           bool check_reduction, bool *double_reduc)
2692 {
2693   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2694                                      double_reduc, false);
2695 }
2696
2697 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2698    in-place if it enables detection of more reductions.  Arguments
2699    as there.  */
2700
2701 gimple
2702 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2703                           bool check_reduction, bool *double_reduc)
2704 {
2705   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2706                                      double_reduc, true);
2707 }
2708
2709 /* Calculate the cost of one scalar iteration of the loop.  */
2710 int
2711 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2712                                        stmt_vector_for_cost *scalar_cost_vec)
2713 {
2714   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2715   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2716   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2717   int innerloop_iters, i;
2718
2719   /* Count statements in scalar loop.  Using this as scalar cost for a single
2720      iteration for now.
2721
2722      TODO: Add outer loop support.
2723
2724      TODO: Consider assigning different costs to different scalar
2725      statements.  */
2726
2727   /* FORNOW.  */
2728   innerloop_iters = 1;
2729   if (loop->inner)
2730     innerloop_iters = 50; /* FIXME */
2731
2732   for (i = 0; i < nbbs; i++)
2733     {
2734       gimple_stmt_iterator si;
2735       basic_block bb = bbs[i];
2736
2737       if (bb->loop_father == loop->inner)
2738         factor = innerloop_iters;
2739       else
2740         factor = 1;
2741
2742       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2743         {
2744           gimple stmt = gsi_stmt (si);
2745           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2746
2747           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2748             continue;
2749
2750           /* Skip stmts that are not vectorized inside the loop.  */
2751           if (stmt_info
2752               && !STMT_VINFO_RELEVANT_P (stmt_info)
2753               && (!STMT_VINFO_LIVE_P (stmt_info)
2754                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2755               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2756             continue;
2757
2758           vect_cost_for_stmt kind;
2759           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2760             {
2761               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2762                kind = scalar_load;
2763              else
2764                kind = scalar_store;
2765             }
2766           else
2767             kind = scalar_stmt;
2768
2769           scalar_single_iter_cost
2770             += record_stmt_cost (scalar_cost_vec, factor, kind,
2771                                  NULL, 0, vect_prologue);
2772         }
2773     }
2774   return scalar_single_iter_cost;
2775 }
2776
2777 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2778 int
2779 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2780                              int *peel_iters_epilogue,
2781                              stmt_vector_for_cost *scalar_cost_vec,
2782                              stmt_vector_for_cost *prologue_cost_vec,
2783                              stmt_vector_for_cost *epilogue_cost_vec)
2784 {
2785   int retval = 0;
2786   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2787
2788   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2789     {
2790       *peel_iters_epilogue = vf/2;
2791       if (dump_enabled_p ())
2792         dump_printf_loc (MSG_NOTE, vect_location,
2793                          "cost model: epilogue peel iters set to vf/2 "
2794                          "because loop iterations are unknown .\n");
2795
2796       /* If peeled iterations are known but number of scalar loop
2797          iterations are unknown, count a taken branch per peeled loop.  */
2798       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2799                                  NULL, 0, vect_prologue);
2800       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2801                                  NULL, 0, vect_epilogue);
2802     }
2803   else
2804     {
2805       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2806       peel_iters_prologue = niters < peel_iters_prologue ?
2807                             niters : peel_iters_prologue;
2808       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2809       /* If we need to peel for gaps, but no peeling is required, we have to
2810          peel VF iterations.  */
2811       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2812         *peel_iters_epilogue = vf;
2813     }
2814
2815   stmt_info_for_cost *si;
2816   int j;
2817   if (peel_iters_prologue)
2818     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2819       retval += record_stmt_cost (prologue_cost_vec,
2820                                   si->count * peel_iters_prologue,
2821                                   si->kind, NULL, si->misalign,
2822                                   vect_prologue);
2823   if (*peel_iters_epilogue)
2824     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2825       retval += record_stmt_cost (epilogue_cost_vec,
2826                                   si->count * *peel_iters_epilogue,
2827                                   si->kind, NULL, si->misalign,
2828                                   vect_epilogue);
2829
2830   return retval;
2831 }
2832
2833 /* Function vect_estimate_min_profitable_iters
2834
2835    Return the number of iterations required for the vector version of the
2836    loop to be profitable relative to the cost of the scalar version of the
2837    loop.  */
2838
2839 static void
2840 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2841                                     int *ret_min_profitable_niters,
2842                                     int *ret_min_profitable_estimate)
2843 {
2844   int min_profitable_iters;
2845   int min_profitable_estimate;
2846   int peel_iters_prologue;
2847   int peel_iters_epilogue;
2848   unsigned vec_inside_cost = 0;
2849   int vec_outside_cost = 0;
2850   unsigned vec_prologue_cost = 0;
2851   unsigned vec_epilogue_cost = 0;
2852   int scalar_single_iter_cost = 0;
2853   int scalar_outside_cost = 0;
2854   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2855   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2856   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2857
2858   /* Cost model disabled.  */
2859   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2860     {
2861       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2862       *ret_min_profitable_niters = 0;
2863       *ret_min_profitable_estimate = 0;
2864       return;
2865     }
2866
2867   /* Requires loop versioning tests to handle misalignment.  */
2868   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2869     {
2870       /*  FIXME: Make cost depend on complexity of individual check.  */
2871       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2872       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2873                             vect_prologue);
2874       dump_printf (MSG_NOTE,
2875                    "cost model: Adding cost of checks for loop "
2876                    "versioning to treat misalignment.\n");
2877     }
2878
2879   /* Requires loop versioning with alias checks.  */
2880   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2881     {
2882       /*  FIXME: Make cost depend on complexity of individual check.  */
2883       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2884       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2885                             vect_prologue);
2886       dump_printf (MSG_NOTE,
2887                    "cost model: Adding cost of checks for loop "
2888                    "versioning aliasing.\n");
2889     }
2890
2891   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2892       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2893     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2894                           vect_prologue);
2895
2896   /* Count statements in scalar loop.  Using this as scalar cost for a single
2897      iteration for now.
2898
2899      TODO: Add outer loop support.
2900
2901      TODO: Consider assigning different costs to different scalar
2902      statements.  */
2903
2904   auto_vec<stmt_info_for_cost> scalar_cost_vec;
2905   scalar_single_iter_cost
2906      = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2907
2908   /* Add additional cost for the peeled instructions in prologue and epilogue
2909      loop.
2910
2911      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2912      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2913
2914      TODO: Build an expression that represents peel_iters for prologue and
2915      epilogue to be used in a run-time test.  */
2916
2917   if (npeel  < 0)
2918     {
2919       peel_iters_prologue = vf/2;
2920       dump_printf (MSG_NOTE, "cost model: "
2921                    "prologue peel iters set to vf/2.\n");
2922
2923       /* If peeling for alignment is unknown, loop bound of main loop becomes
2924          unknown.  */
2925       peel_iters_epilogue = vf/2;
2926       dump_printf (MSG_NOTE, "cost model: "
2927                    "epilogue peel iters set to vf/2 because "
2928                    "peeling for alignment is unknown.\n");
2929
2930       /* If peeled iterations are unknown, count a taken branch and a not taken
2931          branch per peeled loop. Even if scalar loop iterations are known,
2932          vector iterations are not known since peeled prologue iterations are
2933          not known. Hence guards remain the same.  */
2934       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2935                             NULL, 0, vect_prologue);
2936       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2937                             NULL, 0, vect_prologue);
2938       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2939                             NULL, 0, vect_epilogue);
2940       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2941                             NULL, 0, vect_epilogue);
2942       stmt_info_for_cost *si;
2943       int j;
2944       FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2945         {
2946           struct _stmt_vec_info *stmt_info
2947             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2948           (void) add_stmt_cost (target_cost_data,
2949                                 si->count * peel_iters_prologue,
2950                                 si->kind, stmt_info, si->misalign,
2951                                 vect_prologue);
2952           (void) add_stmt_cost (target_cost_data,
2953                                 si->count * peel_iters_epilogue,
2954                                 si->kind, stmt_info, si->misalign,
2955                                 vect_epilogue);
2956         }
2957     }
2958   else
2959     {
2960       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2961       stmt_info_for_cost *si;
2962       int j;
2963       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2964
2965       prologue_cost_vec.create (2);
2966       epilogue_cost_vec.create (2);
2967       peel_iters_prologue = npeel;
2968
2969       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2970                                           &peel_iters_epilogue,
2971                                           &scalar_cost_vec,
2972                                           &prologue_cost_vec,
2973                                           &epilogue_cost_vec);
2974
2975       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2976         {
2977           struct _stmt_vec_info *stmt_info
2978             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2979           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2980                                 si->misalign, vect_prologue);
2981         }
2982
2983       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2984         {
2985           struct _stmt_vec_info *stmt_info
2986             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2987           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2988                                 si->misalign, vect_epilogue);
2989         }
2990
2991       prologue_cost_vec.release ();
2992       epilogue_cost_vec.release ();
2993     }
2994
2995   /* FORNOW: The scalar outside cost is incremented in one of the
2996      following ways:
2997
2998      1. The vectorizer checks for alignment and aliasing and generates
2999      a condition that allows dynamic vectorization.  A cost model
3000      check is ANDED with the versioning condition.  Hence scalar code
3001      path now has the added cost of the versioning check.
3002
3003        if (cost > th & versioning_check)
3004          jmp to vector code
3005
3006      Hence run-time scalar is incremented by not-taken branch cost.
3007
3008      2. The vectorizer then checks if a prologue is required.  If the
3009      cost model check was not done before during versioning, it has to
3010      be done before the prologue check.
3011
3012        if (cost <= th)
3013          prologue = scalar_iters
3014        if (prologue == 0)
3015          jmp to vector code
3016        else
3017          execute prologue
3018        if (prologue == num_iters)
3019          go to exit
3020
3021      Hence the run-time scalar cost is incremented by a taken branch,
3022      plus a not-taken branch, plus a taken branch cost.
3023
3024      3. The vectorizer then checks if an epilogue is required.  If the
3025      cost model check was not done before during prologue check, it
3026      has to be done with the epilogue check.
3027
3028        if (prologue == 0)
3029          jmp to vector code
3030        else
3031          execute prologue
3032        if (prologue == num_iters)
3033          go to exit
3034        vector code:
3035          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3036            jmp to epilogue
3037
3038      Hence the run-time scalar cost should be incremented by 2 taken
3039      branches.
3040
3041      TODO: The back end may reorder the BBS's differently and reverse
3042      conditions/branch directions.  Change the estimates below to
3043      something more reasonable.  */
3044
3045   /* If the number of iterations is known and we do not do versioning, we can
3046      decide whether to vectorize at compile time.  Hence the scalar version
3047      do not carry cost model guard costs.  */
3048   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3049       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3050       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3051     {
3052       /* Cost model check occurs at versioning.  */
3053       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3054           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3055         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3056       else
3057         {
3058           /* Cost model check occurs at prologue generation.  */
3059           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3060             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3061               + vect_get_stmt_cost (cond_branch_not_taken);
3062           /* Cost model check occurs at epilogue generation.  */
3063           else
3064             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3065         }
3066     }
3067
3068   /* Complete the target-specific cost calculations.  */
3069   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3070                &vec_inside_cost, &vec_epilogue_cost);
3071
3072   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3073
3074   if (dump_enabled_p ())
3075     {
3076       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3077       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3078                    vec_inside_cost);
3079       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3080                    vec_prologue_cost);
3081       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3082                    vec_epilogue_cost);
3083       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3084                    scalar_single_iter_cost);
3085       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3086                    scalar_outside_cost);
3087       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3088                    vec_outside_cost);
3089       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3090                    peel_iters_prologue);
3091       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3092                    peel_iters_epilogue);
3093     }
3094
3095   /* Calculate number of iterations required to make the vector version
3096      profitable, relative to the loop bodies only.  The following condition
3097      must hold true:
3098      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3099      where
3100      SIC = scalar iteration cost, VIC = vector iteration cost,
3101      VOC = vector outside cost, VF = vectorization factor,
3102      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3103      SOC = scalar outside cost for run time cost model check.  */
3104
3105   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3106     {
3107       if (vec_outside_cost <= 0)
3108         min_profitable_iters = 1;
3109       else
3110         {
3111           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3112                                   - vec_inside_cost * peel_iters_prologue
3113                                   - vec_inside_cost * peel_iters_epilogue)
3114                                  / ((scalar_single_iter_cost * vf)
3115                                     - vec_inside_cost);
3116
3117           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3118               <= (((int) vec_inside_cost * min_profitable_iters)
3119                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3120             min_profitable_iters++;
3121         }
3122     }
3123   /* vector version will never be profitable.  */
3124   else
3125     {
3126       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3127         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3128                     "did not happen for a simd loop");
3129
3130       if (dump_enabled_p ())
3131         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3132                          "cost model: the vector iteration cost = %d "
3133                          "divided by the scalar iteration cost = %d "
3134                          "is greater or equal to the vectorization factor = %d"
3135                          ".\n",
3136                          vec_inside_cost, scalar_single_iter_cost, vf);
3137       *ret_min_profitable_niters = -1;
3138       *ret_min_profitable_estimate = -1;
3139       return;
3140     }
3141
3142   dump_printf (MSG_NOTE,
3143                "  Calculated minimum iters for profitability: %d\n",
3144                min_profitable_iters);
3145
3146   min_profitable_iters =
3147         min_profitable_iters < vf ? vf : min_profitable_iters;
3148
3149   /* Because the condition we create is:
3150      if (niters <= min_profitable_iters)
3151        then skip the vectorized loop.  */
3152   min_profitable_iters--;
3153
3154   if (dump_enabled_p ())
3155     dump_printf_loc (MSG_NOTE, vect_location,
3156                      "  Runtime profitability threshold = %d\n",
3157                      min_profitable_iters);
3158
3159   *ret_min_profitable_niters = min_profitable_iters;
3160
3161   /* Calculate number of iterations required to make the vector version
3162      profitable, relative to the loop bodies only.
3163
3164      Non-vectorized variant is SIC * niters and it must win over vector
3165      variant on the expected loop trip count.  The following condition must hold true:
3166      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3167
3168   if (vec_outside_cost <= 0)
3169     min_profitable_estimate = 1;
3170   else
3171     {
3172       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3173                                  - vec_inside_cost * peel_iters_prologue
3174                                  - vec_inside_cost * peel_iters_epilogue)
3175                                  / ((scalar_single_iter_cost * vf)
3176                                    - vec_inside_cost);
3177     }
3178   min_profitable_estimate --;
3179   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3180   if (dump_enabled_p ())
3181     dump_printf_loc (MSG_NOTE, vect_location,
3182                      "  Static estimate profitability threshold = %d\n",
3183                       min_profitable_iters);
3184
3185   *ret_min_profitable_estimate = min_profitable_estimate;
3186 }
3187
3188 /* Fill SEL with the vec_perm selector that acts like a vec_shr by OFFSET
3189    whole vector elements (not bits) on a vector whose mode is MODE.  */
3190 static void
3191 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3192                               unsigned char *sel)
3193 {
3194   const unsigned int lanes = GET_MODE_NUNITS (mode);
3195   const unsigned int index_mask = 2 * lanes - 1;
3196   for (unsigned int lane = 0; lane != lanes; ++lane)
3197     sel[lane] = (offset + lane) & index_mask;
3198 }
3199
3200 /* Return true if the target can shift whole vectors of mode MODE.  That is
3201    so when vec_shr_optab has a handler for MODE, or failing that when
3202    vec_perm_const accepts the mask of every shift amount tried below.  */
3203 static bool
3204 have_whole_vector_shift (enum machine_mode mode)
3205 {
3206   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3207     return true;
3208   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3209     return false;
3210   const unsigned int lanes = GET_MODE_NUNITS (mode);
3211   unsigned char *mask = XALLOCAVEC (unsigned char, lanes);
3212   unsigned int amount = lanes / 2;
3213   /* Probe the shift amounts LANES/2, LANES/4, ..., 1 in turn.  */
3214   while (amount != 0)
3215     {
3216       calc_vec_perm_mask_for_shift (mode, amount, mask);
3217       if (!can_vec_perm_p (mode, false, mask))
3218         return false;
3219       amount /= 2;
3220     }
3221   return true;
3222 }
3223
3224 /* Fetch from STMT its reduction operand, i.e. the operand found at
3225    position REDUC_INDEX of the right-hand side of the assignment.  */
3226 static tree
3227 get_reduction_op (gimple stmt, int reduc_index)
3228 {
3229   const enum gimple_rhs_class rhs_class
3230     = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
3231
3232   if (rhs_class == GIMPLE_UNARY_RHS)
3233     return gimple_assign_rhs1 (stmt);
3234   if (rhs_class == GIMPLE_BINARY_RHS)
3235     return reduc_index ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3236   if (rhs_class == GIMPLE_TERNARY_RHS)
3237     return gimple_op (stmt, reduc_index + 1);
3238   if (rhs_class != GIMPLE_SINGLE_RHS)
3239     gcc_unreachable ();
3240
3241   /* A single rhs is asserted to be a tree with ternary_op operands.  */
3242   tree rhs = gimple_assign_rhs1 (stmt);
3243   gcc_assert (TREE_OPERAND_LENGTH (rhs) == ternary_op);
3244   return TREE_OPERAND (rhs, reduc_index);
3245 }
3246
3247 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3248    functions. Design better to avoid maintenance issues.  */
3249
3250 /* Function vect_model_reduction_cost.
3251
3252    Models cost for a reduction operation, including the vector ops
3253    generated within the strip-mine loop, the initial definition before
3254    the loop, and the epilogue code that must be generated.

        STMT_INFO describes the reduction stmt.  Costs are recorded with
        add_stmt_cost in the target cost data of its loop_vec_info, or of
        its bb_vec_info when it has no loop_vec_info.
        REDUC_CODE: when it is not ERROR_MARK the epilogue is costed as one
        vector stmt plus one vec_to_scalar; otherwise it is costed as
        whole-vector shifts or as element extracts.
        NCOPIES is the number of vector stmts charged to the loop body.
        REDUC_INDEX selects, via get_reduction_op, the operand whose scalar
        type determines the vector type.

        Return false if there is no vector type for that scalar type (the
        loop-body cost has already been recorded by then), true otherwise.  */
3255
3256 static bool
3257 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3258                            int ncopies, int reduc_index)
3259 {
3260   int prologue_cost = 0, epilogue_cost = 0;
3261   enum tree_code code;
3262   optab optab;
3263   tree vectype;
3264   gimple stmt, orig_stmt;
3265   tree reduction_op;
3266   machine_mode mode;
3267   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3268   struct loop *loop = NULL;
3269   void *target_cost_data;
3270
       /* Use the cost data of the loop when there is one, else that of the
          basic block.  LOOP stays NULL in the latter case.  */
3271   if (loop_vinfo)
3272     {
3273       loop = LOOP_VINFO_LOOP (loop_vinfo);
3274       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3275     }
3276   else
3277     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3278
3279   /* Cost of reduction op inside loop.  */
3280   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3281                                         stmt_info, 0, vect_body);
3282   stmt = STMT_VINFO_STMT (stmt_info);
3283
3284   reduction_op = get_reduction_op (stmt, reduc_index);
3285
3286   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3287   if (!vectype)
3288     {
3289       if (dump_enabled_p ())
3290         {
3291           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3292                            "unsupported data-type ");
3293           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3294                              TREE_TYPE (reduction_op));
3295           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3296         }
           /* The body cost added above is not undone on this path.  */
3297       return false;
3298    }
3299
3300   mode = TYPE_MODE (vectype);
       /* Prefer the stmt recorded as related to STMT_INFO, if any; it
          supplies the operation code, the nesting check and the element
          size used below.  */
3301   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3302
3303   if (!orig_stmt)
3304     orig_stmt = STMT_VINFO_STMT (stmt_info);
3305
3306   code = gimple_assign_rhs_code (orig_stmt);
3307
3308   /* Add in cost for initial definition.  */
3309   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3310                                   stmt_info, 0, vect_prologue);
3311
3312   /* Determine cost of epilogue code.
3313
3314      We have a reduction operator that will reduce the vector in one statement.
3315      Also requires scalar extract.  */
3316
       /* No epilogue cost is added when nested_in_vect_loop_p holds for
          ORIG_STMT.  */
3317   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3318     {
3319       if (reduc_code != ERROR_MARK)
3320         {
3321           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3322                                           stmt_info, 0, vect_epilogue);
3323           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3324                                           stmt_info, 0, vect_epilogue);
3325         }
3326       else
3327         {
               /* NELEMENTS is the vector size divided by the size of the
                  type of ORIG_STMT's lhs.  */
3328           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3329           tree bitsize =
3330             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3331           int element_bitsize = tree_to_uhwi (bitsize);
3332           int nelements = vec_size_in_bits / element_bitsize;
3333
3334           optab = optab_for_tree_code (code, vectype, optab_default);
3335
3336           /* We have a whole vector shift available.  */
3337           if (VECTOR_MODE_P (mode)
3338               && optab_handler (optab, mode) != CODE_FOR_nothing
3339               && have_whole_vector_shift (mode))
3340             {
3341               /* Final reduction via vector shifts and the reduction operator.
3342                  Also requires scalar extract.  */
                   /* log2 (NELEMENTS) steps, each counted as two vector stmts
                      (one shift and one reduction op).  */
3343               epilogue_cost += add_stmt_cost (target_cost_data,
3344                                               exact_log2 (nelements) * 2,
3345                                               vector_stmt, stmt_info, 0,
3346                                               vect_epilogue);
3347               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3348                                               vec_to_scalar, stmt_info, 0,
3349                                               vect_epilogue);
3350             }
3351           else
3352             /* Use extracts and reduction op for final reduction.  For N
3353                elements, we have N extracts and N-1 reduction ops.  */
3354             epilogue_cost += add_stmt_cost (target_cost_data,
3355                                             nelements + nelements - 1,
3356                                             vector_stmt, stmt_info, 0,
3357                                             vect_epilogue);
3358         }
3359     }
3360
       /* NOTE(review): INSIDE_COST is unsigned but is printed with %d.  */
3361   if (dump_enabled_p ())
3362     dump_printf (MSG_NOTE,
3363                  "vect_model_reduction_cost: inside_cost = %d, "
3364                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3365                  prologue_cost, epilogue_cost);
3366
3367   return true;
3368 }
3369
3370
3371 /* Function vect_model_induction_cost.
3372
3373    Record the cost of the induction described by STMT_INFO in the target
3374    cost data of its loop: NCOPIES vector stmts in the loop body and two
3375    scalar_to_vec stmts in the prologue.  */
3376 static void
3377 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3378 {
3379   void *cost_data
3380     = LOOP_VINFO_TARGET_COST_DATA (STMT_VINFO_LOOP_VINFO (stmt_info));
3381
3382   /* What vec_loop costs inside the loop.  */
3383   unsigned body_cost = add_stmt_cost (cost_data, ncopies, vector_stmt,
3384                                       stmt_info, 0, vect_body);
3385   /* What vec_init and vec_step cost in the prologue.  */
3386   unsigned setup_cost = add_stmt_cost (cost_data, 2, scalar_to_vec,
3387                                        stmt_info, 0, vect_prologue);
3388
3389   if (!dump_enabled_p ())
3390     return;
3391   dump_printf_loc (MSG_NOTE, vect_location,
3392                    "vect_model_induction_cost: inside_cost = %d, "
3393                    "prologue_cost = %d .\n", body_cost, setup_cost);
3394 }
3395
3396
3397 /* Function get_initial_def_for_induction
3398
3399    Input:
3400    IV_PHI - the phi node of the induction.  Its argument on the preheader
3401    edge is the initial value X; its recorded evolution part is the step S.
3402
3403    Output:
3404    Create a vector induction (a new phi in the header of the loop that
3405    contains IV_PHI, plus its update stmt) and return the result of that
3406    phi, view-converted to the vector type of IV_PHI's result when the two
3407    vector types differ.  On loop entry the phi holds VEC_INIT; e.g., when
        the loop being vectorized is the one containing IV_PHI and a vector
        has 4 units, VEC_INIT is [X, X + S, X + 2*S, X + 3*S].  */
3408
3409 static tree
3410 get_initial_def_for_induction (gimple iv_phi)
3411 {
3412   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3413   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3414   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3415   tree vectype;
3416   int nunits;
3417   edge pe = loop_preheader_edge (loop);
3418   struct loop *iv_loop;
3419   basic_block new_bb;
3420   tree new_vec, vec_init, vec_step, t;
3421   tree new_var;
3422   tree new_name;
3423   gimple init_stmt, new_stmt;
3424   gphi *induction_phi;
3425   tree induc_def, vec_def, vec_dest;
3426   tree init_expr, step_expr;
3427   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3428   int i;
3429   int ncopies;
3430   tree expr;
       /* Same stmt_vec_info as STMT_VINFO above.  */
3431   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3432   bool nested_in_vect_loop = false;
3433   gimple_seq stmts = NULL;
3434   imm_use_iterator imm_iter;
3435   use_operand_p use_p;
3436   gimple exit_phi;
3437   edge latch_e;
3438   tree loop_arg;
3439   gimple_stmt_iterator si;
3440   basic_block bb = gimple_bb (iv_phi);
3441   tree stepvectype;
3442   tree resvectype;
3443
3444   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3445   if (nested_in_vect_loop_p (loop, iv_phi))
3446     {
3447       nested_in_vect_loop = true;
3448       iv_loop = loop->inner;
3449     }
3450   else
3451     iv_loop = loop;
3452   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3453
       /* LOOP_ARG is the value IV_PHI receives over the latch edge; it is
          only used further down to look for the exit phi.  */
3454   latch_e = loop_latch_edge (iv_loop);
3455   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3456
3457   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3458   gcc_assert (step_expr != NULL_TREE);
3459
       /* PE was initialized from LOOP above; from here on it is the
          preheader edge of IV_LOOP.  */
3460   pe = loop_preheader_edge (iv_loop);
3461   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3462                                      loop_preheader_edge (iv_loop));
3463
       /* VECTYPE comes from the type of the initial value, RESVECTYPE from
          the type of the phi result.  Only VECTYPE is asserted non-NULL.  */
3464   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3465   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3466   gcc_assert (vectype);
3467   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3468   ncopies = vf / nunits;
3469
       /* NOTE(review): PHI_INFO was already dereferenced above to read the
          evolution part, so this check comes late.  */
3470   gcc_assert (phi_info);
3471   gcc_assert (ncopies >= 1);
3472
3473   /* Convert the step to the desired type.  */
3474   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3475                                                   step_expr),
3476                                     &stmts, true, NULL_TREE);
3477   if (stmts)
3478     {
3479       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3480       gcc_assert (!new_bb);
3481     }
3482
3483   /* Find the first insertion point in the BB.  */
3484   si = gsi_after_labels (bb);
3485
3486   /* Create the vector that holds the initial_value of the induction.  */
3487   if (nested_in_vect_loop)
3488     {
3489       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3490          been created during vectorization of previous stmts.  We obtain it
3491          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3492       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3493       /* If the initial value is not of proper type, convert it.  */
3494       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3495         {
3496           new_stmt
3497             = gimple_build_assign (vect_get_new_vect_var (vectype,
3498                                                           vect_simple_var,
3499                                                           "vec_iv_"),
3500                                    VIEW_CONVERT_EXPR,
3501                                    build1 (VIEW_CONVERT_EXPR, vectype,
3502                                            vec_init));
3503           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3504           gimple_assign_set_lhs (new_stmt, vec_init);
3505           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3506                                                  new_stmt);
3507           gcc_assert (!new_bb);
3508           set_vinfo_for_stmt (new_stmt,
3509                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3510         }
3511     }
3512   else
3513     {
3514       vec<constructor_elt, va_gc> *v;
3515
3516       /* iv_loop is the loop to be vectorized. Create:
3517          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3518       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3519                                        vect_scalar_var, "var_");
3520       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3521                                                      init_expr),
3522                                        &stmts, false, new_var);
3523       if (stmts)
3524         {
3525           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3526           gcc_assert (!new_bb);
3527         }
3528
3529       vec_alloc (v, nunits);
           /* CONSTANT_P stays true only while every element is a gimple
              invariant; it selects between the two builders below.  */
3530       bool constant_p = is_gimple_min_invariant (new_name);
3531       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3532       for (i = 1; i < nunits; i++)
3533         {
3534           /* Create: new_name_i = new_name + step_expr  */
3535           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3536                                   new_name, step_expr);
3537           if (!is_gimple_min_invariant (new_name))
3538             {
3539               init_stmt = gimple_build_assign (new_var, new_name);
3540               new_name = make_ssa_name (new_var, init_stmt);
3541               gimple_assign_set_lhs (init_stmt, new_name);
3542               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3543               gcc_assert (!new_bb);
3544               if (dump_enabled_p ())
3545                 {
3546                   dump_printf_loc (MSG_NOTE, vect_location,
3547                                    "created new init_stmt: ");
3548                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3549                   dump_printf (MSG_NOTE, "\n");
3550                 }
3551               constant_p = false;
3552             }
3553           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3554         }
3555       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3556       if (constant_p)
3557         new_vec = build_vector_from_ctor (vectype, v);
3558       else
3559         new_vec = build_constructor (vectype, v);
3560       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3561     }
3562
3563
3564   /* Create the vector that holds the step of the induction.  */
3565   if (nested_in_vect_loop)
3566     /* iv_loop is nested in the loop to be vectorized. Generate:
3567        vec_step = [S, S, S, S]  */
3568     new_name = step_expr;
3569   else
3570     {
3571       /* iv_loop is the loop to be vectorized. Generate:
3572           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3573       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3574         {
3575           expr = build_int_cst (integer_type_node, vf);
3576           expr = fold_convert (TREE_TYPE (step_expr), expr);
3577         }
3578       else
3579         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3580       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3581                               expr, step_expr);
           /* With an SSA_NAME step the product is not a constant.
              Presumably vect_init_vector emits it before the loop and
              returns a name for it (the assert below requires a constant
              or an SSA_NAME) - confirm against vect_init_vector.  */
3582       if (TREE_CODE (step_expr) == SSA_NAME)
3583         new_name = vect_init_vector (iv_phi, new_name,
3584                                      TREE_TYPE (step_expr), NULL);
3585     }
3586
3587   t = unshare_expr (new_name);
3588   gcc_assert (CONSTANT_CLASS_P (new_name)
3589               || TREE_CODE (new_name) == SSA_NAME);
3590   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3591   gcc_assert (stepvectype);
3592   new_vec = build_vector_from_val (stepvectype, t);
3593   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3594
3595
3596   /* Create the following def-use cycle:
3597      loop prolog:
3598          vec_init = ...
3599          vec_step = ...
3600      loop:
3601          vec_iv = PHI <vec_init, vec_loop>
3602          ...
3603          STMT
3604          ...
3605          vec_loop = vec_iv + vec_step;  */
3606
3607   /* Create the induction-phi that defines the induction-operand.  */
3608   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3609   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3610   set_vinfo_for_stmt (induction_phi,
3611                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3612   induc_def = PHI_RESULT (induction_phi);
3613
3614   /* Create the iv update inside the loop  */
3615   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3616   vec_def = make_ssa_name (vec_dest, new_stmt);
3617   gimple_assign_set_lhs (new_stmt, vec_def);
3618   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3619   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3620                                                    NULL));
3621
3622   /* Set the arguments of the phi node:  */
3623   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3624   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3625                UNKNOWN_LOCATION);
3626
3627
3628   /* In case that vectorization factor (VF) is bigger than the number
3629      of elements that we can fit in a vectype (nunits), we have to generate
3630      more than one vector stmt - i.e - we need to "unroll" the
3631      vector stmt by a factor VF/nunits.  For more details see documentation
3632      in vectorizable_operation.  */
3633
3634   if (ncopies > 1)
3635     {
3636       stmt_vec_info prev_stmt_vinfo;
3637       /* FORNOW. This restriction should be relaxed.  */
3638       gcc_assert (!nested_in_vect_loop);
3639
3640       /* Create the vector that holds the step of the induction.  */
           /* Unlike the VF*S step of the cycle above, each further copy
              advances the previous one by NUNITS*S.  */
3641       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3642         {
3643           expr = build_int_cst (integer_type_node, nunits);
3644           expr = fold_convert (TREE_TYPE (step_expr), expr);
3645         }
3646       else
3647         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3648       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3649                               expr, step_expr);
3650       if (TREE_CODE (step_expr) == SSA_NAME)
3651         new_name = vect_init_vector (iv_phi, new_name,
3652                                      TREE_TYPE (step_expr), NULL);
3653       t = unshare_expr (new_name);
3654       gcc_assert (CONSTANT_CLASS_P (new_name)
3655                   || TREE_CODE (new_name) == SSA_NAME);
3656       new_vec = build_vector_from_val (stepvectype, t);
3657       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3658
           /* Chain the copies, starting from the phi, through
              STMT_VINFO_RELATED_STMT.  */
3659       vec_def = induc_def;
3660       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3661       for (i = 1; i < ncopies; i++)
3662         {
3663           /* vec_i = vec_prev + vec_step  */
3664           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3665                                           vec_def, vec_step);
3666           vec_def = make_ssa_name (vec_dest, new_stmt);
3667           gimple_assign_set_lhs (new_stmt, vec_def);
3668
3669           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
               /* When the result vector type differs, NEW_STMT is replaced
                  by a view-convert of the sum, so it is the convert (not
                  the addition) that gets a stmt_vec_info and is chained.
                  VEC_DEF stays the unconverted sum for the next copy.  */
3670           if (!useless_type_conversion_p (resvectype, vectype))
3671             {
3672               new_stmt
3673                 = gimple_build_assign
3674                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3675                                                 "vec_iv_"),
3676                          VIEW_CONVERT_EXPR,
3677                          build1 (VIEW_CONVERT_EXPR, resvectype,
3678                                  gimple_assign_lhs (new_stmt)));
3679               gimple_assign_set_lhs (new_stmt,
3680                                      make_ssa_name
3681                                        (gimple_assign_lhs (new_stmt), new_stmt));
3682               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3683             }
3684           set_vinfo_for_stmt (new_stmt,
3685                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3686           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3687           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3688         }
3689     }
3690
3691   if (nested_in_vect_loop)
3692     {
3693       /* Find the loop-closed exit-phi of the induction, and record
3694          the final vector of induction results:  */
           /* The first non-debug use of LOOP_ARG outside IV_LOOP is taken
              to be that exit phi.  */
3695       exit_phi = NULL;
3696       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3697         {
3698           gimple use_stmt = USE_STMT (use_p);
3699           if (is_gimple_debug (use_stmt))
3700             continue;
3701
3702           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3703             {
3704               exit_phi = use_stmt;
3705               break;
3706             }
3707         }
3708       if (exit_phi)
3709         {
               /* This declaration shadows the function-level STMT_VINFO.  */
3710           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3711           /* FORNOW. Currently not supporting the case that an inner-loop induction
3712              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3713           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3714                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3715
               /* NEW_STMT is still the iv update stmt here: the NCOPIES > 1
                  block above asserts that it is not entered when nested.  */
3716           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3717           if (dump_enabled_p ())
3718             {
3719               dump_printf_loc (MSG_NOTE, vect_location,
3720                                "vector of inductions after inner-loop:");
3721               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3722               dump_printf (MSG_NOTE, "\n");
3723             }
3724         }
3725     }
3726
3727
3728   if (dump_enabled_p ())
3729     {
3730       dump_printf_loc (MSG_NOTE, vect_location,
3731                        "transform induction: created def-use cycle: ");
3732       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3733       dump_printf (MSG_NOTE, "\n");
3734       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3735                         SSA_NAME_DEF_STMT (vec_def), 0);
3736       dump_printf (MSG_NOTE, "\n");
3737     }
3738
3739   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
       /* If the phi result needs a different vector type, return a
          view-convert of the new phi result instead.  The convert is
          inserted after the labels of BB and inherits the related-stmt
          link of the new phi.  */
3740   if (!useless_type_conversion_p (resvectype, vectype))
3741     {
3742       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3743                                                              vect_simple_var,
3744                                                              "vec_iv_"),
3745                                       VIEW_CONVERT_EXPR,
3746                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3747                                               induc_def));
3748       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3749       gimple_assign_set_lhs (new_stmt, induc_def);
3750       si = gsi_after_labels (bb);
3751       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3752       set_vinfo_for_stmt (new_stmt,
3753                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3754       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3755         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3756     }
3757
3758   return induc_def;
3759 }
3760
3761
3762 /* Function get_initial_def_for_reduction
3763
3764    Input:
3765    STMT - a stmt that performs a reduction operation in the loop.
3766    INIT_VAL - the initial value of the reduction variable
3767
3768    Output:
3769    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3770         of the reduction (used for adjusting the epilog - see below).
             May be NULL.  When not NULL it is set to INIT_VAL (or to the
             vector def of INIT_VAL when STMT is nested in the vectorized
             loop) for the arithmetic/bitwise codes, and to NULL for
             min/max/cond_expr and for the double-reduction case.
3771    Return a vector variable, initialized according to the operation that STMT
3772         performs. This vector will be used as the initial value of the
3773         vector of partial results.
3774
3775    Option1 (adjust in epilog): Initialize the vector as follows:
3776      add/bit or/xor:    [0,0,...,0,0]
             (minus, widen_sum, dot_prod and sad are handled like add)
3777      mult:              [1,1,...,1,1]
          bit and:           [-1,-1,...,-1,-1]
3778      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3779    and when necessary (e.g. add/mult case) let the caller know
3780    that it needs to adjust the result by init_val.
3781
3782    Option2: Initialize the vector as follows:
3783      add/bit or/xor:    [init_val,0,0,...,0]
3784      mult:              [init_val,1,1,...,1]
          bit and:           [init_val,-1,-1,...,-1]
3785      min/max/cond_expr: [init_val,init_val,...,init_val]
3786    and no adjustments are needed.
3787
3788    For example, for the following code:
3789
3790    s = init_val;
3791    for (i=0;i<n;i++)
3792      s = s + a[i];
3793
3794    STMT is 's = s + a[i]', and the reduction variable is 's'.
3795    For a vector of 4 units, we want to return either [init_val,0,0,0],
3796    or [0,0,0,0] and let the caller know that it needs to adjust
3797    the result at the end by 'init_val'.
3798
3799    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3800    initialization vector is simpler (same element in all entries), if
3801    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3802
3803    A cost model should help decide between these two schemes.  */
3804
3805 tree
3806 get_initial_def_for_reduction (gimple stmt, tree init_val,
3807                                tree *adjustment_def)
3808 {
3809   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3810   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3811   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3812   tree scalar_type = TREE_TYPE (init_val);
3813   tree vectype = get_vectype_for_scalar_type (scalar_type);
3814   int nunits;
3815   enum tree_code code = gimple_assign_rhs_code (stmt);
3816   tree def_for_init;
3817   tree init_def;
3818   tree *elts;
3819   int i;
3820   bool nested_in_vect_loop = false;
3821   tree init_value;
       /* Default neutral values (0); changed below for mult and bit and.  */
3822   REAL_VALUE_TYPE real_init_val = dconst0;
3823   int int_init_val = 0;
3824   gimple def_stmt = NULL;
3825
3826   gcc_assert (vectype);
3827   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3828
3829   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3830               || SCALAR_FLOAT_TYPE_P (scalar_type));
3831
3832   if (nested_in_vect_loop_p (loop, stmt))
3833     nested_in_vect_loop = true;
3834   else
3835     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3836
3837   /* In case of double reduction we only create a vector variable to be put
3838      in the reduction phi node.  The actual statement creation is done in
3839      vect_create_epilog_for_reduction.  */
3840   if (adjustment_def && nested_in_vect_loop
3841       && TREE_CODE (init_val) == SSA_NAME
3842       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3843       && gimple_code (def_stmt) == GIMPLE_PHI
3844       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3845       && vinfo_for_stmt (def_stmt)
3846       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3847           == vect_double_reduction_def)
3848     {
3849       *adjustment_def = NULL;
3850       return vect_create_destination_var (init_val, vectype);
3851     }
3852
       /* INIT_VALUE is only used by the min/max/cond_expr case without
          ADJUSTMENT_DEF.  NOTE(review): this assumes that a TREE_CONSTANT
          INIT_VAL is a REAL_CST for float types and an INTEGER_CST
          otherwise - confirm.  */
3853   if (TREE_CONSTANT (init_val))
3854     {
3855       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3856         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3857       else
3858         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3859     }
3860   else
3861     init_value = init_val;
3862
3863   switch (code)
3864     {
3865       case WIDEN_SUM_EXPR:
3866       case DOT_PROD_EXPR:
3867       case SAD_EXPR:
3868       case PLUS_EXPR:
3869       case MINUS_EXPR:
3870       case BIT_IOR_EXPR:
3871       case BIT_XOR_EXPR:
3872       case MULT_EXPR:
3873       case BIT_AND_EXPR:
3874         /* ADJUSTMENT_DEF is NULL when called from
3875            vect_create_epilog_for_reduction to vectorize double reduction.  */
3876         if (adjustment_def)
3877           {
3878             if (nested_in_vect_loop)
3879               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3880                                                               NULL);
3881             else
3882               *adjustment_def = init_val;
3883           }
3884
             /* Pick the neutral element: 1 for mult, -1 (all bits set) for
                bit and, and the default 0 for every other code here.  */
3885         if (code == MULT_EXPR)
3886           {
3887             real_init_val = dconst1;
3888             int_init_val = 1;
3889           }
3890
3891         if (code == BIT_AND_EXPR)
3892           int_init_val = -1;
3893
3894         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3895           def_for_init = build_real (scalar_type, real_init_val);
3896         else
3897           def_for_init = build_int_cst (scalar_type, int_init_val);
3898
3899         /* Fill every element but the first with the neutral value.  */
3900         elts = XALLOCAVEC (tree, nunits);
3901         for (i = nunits - 2; i >= 0; --i)
3902           elts[i + 1] = def_for_init;
3903
3904         /* Option1: the first element is the neutral value as well.  */
3905         if (adjustment_def)
3906           {
3907             elts[0] = def_for_init;
3908             init_def = build_vector (vectype, elts);
3909             break;
3910           }
3911
3912         /* Option2: the first element is INIT_VAL.  */
             /* A non-constant INIT_VAL is put into a constructor instead of
                a build_vector constant.  */
3913         elts[0] = init_val;
3914         if (TREE_CONSTANT (init_val))
3915           init_def = build_vector (vectype, elts);
3916         else
3917           {
3918             vec<constructor_elt, va_gc> *v;
3919             vec_alloc (v, nunits);
3920             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3921             for (i = 1; i < nunits; ++i)
3922               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3923             init_def = build_constructor (vectype, v);
3924           }
3925
3926         break;
3927
3928       case MIN_EXPR:
3929       case MAX_EXPR:
3930       case COND_EXPR:
             /* No epilog adjustment for these codes: with ADJUSTMENT_DEF the
                vector def of INIT_VAL is used, otherwise INIT_VALUE is
                replicated into every element.  */
3931         if (adjustment_def)
3932           {
3933             *adjustment_def = NULL_TREE;
3934             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3935             break;
3936           }
3937
3938         init_def = build_vector_from_val (vectype, init_value);
3939         break;
3940
3941       default:
3942         gcc_unreachable ();
3943     }
3944
3945   return init_def;
3946 }
3947
3948 /* Function vect_create_epilog_for_reduction
3949
3950    Create code at the loop-epilog to finalize the result of a reduction
3951    computation.
3952
3953    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of the
3954      vector reduction statements.
3955    STMT is the scalar reduction stmt that is being vectorized.
3956    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3957      number of elements that we can fit in a vectype (nunits).  In this case
3958      we have to generate more than one vector stmt, i.e., we need to "unroll"
3959      the vector stmt by a factor VF/nunits.  For more details see documentation
3960      in vectorizable_operation.
3961    REDUC_CODE is the tree-code for the epilog reduction.
3962    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3963      computation.
3964    REDUC_INDEX is the index of the operand in the right hand side of the
3965      statement that is defined by REDUCTION_PHI.
3966    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3967    SLP_NODE is an SLP node containing a group of reduction statements. The
3968      first one in this group is STMT.
3969
3970    This function:
3971    1. Creates the reduction def-use cycles: sets the arguments for
3972       REDUCTION_PHIS:
3973       The loop-entry argument is the vectorized initial-value of the reduction.
3974       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3975       sums.
3976    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3977       by applying the operation specified by REDUC_CODE if available, or by
3978       other means (whole-vector shifts or a scalar loop).
3979       The function also creates a new phi node at the loop exit to preserve
3980       loop-closed form, as illustrated below.
3981
3982      The flow at the entry to this function:
3983
3984         loop:
3985           vec_def = phi <null, null>            # REDUCTION_PHI
3986           VECT_DEF = vector_stmt                # vectorized form of STMT
3987           s_loop = scalar_stmt                  # (scalar) STMT
3988         loop_exit:
3989           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3990           use <s_out0>
3991           use <s_out0>
3992
3993      The above is transformed by this function into:
3994
3995         loop:
3996           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3997           VECT_DEF = vector_stmt                # vectorized form of STMT
3998           s_loop = scalar_stmt                  # (scalar) STMT
3999         loop_exit:
4000           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4001           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4002           v_out2 = reduce <v_out1>
4003           s_out3 = extract_field <v_out2, 0>
4004           s_out4 = adjust_result <s_out3>
4005           use <s_out4>
4006           use <s_out4>
4007 */
4008
4009 static void
4010 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4011                                   int ncopies, enum tree_code reduc_code,
4012                                   vec<gimple> reduction_phis,
4013                                   int reduc_index, bool double_reduc,
4014                                   slp_tree slp_node)
4015 {
4016   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4017   stmt_vec_info prev_phi_info;
4018   tree vectype;
4019   machine_mode mode;
4020   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4021   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4022   basic_block exit_bb;
4023   tree scalar_dest;
4024   tree scalar_type;
4025   gimple new_phi = NULL, phi;
4026   gimple_stmt_iterator exit_gsi;
4027   tree vec_dest;
4028   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4029   gimple epilog_stmt = NULL;
4030   enum tree_code code = gimple_assign_rhs_code (stmt);
4031   gimple exit_phi;
4032   tree bitsize;
4033   tree adjustment_def = NULL;
4034   tree vec_initial_def = NULL;
4035   tree reduction_op, expr, def;
4036   tree orig_name, scalar_result;
4037   imm_use_iterator imm_iter, phi_imm_iter;
4038   use_operand_p use_p, phi_use_p;
4039   gimple use_stmt, orig_stmt, reduction_phi = NULL;
4040   bool nested_in_vect_loop = false;
4041   auto_vec<gimple> new_phis;
4042   auto_vec<gimple> inner_phis;
4043   enum vect_def_type dt = vect_unknown_def_type;
4044   int j, i;
4045   auto_vec<tree> scalar_results;
4046   unsigned int group_size = 1, k, ratio;
4047   auto_vec<tree> vec_initial_defs;
4048   auto_vec<gimple> phis;
4049   bool slp_reduc = false;
4050   tree new_phi_result;
4051   gimple inner_phi = NULL;
4052
4053   if (slp_node)
4054     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4055
4056   if (nested_in_vect_loop_p (loop, stmt))
4057     {
4058       outer_loop = loop;
4059       loop = loop->inner;
4060       nested_in_vect_loop = true;
4061       gcc_assert (!slp_node);
4062     }
4063
4064   reduction_op = get_reduction_op (stmt, reduc_index);
4065
4066   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4067   gcc_assert (vectype);
4068   mode = TYPE_MODE (vectype);
4069
4070   /* 1. Create the reduction def-use cycle:
4071      Set the arguments of REDUCTION_PHIS, i.e., transform
4072
4073         loop:
4074           vec_def = phi <null, null>            # REDUCTION_PHI
4075           VECT_DEF = vector_stmt                # vectorized form of STMT
4076           ...
4077
4078      into:
4079
4080         loop:
4081           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4082           VECT_DEF = vector_stmt                # vectorized form of STMT
4083           ...
4084
4085      (in case of SLP, do it for all the phis). */
4086
4087   /* Get the loop-entry arguments.  */
4088   if (slp_node)
4089     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4090                        NULL, slp_node, reduc_index);
4091   else
4092     {
4093       vec_initial_defs.create (1);
4094      /* For the case of reduction, vect_get_vec_def_for_operand returns
4095         the scalar def before the loop, that defines the initial value
4096         of the reduction variable.  */
4097       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4098                                                       &adjustment_def);
4099       vec_initial_defs.quick_push (vec_initial_def);
4100     }
4101
4102   /* Set phi nodes arguments.  */
4103   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4104     {
4105       tree vec_init_def, def;
4106       gimple_seq stmts;
4107       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4108                                            true, NULL_TREE);
4109       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4110       def = vect_defs[i];
4111       for (j = 0; j < ncopies; j++)
4112         {
4113           /* Set the loop-entry arg of the reduction-phi.  */
4114           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4115                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4116
4117           /* Set the loop-latch arg for the reduction-phi.  */
4118           if (j > 0)
4119             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4120
4121           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4122                        UNKNOWN_LOCATION);
4123
4124           if (dump_enabled_p ())
4125             {
4126               dump_printf_loc (MSG_NOTE, vect_location,
4127                                "transform reduction: created def-use cycle: ");
4128               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4129               dump_printf (MSG_NOTE, "\n");
4130               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4131               dump_printf (MSG_NOTE, "\n");
4132             }
4133
4134           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4135         }
4136     }
4137
4138   /* 2. Create epilog code.
4139         The reduction epilog code operates across the elements of the vector
4140         of partial results computed by the vectorized loop.
4141         The reduction epilog code consists of:
4142
4143         step 1: compute the scalar result in a vector (v_out2)
4144         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4145         step 3: adjust the scalar result (s_out3) if needed.
4146
4147         Step 1 can be accomplished using one of the following three schemes:
4148           (scheme 1) using reduc_code, if available.
4149           (scheme 2) using whole-vector shifts, if available.
4150           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4151                      combined.
4152
4153           The overall epilog code looks like this:
4154
4155           s_out0 = phi <s_loop>         # original EXIT_PHI
4156           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4157           v_out2 = reduce <v_out1>              # step 1
4158           s_out3 = extract_field <v_out2, 0>    # step 2
4159           s_out4 = adjust_result <s_out3>       # step 3
4160
4161           (step 3 is optional, and steps 1 and 2 may be combined).
4162           Lastly, the uses of s_out0 are replaced by s_out4.  */
4163
4164
4165   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4166          v_out1 = phi <VECT_DEF>
4167          Store them in NEW_PHIS.  */
4168
4169   exit_bb = single_exit (loop)->dest;
4170   prev_phi_info = NULL;
4171   new_phis.create (vect_defs.length ());
4172   FOR_EACH_VEC_ELT (vect_defs, i, def)
4173     {
4174       for (j = 0; j < ncopies; j++)
4175         {
4176           tree new_def = copy_ssa_name (def);
4177           phi = create_phi_node (new_def, exit_bb);
4178           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4179           if (j == 0)
4180             new_phis.quick_push (phi);
4181           else
4182             {
4183               def = vect_get_vec_def_for_stmt_copy (dt, def);
4184               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4185             }
4186
4187           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4188           prev_phi_info = vinfo_for_stmt (phi);
4189         }
4190     }
4191
4192   /* The epilogue is created for the outer-loop, i.e., for the loop being
4193      vectorized.  Create exit phis for the outer loop.  */
4194   if (double_reduc)
4195     {
4196       loop = outer_loop;
4197       exit_bb = single_exit (loop)->dest;
4198       inner_phis.create (vect_defs.length ());
4199       FOR_EACH_VEC_ELT (new_phis, i, phi)
4200         {
4201           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4202           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4203           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4204                            PHI_RESULT (phi));
4205           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4206                                                             loop_vinfo, NULL));
4207           inner_phis.quick_push (phi);
4208           new_phis[i] = outer_phi;
4209           prev_phi_info = vinfo_for_stmt (outer_phi);
4210           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4211             {
4212               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4213               new_result = copy_ssa_name (PHI_RESULT (phi));
4214               outer_phi = create_phi_node (new_result, exit_bb);
4215               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4216                                PHI_RESULT (phi));
4217               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4218                                                         loop_vinfo, NULL));
4219               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4220               prev_phi_info = vinfo_for_stmt (outer_phi);
4221             }
4222         }
4223     }
4224
4225   exit_gsi = gsi_after_labels (exit_bb);
4226
4227   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4228          (i.e. when reduc_code is not available) and in the final adjustment
4229          code (if needed).  Also get the original scalar reduction variable as
4230          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4231          represents a reduction pattern), the tree-code and scalar-def are
4232          taken from the original stmt that the pattern-stmt (STMT) replaces.
4233          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4234          are taken from STMT.  */
4235
4236   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4237   if (!orig_stmt)
4238     {
4239       /* Regular reduction  */
4240       orig_stmt = stmt;
4241     }
4242   else
4243     {
4244       /* Reduction pattern  */
4245       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4246       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4247       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4248     }
4249
4250   code = gimple_assign_rhs_code (orig_stmt);
4251   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4252      partial results are added and not subtracted.  */
4253   if (code == MINUS_EXPR)
4254     code = PLUS_EXPR;
4255
4256   scalar_dest = gimple_assign_lhs (orig_stmt);
4257   scalar_type = TREE_TYPE (scalar_dest);
4258   scalar_results.create (group_size);
4259   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4260   bitsize = TYPE_SIZE (scalar_type);
4261
4262   /* In case this is a reduction in an inner-loop while vectorizing an outer
4263      loop - we don't need to extract a single scalar result at the end of the
4264      inner-loop (unless it is double reduction, i.e., the use of reduction is
4265      outside the outer-loop).  The final vector of partial results will be used
4266      in the vectorized outer-loop, or reduced to a scalar result at the end of
4267      the outer-loop.  */
4268   if (nested_in_vect_loop && !double_reduc)
4269     goto vect_finalize_reduction;
4270
4271   /* SLP reduction without reduction chain, e.g.,
4272      # a1 = phi <a2, a0>
4273      # b1 = phi <b2, b0>
4274      a2 = operation (a1)
4275      b2 = operation (b1)  */
4276   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4277
4278   /* In case of reduction chain, e.g.,
4279      # a1 = phi <a3, a0>
4280      a2 = operation (a1)
4281      a3 = operation (a2),
4282
4283      we may end up with more than one vector result.  Here we reduce them to
4284      one vector.  */
4285   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4286     {
4287       tree first_vect = PHI_RESULT (new_phis[0]);
4288       tree tmp;
4289       gassign *new_vec_stmt = NULL;
4290
4291       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4292       for (k = 1; k < new_phis.length (); k++)
4293         {
4294           gimple next_phi = new_phis[k];
4295           tree second_vect = PHI_RESULT (next_phi);
4296
4297           tmp = build2 (code, vectype,  first_vect, second_vect);
4298           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4299           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4300           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4301           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4302         }
4303
4304       new_phi_result = first_vect;
4305       if (new_vec_stmt)
4306         {
4307           new_phis.truncate (0);
4308           new_phis.safe_push (new_vec_stmt);
4309         }
4310     }
4311   else
4312     new_phi_result = PHI_RESULT (new_phis[0]);
4313
4314   /* 2.3 Create the reduction code, using one of the three schemes described
4315          above. In SLP we simply need to extract all the elements from the
4316          vector (without reducing them), so we use scalar shifts.  */
4317   if (reduc_code != ERROR_MARK && !slp_reduc)
4318     {
4319       tree tmp;
4320       tree vec_elem_type;
4321
4322       /*** Case 1:  Create:
4323            v_out2 = reduc_expr <v_out1>  */
4324
4325       if (dump_enabled_p ())
4326         dump_printf_loc (MSG_NOTE, vect_location,
4327                          "Reduce using direct vector reduction.\n");
4328
4329       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4330       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4331         {
4332           tree tmp_dest =
4333               vect_create_destination_var (scalar_dest, vec_elem_type);
4334           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4335           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4336           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4337           gimple_assign_set_lhs (epilog_stmt, new_temp);
4338           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4339
4340           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4341         }
4342       else
4343         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4344       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4345       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4346       gimple_assign_set_lhs (epilog_stmt, new_temp);
4347       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4348       scalar_results.safe_push (new_temp);
4349     }
4350   else
4351     {
4352       bool reduce_with_shift = have_whole_vector_shift (mode);
4353       int element_bitsize = tree_to_uhwi (bitsize);
4354       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4355       tree vec_temp;
4356
4357       /* Regardless of whether we have a whole vector shift, if we're
4358          emulating the operation via tree-vect-generic, we don't want
4359          to use it.  Only the first round of the reduction is likely
4360          to still be profitable via emulation.  */
4361       /* ??? It might be better to emit a reduction tree code here, so that
4362          tree-vect-generic can expand the first round via bit tricks.  */
4363       if (!VECTOR_MODE_P (mode))
4364         reduce_with_shift = false;
4365       else
4366         {
4367           optab optab = optab_for_tree_code (code, vectype, optab_default);
4368           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4369             reduce_with_shift = false;
4370         }
4371
4372       if (reduce_with_shift && !slp_reduc)
4373         {
4374           int nelements = vec_size_in_bits / element_bitsize;
4375           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4376
4377           int elt_offset;
4378
4379           tree zero_vec = build_zero_cst (vectype);
4380           /*** Case 2: Create:
4381              for (offset = nelements/2; offset >= 1; offset/=2)
4382                 {
4383                   Create:  va' = vec_shift <va, offset>
4384                   Create:  va = vop <va, va'>
4385                 }  */
4386
4387           tree rhs;
4388
4389           if (dump_enabled_p ())
4390             dump_printf_loc (MSG_NOTE, vect_location,
4391                              "Reduce using vector shifts\n");
4392
4393           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4394           new_temp = new_phi_result;
4395           for (elt_offset = nelements / 2;
4396                elt_offset >= 1;
4397                elt_offset /= 2)
4398             {
4399               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4400               tree mask = vect_gen_perm_mask_any (vectype, sel);
4401               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4402                                                  new_temp, zero_vec, mask);
4403               new_name = make_ssa_name (vec_dest, epilog_stmt);
4404               gimple_assign_set_lhs (epilog_stmt, new_name);
4405               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4406
4407               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4408                                                  new_temp);
4409               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4410               gimple_assign_set_lhs (epilog_stmt, new_temp);
4411               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4412             }
4413
4414           /* 2.4  Extract the final scalar result.  Create:
4415              s_out3 = extract_field <v_out2, bitpos>  */
4416
4417           if (dump_enabled_p ())
4418             dump_printf_loc (MSG_NOTE, vect_location,
4419                              "extract scalar result\n");
4420
4421           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4422                         bitsize, bitsize_zero_node);
4423           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4424           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4425           gimple_assign_set_lhs (epilog_stmt, new_temp);
4426           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4427           scalar_results.safe_push (new_temp);
4428         }
4429       else
4430         {
4431           /*** Case 3: Create:
4432              s = extract_field <v_out2, 0>
4433              for (offset = element_size;
4434                   offset < vector_size;
4435                   offset += element_size;)
4436                {
4437                  Create:  s' = extract_field <v_out2, offset>
4438                  Create:  s = op <s, s'>  // For non SLP cases
4439                }  */
4440
4441           if (dump_enabled_p ())
4442             dump_printf_loc (MSG_NOTE, vect_location,
4443                              "Reduce using scalar code.\n");
4444
4445           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4446           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4447             {
4448               int bit_offset;
4449               if (gimple_code (new_phi) == GIMPLE_PHI)
4450                 vec_temp = PHI_RESULT (new_phi);
4451               else
4452                 vec_temp = gimple_assign_lhs (new_phi);
4453               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4454                             bitsize_zero_node);
4455               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4456               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4457               gimple_assign_set_lhs (epilog_stmt, new_temp);
4458               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4459
4460               /* In SLP we don't need to apply reduction operation, so we just
4461                  collect s' values in SCALAR_RESULTS.  */
4462               if (slp_reduc)
4463                 scalar_results.safe_push (new_temp);
4464
4465               for (bit_offset = element_bitsize;
4466                    bit_offset < vec_size_in_bits;
4467                    bit_offset += element_bitsize)
4468                 {
4469                   tree bitpos = bitsize_int (bit_offset);
4470                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4471                                      bitsize, bitpos);
4472
4473                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4474                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4475                   gimple_assign_set_lhs (epilog_stmt, new_name);
4476                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4477
4478                   if (slp_reduc)
4479                     {
4480                       /* In SLP we don't need to apply reduction operation, so
4481                          we just collect s' values in SCALAR_RESULTS.  */
4482                       new_temp = new_name;
4483                       scalar_results.safe_push (new_name);
4484                     }
4485                   else
4486                     {
4487                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4488                                                          new_name, new_temp);
4489                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4490                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4491                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4492                     }
4493                 }
4494             }
4495
4496           /* The only case where we need to reduce scalar results in SLP, is
4497              unrolling.  If the size of SCALAR_RESULTS is greater than
4498              GROUP_SIZE, we reduce them combining elements modulo
4499              GROUP_SIZE.  */
4500           if (slp_reduc)
4501             {
4502               tree res, first_res, new_res;
4503               gimple new_stmt;
4504
4505               /* Reduce multiple scalar results in case of SLP unrolling.  */
4506               for (j = group_size; scalar_results.iterate (j, &res);
4507                    j++)
4508                 {
4509                   first_res = scalar_results[j % group_size];
4510                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4511                                                   first_res, res);
4512                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4513                   gimple_assign_set_lhs (new_stmt, new_res);
4514                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4515                   scalar_results[j % group_size] = new_res;
4516                 }
4517             }
4518           else
4519             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4520             scalar_results.safe_push (new_temp);
4521         }
4522     }
4523
4524 vect_finalize_reduction:
4525
4526   if (double_reduc)
4527     loop = loop->inner;
4528
4529   /* 2.5 Adjust the final result by the initial value of the reduction
4530          variable. (When such adjustment is not needed, then
4531          'adjustment_def' is zero).  For example, if code is PLUS we create:
4532          new_temp = loop_exit_def + adjustment_def  */
4533
4534   if (adjustment_def)
4535     {
4536       gcc_assert (!slp_reduc);
4537       if (nested_in_vect_loop)
4538         {
4539           new_phi = new_phis[0];
4540           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4541           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4542           new_dest = vect_create_destination_var (scalar_dest, vectype);
4543         }
4544       else
4545         {
4546           new_temp = scalar_results[0];
4547           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4548           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4549           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4550         }
4551
4552       epilog_stmt = gimple_build_assign (new_dest, expr);
4553       new_temp = make_ssa_name (new_dest, epilog_stmt);
4554       gimple_assign_set_lhs (epilog_stmt, new_temp);
4555       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4556       if (nested_in_vect_loop)
4557         {
4558           set_vinfo_for_stmt (epilog_stmt,
4559                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4560                                                  NULL));
4561           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4562                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4563
4564           if (!double_reduc)
4565             scalar_results.quick_push (new_temp);
4566           else
4567             scalar_results[0] = new_temp;
4568         }
4569       else
4570         scalar_results[0] = new_temp;
4571
4572       new_phis[0] = epilog_stmt;
4573     }
4574
4575   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4576           phis with new adjusted scalar results, i.e., replace use <s_out0>
4577           with use <s_out4>.
4578
4579      Transform:
4580         loop_exit:
4581           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4582           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4583           v_out2 = reduce <v_out1>
4584           s_out3 = extract_field <v_out2, 0>
4585           s_out4 = adjust_result <s_out3>
4586           use <s_out0>
4587           use <s_out0>
4588
4589      into:
4590
4591         loop_exit:
4592           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4593           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4594           v_out2 = reduce <v_out1>
4595           s_out3 = extract_field <v_out2, 0>
4596           s_out4 = adjust_result <s_out3>
4597           use <s_out4>
4598           use <s_out4> */
4599
4600
4601   /* In SLP reduction chain we reduce vector results into one vector if
4602      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4603      the last stmt in the reduction chain, since we are looking for the loop
4604      exit phi node.  */
4605   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4606     {
4607       gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4608       /* Handle reduction patterns.  */
4609       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4610         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4611
4612       scalar_dest = gimple_assign_lhs (dest_stmt);
4613       group_size = 1;
4614     }
4615
4616   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4617      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4618      need to match SCALAR_RESULTS with corresponding statements.  The first
4619      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4620      the first vector stmt, etc.
4621      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4622   if (group_size > new_phis.length ())
4623     {
4624       ratio = group_size / new_phis.length ();
4625       gcc_assert (!(group_size % new_phis.length ()));
4626     }
4627   else
4628     ratio = 1;
4629
4630   for (k = 0; k < group_size; k++)
4631     {
4632       if (k % ratio == 0)
4633         {
4634           epilog_stmt = new_phis[k / ratio];
4635           reduction_phi = reduction_phis[k / ratio];
4636           if (double_reduc)
4637             inner_phi = inner_phis[k / ratio];
4638         }
4639
4640       if (slp_reduc)
4641         {
4642           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4643
4644           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4645           /* SLP statements can't participate in patterns.  */
4646           gcc_assert (!orig_stmt);
4647           scalar_dest = gimple_assign_lhs (current_stmt);
4648         }
4649
4650       phis.create (3);
4651       /* Find the loop-closed-use at the loop exit of the original scalar
4652          result.  (The reduction result is expected to have two immediate uses -
4653          one at the latch block, and one at the loop exit).  */
4654       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4655         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4656             && !is_gimple_debug (USE_STMT (use_p)))
4657           phis.safe_push (USE_STMT (use_p));
4658
4659       /* While we expect to have found an exit_phi because of loop-closed-ssa
4660          form we can end up without one if the scalar cycle is dead.  */
4661
4662       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4663         {
4664           if (outer_loop)
4665             {
4666               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4667               gphi *vect_phi;
4668
4669               /* FORNOW. Currently not supporting the case that an inner-loop
4670                  reduction is not used in the outer-loop (but only outside the
4671                  outer-loop), unless it is double reduction.  */
4672               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4673                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4674                           || double_reduc);
4675
4676               if (double_reduc)
4677                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4678               else
4679                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4680               if (!double_reduc
4681                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4682                       != vect_double_reduction_def)
4683                 continue;
4684
4685               /* Handle double reduction:
4686
4687                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4688                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4689                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4690                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4691
4692                  At that point the regular reduction (stmt2 and stmt3) is
4693                  already vectorized, as well as the exit phi node, stmt4.
4694                  Here we vectorize the phi node of double reduction, stmt1, and
4695                  update all relevant statements.  */
4696
4697               /* Go through all the uses of s2 to find double reduction phi
4698                  node, i.e., stmt1 above.  */
4699               orig_name = PHI_RESULT (exit_phi);
4700               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4701                 {
4702                   stmt_vec_info use_stmt_vinfo;
4703                   stmt_vec_info new_phi_vinfo;
4704                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4705                   basic_block bb = gimple_bb (use_stmt);
4706                   gimple use;
4707
4708                   /* Check that USE_STMT is really double reduction phi
4709                      node.  */
4710                   if (gimple_code (use_stmt) != GIMPLE_PHI
4711                       || gimple_phi_num_args (use_stmt) != 2
4712                       || bb->loop_father != outer_loop)
4713                     continue;
4714                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4715                   if (!use_stmt_vinfo
4716                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4717                           != vect_double_reduction_def)
4718                     continue;
4719
4720                   /* Create vector phi node for double reduction:
4721                      vs1 = phi <vs0, vs2>
4722                      vs1 was created previously in this function by a call to
4723                        vect_get_vec_def_for_operand and is stored in
4724                        vec_initial_def;
4725                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4726                      vs0 is created here.  */
4727
4728                   /* Create vector phi node.  */
4729                   vect_phi = create_phi_node (vec_initial_def, bb);
4730                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4731                                     loop_vec_info_for_loop (outer_loop), NULL);
4732                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4733
4734                   /* Create vs0 - initial def of the double reduction phi.  */
4735                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4736                                              loop_preheader_edge (outer_loop));
4737                   init_def = get_initial_def_for_reduction (stmt,
4738                                                           preheader_arg, NULL);
4739                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4740                                                     vectype, NULL);
4741
4742                   /* Update phi node arguments with vs0 and vs2.  */
4743                   add_phi_arg (vect_phi, vect_phi_init,
4744                                loop_preheader_edge (outer_loop),
4745                                UNKNOWN_LOCATION);
4746                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4747                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4748                   if (dump_enabled_p ())
4749                     {
4750                       dump_printf_loc (MSG_NOTE, vect_location,
4751                                        "created double reduction phi node: ");
4752                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4753                       dump_printf (MSG_NOTE, "\n");
4754                     }
4755
4756                   vect_phi_res = PHI_RESULT (vect_phi);
4757
4758                   /* Replace the use, i.e., set the correct vs1 in the regular
4759                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4760                      loop is redundant.  */
4761                   use = reduction_phi;
4762                   for (j = 0; j < ncopies; j++)
4763                     {
4764                       edge pr_edge = loop_preheader_edge (loop);
4765                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4766                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4767                     }
4768                 }
4769             }
4770         }
4771
4772       phis.release ();
4773       if (nested_in_vect_loop)
4774         {
4775           if (double_reduc)
4776             loop = outer_loop;
4777           else
4778             continue;
4779         }
4780
4781       phis.create (3);
4782       /* Find the loop-closed-use at the loop exit of the original scalar
4783          result.  (The reduction result is expected to have two immediate uses,
4784          one at the latch block, and one at the loop exit).  For double
4785          reductions we are looking for exit phis of the outer loop.  */
4786       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4787         {
4788           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4789             {
4790               if (!is_gimple_debug (USE_STMT (use_p)))
4791                 phis.safe_push (USE_STMT (use_p));
4792             }
4793           else
4794             {
4795               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4796                 {
4797                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4798
4799                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4800                     {
4801                       if (!flow_bb_inside_loop_p (loop,
4802                                              gimple_bb (USE_STMT (phi_use_p)))
4803                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4804                         phis.safe_push (USE_STMT (phi_use_p));
4805                     }
4806                 }
4807             }
4808         }
4809
4810       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4811         {
4812           /* Replace the uses:  */
4813           orig_name = PHI_RESULT (exit_phi);
4814           scalar_result = scalar_results[k];
4815           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4816             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4817               SET_USE (use_p, scalar_result);
4818         }
4819
4820       phis.release ();
4821     }
4822 }
4823
4824
4825 /* Function vectorizable_reduction.
4826
4827    Check if STMT performs a reduction operation that can be vectorized.
4828    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4829    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4830    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4831
4832    This function also handles reduction idioms (patterns) that have been
4833    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4834    of this form:
4835      X = pattern_expr (arg0, arg1, ..., X)
4836    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4837    sequence that had been detected and replaced by the pattern-stmt (STMT).
4838
4839    In some cases of reduction patterns, the type of the reduction variable X is
4840    different than the type of the other arguments of STMT.
4841    In such cases, the vectype that is used when transforming STMT into a vector
4842    stmt is different than the vectype that is used to determine the
4843    vectorization factor, because it consists of a different number of elements
4844    than the actual number of elements that are being operated upon in parallel.
4845
4846    For example, consider an accumulation of shorts into an int accumulator.
4847    On some targets it's possible to vectorize this pattern operating on 8
4848    shorts at a time (hence, the vectype for purposes of determining the
4849    vectorization factor should be V8HI); on the other hand, the vectype that
4850    is used to create the vector form is actually V4SI (the type of the result).
4851
4852    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4853    indicates what is the actual level of parallelism (V8HI in the example), so
4854    that the right vectorization factor would be derived.  This vectype
4855    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4856    be used to create the vectorized stmt.  The right vectype for the vectorized
4857    stmt is obtained from the type of the result X:
4858         get_vectype_for_scalar_type (TREE_TYPE (X))
4859
4860    This means that, contrary to "regular" reductions (or "regular" stmts in
4861    general), the following equation:
4862       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4863    does *NOT* necessarily hold for reduction patterns.  */
4864
4865 bool
4866 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4867                         gimple *vec_stmt, slp_tree slp_node)
4868 {
4869   tree vec_dest;
4870   tree scalar_dest;
4871   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4872   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4873   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4874   tree vectype_in = NULL_TREE;
4875   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4876   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4877   enum tree_code code, orig_code, epilog_reduc_code;
4878   machine_mode vec_mode;
4879   int op_type;
4880   optab optab, reduc_optab;
4881   tree new_temp = NULL_TREE;
4882   tree def;
4883   gimple def_stmt;
4884   enum vect_def_type dt;
4885   gphi *new_phi = NULL;
4886   tree scalar_type;
4887   bool is_simple_use;
4888   gimple orig_stmt;
4889   stmt_vec_info orig_stmt_info;
4890   tree expr = NULL_TREE;
4891   int i;
4892   int ncopies;
4893   int epilog_copies;
4894   stmt_vec_info prev_stmt_info, prev_phi_info;
4895   bool single_defuse_cycle = false;
4896   tree reduc_def = NULL_TREE;
4897   gimple new_stmt = NULL;
4898   int j;
4899   tree ops[3];
4900   bool nested_cycle = false, found_nested_cycle_def = false;
4901   gimple reduc_def_stmt = NULL;
4902   bool double_reduc = false, dummy;
4903   basic_block def_bb;
4904   struct loop * def_stmt_loop, *outer_loop = NULL;
4905   tree def_arg;
4906   gimple def_arg_stmt;
4907   auto_vec<tree> vec_oprnds0;
4908   auto_vec<tree> vec_oprnds1;
4909   auto_vec<tree> vect_defs;
4910   auto_vec<gimple> phis;
4911   int vec_num;
4912   tree def0, def1, tem, op0, op1 = NULL_TREE;
4913   bool first_p = true;
4914
4915   /* In case of reduction chain we switch to the first stmt in the chain, but
4916      we don't update STMT_INFO, since only the last stmt is marked as reduction
4917      and has reduction properties.  */
4918   if (GROUP_FIRST_ELEMENT (stmt_info)
4919       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4920     {
4921       stmt = GROUP_FIRST_ELEMENT (stmt_info);
4922       first_p = false;
4923     }
4924
4925   if (nested_in_vect_loop_p (loop, stmt))
4926     {
4927       outer_loop = loop;
4928       loop = loop->inner;
4929       nested_cycle = true;
4930     }
4931
4932   /* 1. Is vectorizable reduction?  */
4933   /* Not supportable if the reduction variable is used in the loop, unless
4934      it's a reduction chain.  */
4935   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4936       && !GROUP_FIRST_ELEMENT (stmt_info))
4937     return false;
4938
4939   /* Reductions that are not used even in an enclosing outer-loop,
4940      are expected to be "live" (used out of the loop).  */
4941   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4942       && !STMT_VINFO_LIVE_P (stmt_info))
4943     return false;
4944
4945   /* Make sure it was already recognized as a reduction computation.  */
4946   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4947       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4948     return false;
4949
4950   /* 2. Has this been recognized as a reduction pattern?
4951
4952      Check if STMT represents a pattern that has been recognized
4953      in earlier analysis stages.  For stmts that represent a pattern,
4954      the STMT_VINFO_RELATED_STMT field records the last stmt in
4955      the original sequence that constitutes the pattern.  */
4956
4957   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4958   if (orig_stmt)
4959     {
4960       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4961       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4962       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4963     }
4964
4965   /* 3. Check the operands of the operation.  The first operands are defined
4966         inside the loop body. The last operand is the reduction variable,
4967         which is defined by the loop-header-phi.  */
4968
4969   gcc_assert (is_gimple_assign (stmt));
4970
4971   /* Flatten RHS.  */
4972   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4973     {
4974     case GIMPLE_SINGLE_RHS:
4975       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4976       if (op_type == ternary_op)
4977         {
4978           tree rhs = gimple_assign_rhs1 (stmt);
4979           ops[0] = TREE_OPERAND (rhs, 0);
4980           ops[1] = TREE_OPERAND (rhs, 1);
4981           ops[2] = TREE_OPERAND (rhs, 2);
4982           code = TREE_CODE (rhs);
4983         }
4984       else
4985         return false;
4986       break;
4987
4988     case GIMPLE_BINARY_RHS:
4989       code = gimple_assign_rhs_code (stmt);
4990       op_type = TREE_CODE_LENGTH (code);
4991       gcc_assert (op_type == binary_op);
4992       ops[0] = gimple_assign_rhs1 (stmt);
4993       ops[1] = gimple_assign_rhs2 (stmt);
4994       break;
4995
4996     case GIMPLE_TERNARY_RHS:
4997       code = gimple_assign_rhs_code (stmt);
4998       op_type = TREE_CODE_LENGTH (code);
4999       gcc_assert (op_type == ternary_op);
5000       ops[0] = gimple_assign_rhs1 (stmt);
5001       ops[1] = gimple_assign_rhs2 (stmt);
5002       ops[2] = gimple_assign_rhs3 (stmt);
5003       break;
5004
5005     case GIMPLE_UNARY_RHS:
5006       return false;
5007
5008     default:
5009       gcc_unreachable ();
5010     }
5011   /* The default is that the reduction variable is the last in statement.  */
5012   int reduc_index = op_type - 1;
5013
5014   if (code == COND_EXPR && slp_node)
5015     return false;
5016
5017   scalar_dest = gimple_assign_lhs (stmt);
5018   scalar_type = TREE_TYPE (scalar_dest);
5019   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5020       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5021     return false;
5022
5023   /* Do not try to vectorize bit-precision reductions.  */
5024   if ((TYPE_PRECISION (scalar_type)
5025        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5026     return false;
5027
5028   /* All uses but the last are expected to be defined in the loop.
5029      The last use is the reduction variable.  In case of nested cycle this
5030      assumption is not true: we use reduc_index to record the index of the
5031      reduction variable.  */
5032   for (i = 0; i < op_type - 1; i++)
5033     {
5034       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5035       if (i == 0 && code == COND_EXPR)
5036         continue;
5037
5038       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5039                                             &def_stmt, &def, &dt, &tem);
5040       if (!vectype_in)
5041         vectype_in = tem;
5042       gcc_assert (is_simple_use);
5043
5044       if (dt != vect_internal_def
5045           && dt != vect_external_def
5046           && dt != vect_constant_def
5047           && dt != vect_induction_def
5048           && !(dt == vect_nested_cycle && nested_cycle))
5049         return false;
5050
5051       if (dt == vect_nested_cycle)
5052         {
5053           found_nested_cycle_def = true;
5054           reduc_def_stmt = def_stmt;
5055           reduc_index = i;
5056         }
5057     }
5058
5059   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5060                                         &def_stmt, &def, &dt, &tem);
5061   if (!vectype_in)
5062     vectype_in = tem;
5063   gcc_assert (is_simple_use);
5064   if (!found_nested_cycle_def)
5065     reduc_def_stmt = def_stmt;
5066
5067   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5068     return false;
5069
5070   if (!(dt == vect_reduction_def
5071         || dt == vect_nested_cycle
5072         || ((dt == vect_internal_def || dt == vect_external_def
5073              || dt == vect_constant_def || dt == vect_induction_def)
5074             && nested_cycle && found_nested_cycle_def)))
5075     {
5076       /* For pattern recognized stmts, orig_stmt might be a reduction,
5077          but some helper statements for the pattern might not, or
5078          might be COND_EXPRs with reduction uses in the condition.  */
5079       gcc_assert (orig_stmt);
5080       return false;
5081     }
5082
5083   gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5084                                          !nested_cycle, &dummy);
5085   if (orig_stmt)
5086     gcc_assert (tmp == orig_stmt
5087                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5088   else
5089     /* We changed STMT to be the first stmt in reduction chain, hence we
5090        check that in this case the first element in the chain is STMT.  */
5091     gcc_assert (stmt == tmp
5092                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5093
5094   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5095     return false;
5096
5097   if (slp_node || PURE_SLP_STMT (stmt_info))
5098     ncopies = 1;
5099   else
5100     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5101                / TYPE_VECTOR_SUBPARTS (vectype_in));
5102
5103   gcc_assert (ncopies >= 1);
5104
5105   vec_mode = TYPE_MODE (vectype_in);
5106
5107   if (code == COND_EXPR)
5108     {
5109       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5110         {
5111           if (dump_enabled_p ())
5112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5113                              "unsupported condition in reduction\n");
5114
5115           return false;
5116         }
5117     }
5118   else
5119     {
5120       /* 4. Supportable by target?  */
5121
5122       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5123           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5124         {
5125           /* Shifts and rotates are only supported by vectorizable_shifts,
5126              not vectorizable_reduction.  */
5127           if (dump_enabled_p ())
5128             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5129                              "unsupported shift or rotation.\n");
5130           return false;
5131         }
5132
5133       /* 4.1. check support for the operation in the loop  */
5134       optab = optab_for_tree_code (code, vectype_in, optab_default);
5135       if (!optab)
5136         {
5137           if (dump_enabled_p ())
5138             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5139                              "no optab.\n");
5140
5141           return false;
5142         }
5143
5144       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5145         {
5146           if (dump_enabled_p ())
5147             dump_printf (MSG_NOTE, "op not supported by target.\n");
5148
5149           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5150               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5151                   < vect_min_worthwhile_factor (code))
5152             return false;
5153
5154           if (dump_enabled_p ())
5155             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5156         }
5157
5158       /* Worthwhile without SIMD support?  */
5159       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5160           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5161              < vect_min_worthwhile_factor (code))
5162         {
5163           if (dump_enabled_p ())
5164             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5165                              "not worthwhile without SIMD support.\n");
5166
5167           return false;
5168         }
5169     }
5170
5171   /* 4.2. Check support for the epilog operation.
5172
5173           If STMT represents a reduction pattern, then the type of the
5174           reduction variable may be different than the type of the rest
5175           of the arguments.  For example, consider the case of accumulation
5176           of shorts into an int accumulator; The original code:
5177                         S1: int_a = (int) short_a;
5178           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5179
5180           was replaced with:
5181                         STMT: int_acc = widen_sum <short_a, int_acc>
5182
5183           This means that:
5184           1. The tree-code that is used to create the vector operation in the
5185              epilog code (that reduces the partial results) is not the
5186              tree-code of STMT, but is rather the tree-code of the original
5187              stmt from the pattern that STMT is replacing.  I.e, in the example
5188              above we want to use 'widen_sum' in the loop, but 'plus' in the
5189              epilog.
5190           2. The type (mode) we use to check available target support
5191              for the vector operation to be created in the *epilog*, is
5192              determined by the type of the reduction variable (in the example
5193              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5194              However the type (mode) we use to check available target support
5195              for the vector operation to be created *inside the loop*, is
5196              determined by the type of the other arguments to STMT (in the
5197              example we'd check this: optab_handler (widen_sum_optab,
5198              vect_short_mode)).
5199
5200           This is contrary to "regular" reductions, in which the types of all
5201           the arguments are the same as the type of the reduction variable.
5202           For "regular" reductions we can therefore use the same vector type
5203           (and also the same tree-code) when generating the epilog code and
5204           when generating the code inside the loop.  */
5205
5206   if (orig_stmt)
5207     {
5208       /* This is a reduction pattern: get the vectype from the type of the
5209          reduction variable, and get the tree-code from orig_stmt.  */
5210       orig_code = gimple_assign_rhs_code (orig_stmt);
5211       gcc_assert (vectype_out);
5212       vec_mode = TYPE_MODE (vectype_out);
5213     }
5214   else
5215     {
5216       /* Regular reduction: use the same vectype and tree-code as used for
5217          the vector code inside the loop can be used for the epilog code. */
5218       orig_code = code;
5219     }
5220
5221   if (nested_cycle)
5222     {
5223       def_bb = gimple_bb (reduc_def_stmt);
5224       def_stmt_loop = def_bb->loop_father;
5225       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5226                                        loop_preheader_edge (def_stmt_loop));
5227       if (TREE_CODE (def_arg) == SSA_NAME
5228           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5229           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5230           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5231           && vinfo_for_stmt (def_arg_stmt)
5232           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5233               == vect_double_reduction_def)
5234         double_reduc = true;
5235     }
5236
5237   epilog_reduc_code = ERROR_MARK;
5238   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5239     {
5240       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5241                                          optab_default);
5242       if (!reduc_optab)
5243         {
5244           if (dump_enabled_p ())
5245             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5246                              "no optab for reduction.\n");
5247
5248           epilog_reduc_code = ERROR_MARK;
5249         }
5250       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5251         {
5252           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5253           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5254             {
5255               if (dump_enabled_p ())
5256                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5257                                  "reduc op not supported by target.\n");
5258
5259               epilog_reduc_code = ERROR_MARK;
5260             }
5261         }
5262     }
5263   else
5264     {
5265       if (!nested_cycle || double_reduc)
5266         {
5267           if (dump_enabled_p ())
5268             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5269                              "no reduc code for scalar code.\n");
5270
5271           return false;
5272         }
5273     }
5274
5275   if (double_reduc && ncopies > 1)
5276     {
5277       if (dump_enabled_p ())
5278         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5279                          "multiple types in double reduction\n");
5280
5281       return false;
5282     }
5283
5284   /* In case of widenning multiplication by a constant, we update the type
5285      of the constant to be the type of the other operand.  We check that the
5286      constant fits the type in the pattern recognition pass.  */
5287   if (code == DOT_PROD_EXPR
5288       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5289     {
5290       if (TREE_CODE (ops[0]) == INTEGER_CST)
5291         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5292       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5293         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5294       else
5295         {
5296           if (dump_enabled_p ())
5297             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5298                              "invalid types in dot-prod\n");
5299
5300           return false;
5301         }
5302     }
5303
5304   if (!vec_stmt) /* transformation not required.  */
5305     {
5306       if (first_p
5307           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5308                                          reduc_index))
5309         return false;
5310       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5311       return true;
5312     }
5313
5314   /** Transform.  **/
5315
5316   if (dump_enabled_p ())
5317     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5318
5319   /* FORNOW: Multiple types are not supported for condition.  */
5320   if (code == COND_EXPR)
5321     gcc_assert (ncopies == 1);
5322
5323   /* Create the destination vector  */
5324   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5325
5326   /* In case the vectorization factor (VF) is bigger than the number
5327      of elements that we can fit in a vectype (nunits), we have to generate
5328      more than one vector stmt - i.e - we need to "unroll" the
5329      vector stmt by a factor VF/nunits.  For more details see documentation
5330      in vectorizable_operation.  */
5331
5332   /* If the reduction is used in an outer loop we need to generate
5333      VF intermediate results, like so (e.g. for ncopies=2):
5334         r0 = phi (init, r0)
5335         r1 = phi (init, r1)
5336         r0 = x0 + r0;
5337         r1 = x1 + r1;
5338     (i.e. we generate VF results in 2 registers).
5339     In this case we have a separate def-use cycle for each copy, and therefore
5340     for each copy we get the vector def for the reduction variable from the
5341     respective phi node created for this copy.
5342
5343     Otherwise (the reduction is unused in the loop nest), we can combine
5344     together intermediate results, like so (e.g. for ncopies=2):
5345         r = phi (init, r)
5346         r = x0 + r;
5347         r = x1 + r;
5348    (i.e. we generate VF/2 results in a single register).
5349    In this case for each copy we get the vector def for the reduction variable
5350    from the vectorized reduction operation generated in the previous iteration.
5351   */
5352
5353   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5354     {
5355       single_defuse_cycle = true;
5356       epilog_copies = 1;
5357     }
5358   else
5359     epilog_copies = ncopies;
5360
5361   prev_stmt_info = NULL;
5362   prev_phi_info = NULL;
5363   if (slp_node)
5364     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5365   else
5366     {
5367       vec_num = 1;
5368       vec_oprnds0.create (1);
5369       if (op_type == ternary_op)
5370         vec_oprnds1.create (1);
5371     }
5372
5373   phis.create (vec_num);
5374   vect_defs.create (vec_num);
5375   if (!slp_node)
5376     vect_defs.quick_push (NULL_TREE);
5377
5378   for (j = 0; j < ncopies; j++)
5379     {
5380       if (j == 0 || !single_defuse_cycle)
5381         {
5382           for (i = 0; i < vec_num; i++)
5383             {
5384               /* Create the reduction-phi that defines the reduction
5385                  operand.  */
5386               new_phi = create_phi_node (vec_dest, loop->header);
5387               set_vinfo_for_stmt (new_phi,
5388                                   new_stmt_vec_info (new_phi, loop_vinfo,
5389                                                      NULL));
5390                if (j == 0 || slp_node)
5391                  phis.quick_push (new_phi);
5392             }
5393         }
5394
5395       if (code == COND_EXPR)
5396         {
5397           gcc_assert (!slp_node);
5398           vectorizable_condition (stmt, gsi, vec_stmt,
5399                                   PHI_RESULT (phis[0]),
5400                                   reduc_index, NULL);
5401           /* Multiple types are not supported for condition.  */
5402           break;
5403         }
5404
5405       /* Handle uses.  */
5406       if (j == 0)
5407         {
5408           op0 = ops[!reduc_index];
5409           if (op_type == ternary_op)
5410             {
5411               if (reduc_index == 0)
5412                 op1 = ops[2];
5413               else
5414                 op1 = ops[1];
5415             }
5416
5417           if (slp_node)
5418             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5419                                slp_node, -1);
5420           else
5421             {
5422               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5423                                                             stmt, NULL);
5424               vec_oprnds0.quick_push (loop_vec_def0);
5425               if (op_type == ternary_op)
5426                {
5427                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5428                                                                NULL);
5429                  vec_oprnds1.quick_push (loop_vec_def1);
5430                }
5431             }
5432         }
5433       else
5434         {
5435           if (!slp_node)
5436             {
5437               enum vect_def_type dt;
5438               gimple dummy_stmt;
5439               tree dummy;
5440
5441               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5442                                   &dummy_stmt, &dummy, &dt);
5443               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5444                                                               loop_vec_def0);
5445               vec_oprnds0[0] = loop_vec_def0;
5446               if (op_type == ternary_op)
5447                 {
5448                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5449                                       &dummy, &dt);
5450                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5451                                                                 loop_vec_def1);
5452                   vec_oprnds1[0] = loop_vec_def1;
5453                 }
5454             }
5455
5456           if (single_defuse_cycle)
5457             reduc_def = gimple_assign_lhs (new_stmt);
5458
5459           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5460         }
5461
5462       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5463         {
5464           if (slp_node)
5465             reduc_def = PHI_RESULT (phis[i]);
5466           else
5467             {
5468               if (!single_defuse_cycle || j == 0)
5469                 reduc_def = PHI_RESULT (new_phi);
5470             }
5471
5472           def1 = ((op_type == ternary_op)
5473                   ? vec_oprnds1[i] : NULL);
5474           if (op_type == binary_op)
5475             {
5476               if (reduc_index == 0)
5477                 expr = build2 (code, vectype_out, reduc_def, def0);
5478               else
5479                 expr = build2 (code, vectype_out, def0, reduc_def);
5480             }
5481           else
5482             {
5483               if (reduc_index == 0)
5484                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5485               else
5486                 {
5487                   if (reduc_index == 1)
5488                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5489                   else
5490                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5491                 }
5492             }
5493
5494           new_stmt = gimple_build_assign (vec_dest, expr);
5495           new_temp = make_ssa_name (vec_dest, new_stmt);
5496           gimple_assign_set_lhs (new_stmt, new_temp);
5497           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5498
5499           if (slp_node)
5500             {
5501               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5502               vect_defs.quick_push (new_temp);
5503             }
5504           else
5505             vect_defs[0] = new_temp;
5506         }
5507
5508       if (slp_node)
5509         continue;
5510
5511       if (j == 0)
5512         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5513       else
5514         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5515
5516       prev_stmt_info = vinfo_for_stmt (new_stmt);
5517       prev_phi_info = vinfo_for_stmt (new_phi);
5518     }
5519
5520   /* Finalize the reduction-phi (set its arguments) and create the
5521      epilog reduction code.  */
5522   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5523     {
5524       new_temp = gimple_assign_lhs (*vec_stmt);
5525       vect_defs[0] = new_temp;
5526     }
5527
5528   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5529                                     epilog_reduc_code, phis, reduc_index,
5530                                     double_reduc, slp_node);
5531
5532   return true;
5533 }
5534
5535 /* Function vect_min_worthwhile_factor.
5536
5537    For a loop where we could vectorize the operation indicated by CODE,
5538    return the minimum vectorization factor that makes it worthwhile
5539    to use generic vectors.  */
5540 int
5541 vect_min_worthwhile_factor (enum tree_code code)
5542 {
5543   switch (code)
5544     {
5545     case PLUS_EXPR:
5546     case MINUS_EXPR:
5547     case NEGATE_EXPR:
5548       return 4;
5549
5550     case BIT_AND_EXPR:
5551     case BIT_IOR_EXPR:
5552     case BIT_XOR_EXPR:
5553     case BIT_NOT_EXPR:
5554       return 2;
5555
5556     default:
5557       return INT_MAX;
5558     }
5559 }
5560
5561
5562 /* Function vectorizable_induction
5563
5564    Check if PHI performs an induction computation that can be vectorized.
5565    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5566    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5567    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5568
5569 bool
5570 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5571                         gimple *vec_stmt)
5572 {
5573   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5574   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5575   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5576   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5577   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5578   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5579   tree vec_def;
5580
5581   gcc_assert (ncopies >= 1);
5582   /* FORNOW. These restrictions should be relaxed.  */
5583   if (nested_in_vect_loop_p (loop, phi))
5584     {
5585       imm_use_iterator imm_iter;
5586       use_operand_p use_p;
5587       gimple exit_phi;
5588       edge latch_e;
5589       tree loop_arg;
5590
5591       if (ncopies > 1)
5592         {
5593           if (dump_enabled_p ())
5594             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5595                              "multiple types in nested loop.\n");
5596           return false;
5597         }
5598
5599       exit_phi = NULL;
5600       latch_e = loop_latch_edge (loop->inner);
5601       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5602       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5603         {
5604           gimple use_stmt = USE_STMT (use_p);
5605           if (is_gimple_debug (use_stmt))
5606             continue;
5607
5608           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5609             {
5610               exit_phi = use_stmt;
5611               break;
5612             }
5613         }
5614       if (exit_phi)
5615         {
5616           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5617           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5618                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5619             {
5620               if (dump_enabled_p ())
5621                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5622                                  "inner-loop induction only used outside "
5623                                  "of the outer vectorized loop.\n");
5624               return false;
5625             }
5626         }
5627     }
5628
5629   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5630     return false;
5631
5632   /* FORNOW: SLP not supported.  */
5633   if (STMT_SLP_TYPE (stmt_info))
5634     return false;
5635
5636   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5637
5638   if (gimple_code (phi) != GIMPLE_PHI)
5639     return false;
5640
5641   if (!vec_stmt) /* transformation not required.  */
5642     {
5643       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5644       if (dump_enabled_p ())
5645         dump_printf_loc (MSG_NOTE, vect_location,
5646                          "=== vectorizable_induction ===\n");
5647       vect_model_induction_cost (stmt_info, ncopies);
5648       return true;
5649     }
5650
5651   /** Transform.  **/
5652
5653   if (dump_enabled_p ())
5654     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5655
5656   vec_def = get_initial_def_for_induction (phi);
5657   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5658   return true;
5659 }
5660
5661 /* Function vectorizable_live_operation.
5662
5663    STMT computes a value that is used outside the loop.  Check if
5664    it can be supported.  */
5665
5666 bool
5667 vectorizable_live_operation (gimple stmt,
5668                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5669                              gimple *vec_stmt)
5670 {
5671   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5672   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5673   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5674   int i;
5675   int op_type;
5676   tree op;
5677   tree def;
5678   gimple def_stmt;
5679   enum vect_def_type dt;
5680   enum tree_code code;
5681   enum gimple_rhs_class rhs_class;
5682
5683   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5684
5685   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5686     return false;
5687
5688   if (!is_gimple_assign (stmt))
5689     {
5690       if (gimple_call_internal_p (stmt)
5691           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5692           && gimple_call_lhs (stmt)
5693           && loop->simduid
5694           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5695           && loop->simduid
5696              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5697         {
5698           edge e = single_exit (loop);
5699           basic_block merge_bb = e->dest;
5700           imm_use_iterator imm_iter;
5701           use_operand_p use_p;
5702           tree lhs = gimple_call_lhs (stmt);
5703
5704           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5705             {
5706               gimple use_stmt = USE_STMT (use_p);
5707               if (gimple_code (use_stmt) == GIMPLE_PHI
5708                   && gimple_bb (use_stmt) == merge_bb)
5709                 {
5710                   if (vec_stmt)
5711                     {
5712                       tree vfm1
5713                         = build_int_cst (unsigned_type_node,
5714                                          loop_vinfo->vectorization_factor - 1);
5715                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5716                     }
5717                   return true;
5718                 }
5719             }
5720         }
5721
5722       return false;
5723     }
5724
5725   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5726     return false;
5727
5728   /* FORNOW. CHECKME. */
5729   if (nested_in_vect_loop_p (loop, stmt))
5730     return false;
5731
5732   code = gimple_assign_rhs_code (stmt);
5733   op_type = TREE_CODE_LENGTH (code);
5734   rhs_class = get_gimple_rhs_class (code);
5735   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5736   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5737
5738   /* FORNOW: support only if all uses are invariant.  This means
5739      that the scalar operations can remain in place, unvectorized.
5740      The original last scalar value that they compute will be used.  */
5741
5742   for (i = 0; i < op_type; i++)
5743     {
5744       if (rhs_class == GIMPLE_SINGLE_RHS)
5745         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5746       else
5747         op = gimple_op (stmt, i + 1);
5748       if (op
5749           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5750                                   &dt))
5751         {
5752           if (dump_enabled_p ())
5753             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5754                              "use not simple.\n");
5755           return false;
5756         }
5757
5758       if (dt != vect_external_def && dt != vect_constant_def)
5759         return false;
5760     }
5761
5762   /* No transformation is required for the cases we currently support.  */
5763   return true;
5764 }
5765
5766 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5767
5768 static void
5769 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5770 {
5771   ssa_op_iter op_iter;
5772   imm_use_iterator imm_iter;
5773   def_operand_p def_p;
5774   gimple ustmt;
5775
5776   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5777     {
5778       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5779         {
5780           basic_block bb;
5781
5782           if (!is_gimple_debug (ustmt))
5783             continue;
5784
5785           bb = gimple_bb (ustmt);
5786
5787           if (!flow_bb_inside_loop_p (loop, bb))
5788             {
5789               if (gimple_debug_bind_p (ustmt))
5790                 {
5791                   if (dump_enabled_p ())
5792                     dump_printf_loc (MSG_NOTE, vect_location,
5793                                      "killing debug use\n");
5794
5795                   gimple_debug_bind_reset_value (ustmt);
5796                   update_stmt (ustmt);
5797                 }
5798               else
5799                 gcc_unreachable ();
5800             }
5801         }
5802     }
5803 }
5804
5805
5806 /* This function builds ni_name = number of iterations.  Statements
5807    are emitted on the loop preheader edge.  */
5808
5809 static tree
5810 vect_build_loop_niters (loop_vec_info loop_vinfo)
5811 {
5812   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5813   if (TREE_CODE (ni) == INTEGER_CST)
5814     return ni;
5815   else
5816     {
5817       tree ni_name, var;
5818       gimple_seq stmts = NULL;
5819       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5820
5821       var = create_tmp_var (TREE_TYPE (ni), "niters");
5822       ni_name = force_gimple_operand (ni, &stmts, false, var);
5823       if (stmts)
5824         gsi_insert_seq_on_edge_immediate (pe, stmts);
5825
5826       return ni_name;
5827     }
5828 }
5829
5830
5831 /* This function generates the following statements:
5832
5833    ni_name = number of iterations loop executes
5834    ratio = ni_name / vf
5835    ratio_mult_vf_name = ratio * vf
5836
5837    and places them on the loop preheader edge.  */
5838
5839 static void
5840 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5841                                  tree ni_name,
5842                                  tree *ratio_mult_vf_name_ptr,
5843                                  tree *ratio_name_ptr)
5844 {
5845   tree ni_minus_gap_name;
5846   tree var;
5847   tree ratio_name;
5848   tree ratio_mult_vf_name;
5849   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5850   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5851   tree log_vf;
5852
5853   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5854
5855   /* If epilogue loop is required because of data accesses with gaps, we
5856      subtract one iteration from the total number of iterations here for
5857      correct calculation of RATIO.  */
5858   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5859     {
5860       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5861                                        ni_name,
5862                                        build_one_cst (TREE_TYPE (ni_name)));
5863       if (!is_gimple_val (ni_minus_gap_name))
5864         {
5865           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5866           gimple stmts = NULL;
5867           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5868                                                     true, var);
5869           gsi_insert_seq_on_edge_immediate (pe, stmts);
5870         }
5871     }
5872   else
5873     ni_minus_gap_name = ni_name;
5874
5875   /* Create: ratio = ni >> log2(vf) */
5876   /* ???  As we have ni == number of latch executions + 1, ni could
5877      have overflown to zero.  So avoid computing ratio based on ni
5878      but compute it using the fact that we know ratio will be at least
5879      one, thus via (ni - vf) >> log2(vf) + 1.  */
5880   ratio_name
5881     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5882                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5883                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5884                                              ni_minus_gap_name,
5885                                              build_int_cst
5886                                                (TREE_TYPE (ni_name), vf)),
5887                                 log_vf),
5888                    build_int_cst (TREE_TYPE (ni_name), 1));
5889   if (!is_gimple_val (ratio_name))
5890     {
5891       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5892       gimple stmts = NULL;
5893       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5894       gsi_insert_seq_on_edge_immediate (pe, stmts);
5895     }
5896   *ratio_name_ptr = ratio_name;
5897
5898   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5899
5900   if (ratio_mult_vf_name_ptr)
5901     {
5902       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5903                                         ratio_name, log_vf);
5904       if (!is_gimple_val (ratio_mult_vf_name))
5905         {
5906           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5907           gimple stmts = NULL;
5908           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5909                                                      true, var);
5910           gsi_insert_seq_on_edge_immediate (pe, stmts);
5911         }
5912       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5913     }
5914
5915   return;
5916 }
5917
5918
5919 /* Function vect_transform_loop.
5920
5921    The analysis phase has determined that the loop is vectorizable.
5922    Vectorize the loop - create vectorized stmts to replace the scalar
5923    stmts in the loop, and update the loop exit condition.  */
5924
5925 void
5926 vect_transform_loop (loop_vec_info loop_vinfo)
5927 {
5928   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5929   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5930   int nbbs = loop->num_nodes;
5931   int i;
5932   tree ratio = NULL;
5933   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5934   bool grouped_store;
5935   bool slp_scheduled = false;
5936   gimple stmt, pattern_stmt;
5937   gimple_seq pattern_def_seq = NULL;
5938   gimple_stmt_iterator pattern_def_si = gsi_none ();
5939   bool transform_pattern_stmt = false;
5940   bool check_profitability = false;
5941   int th;
5942   /* Record number of iterations before we started tampering with the profile. */
5943   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5944
5945   if (dump_enabled_p ())
5946     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5947
5948   /* If profile is inprecise, we have chance to fix it up.  */
5949   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5950     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5951
5952   /* Use the more conservative vectorization threshold.  If the number
5953      of iterations is constant assume the cost check has been performed
5954      by our caller.  If the threshold makes all loops profitable that
5955      run at least the vectorization factor number of times checking
5956      is pointless, too.  */
5957   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5958   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5959       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5960     {
5961       if (dump_enabled_p ())
5962         dump_printf_loc (MSG_NOTE, vect_location,
5963                          "Profitability threshold is %d loop iterations.\n",
5964                          th);
5965       check_profitability = true;
5966     }
5967
5968   /* Version the loop first, if required, so the profitability check
5969      comes first.  */
5970
5971   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5972       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5973     {
5974       vect_loop_versioning (loop_vinfo, th, check_profitability);
5975       check_profitability = false;
5976     }
5977
5978   tree ni_name = vect_build_loop_niters (loop_vinfo);
5979   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5980
5981   /* Peel the loop if there are data refs with unknown alignment.
5982      Only one data ref with unknown store is allowed.  */
5983
5984   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5985     {
5986       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5987                                      th, check_profitability);
5988       check_profitability = false;
5989       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5990          be re-computed.  */
5991       ni_name = NULL_TREE;
5992     }
5993
5994   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5995      compile time constant), or it is a constant that doesn't divide by the
5996      vectorization factor, then an epilog loop needs to be created.
5997      We therefore duplicate the loop: the original loop will be vectorized,
5998      and will compute the first (n/VF) iterations.  The second copy of the loop
5999      will remain scalar and will compute the remaining (n%VF) iterations.
6000      (VF is the vectorization factor).  */
6001
6002   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6003       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6004     {
6005       tree ratio_mult_vf;
6006       if (!ni_name)
6007         ni_name = vect_build_loop_niters (loop_vinfo);
6008       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6009                                        &ratio);
6010       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6011                                       th, check_profitability);
6012     }
6013   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6014     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6015                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6016   else
6017     {
6018       if (!ni_name)
6019         ni_name = vect_build_loop_niters (loop_vinfo);
6020       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6021     }
6022
6023   /* 1) Make sure the loop header has exactly two entries
6024      2) Make sure we have a preheader basic block.  */
6025
6026   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6027
6028   split_edge (loop_preheader_edge (loop));
6029
6030   /* FORNOW: the vectorizer supports only loops which body consist
6031      of one basic block (header + empty latch). When the vectorizer will
6032      support more involved loop forms, the order by which the BBs are
6033      traversed need to be reconsidered.  */
6034
6035   for (i = 0; i < nbbs; i++)
6036     {
6037       basic_block bb = bbs[i];
6038       stmt_vec_info stmt_info;
6039
6040       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6041            gsi_next (&si))
6042         {
6043           gphi *phi = si.phi ();
6044           if (dump_enabled_p ())
6045             {
6046               dump_printf_loc (MSG_NOTE, vect_location,
6047                                "------>vectorizing phi: ");
6048               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6049               dump_printf (MSG_NOTE, "\n");
6050             }
6051           stmt_info = vinfo_for_stmt (phi);
6052           if (!stmt_info)
6053             continue;
6054
6055           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6056             vect_loop_kill_debug_uses (loop, phi);
6057
6058           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6059               && !STMT_VINFO_LIVE_P (stmt_info))
6060             continue;
6061
6062           if (STMT_VINFO_VECTYPE (stmt_info)
6063               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6064                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6065               && dump_enabled_p ())
6066             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6067
6068           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6069             {
6070               if (dump_enabled_p ())
6071                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6072               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6073             }
6074         }
6075
6076       pattern_stmt = NULL;
6077       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6078            !gsi_end_p (si) || transform_pattern_stmt;)
6079         {
6080           bool is_store;
6081
6082           if (transform_pattern_stmt)
6083             stmt = pattern_stmt;
6084           else
6085             {
6086               stmt = gsi_stmt (si);
6087               /* During vectorization remove existing clobber stmts.  */
6088               if (gimple_clobber_p (stmt))
6089                 {
6090                   unlink_stmt_vdef (stmt);
6091                   gsi_remove (&si, true);
6092                   release_defs (stmt);
6093                   continue;
6094                 }
6095             }
6096
6097           if (dump_enabled_p ())
6098             {
6099               dump_printf_loc (MSG_NOTE, vect_location,
6100                                "------>vectorizing statement: ");
6101               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6102               dump_printf (MSG_NOTE, "\n");
6103             }
6104
6105           stmt_info = vinfo_for_stmt (stmt);
6106
6107           /* vector stmts created in the outer-loop during vectorization of
6108              stmts in an inner-loop may not have a stmt_info, and do not
6109              need to be vectorized.  */
6110           if (!stmt_info)
6111             {
6112               gsi_next (&si);
6113               continue;
6114             }
6115
6116           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6117             vect_loop_kill_debug_uses (loop, stmt);
6118
6119           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6120               && !STMT_VINFO_LIVE_P (stmt_info))
6121             {
6122               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6123                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6124                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6125                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6126                 {
6127                   stmt = pattern_stmt;
6128                   stmt_info = vinfo_for_stmt (stmt);
6129                 }
6130               else
6131                 {
6132                   gsi_next (&si);
6133                   continue;
6134                 }
6135             }
6136           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6137                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6138                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6139                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6140             transform_pattern_stmt = true;
6141
6142           /* If pattern statement has def stmts, vectorize them too.  */
6143           if (is_pattern_stmt_p (stmt_info))
6144             {
6145               if (pattern_def_seq == NULL)
6146                 {
6147                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6148                   pattern_def_si = gsi_start (pattern_def_seq);
6149                 }
6150               else if (!gsi_end_p (pattern_def_si))
6151                 gsi_next (&pattern_def_si);
6152               if (pattern_def_seq != NULL)
6153                 {
6154                   gimple pattern_def_stmt = NULL;
6155                   stmt_vec_info pattern_def_stmt_info = NULL;
6156
6157                   while (!gsi_end_p (pattern_def_si))
6158                     {
6159                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6160                       pattern_def_stmt_info
6161                         = vinfo_for_stmt (pattern_def_stmt);
6162                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6163                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6164                         break;
6165                       gsi_next (&pattern_def_si);
6166                     }
6167
6168                   if (!gsi_end_p (pattern_def_si))
6169                     {
6170                       if (dump_enabled_p ())
6171                         {
6172                           dump_printf_loc (MSG_NOTE, vect_location,
6173                                            "==> vectorizing pattern def "
6174                                            "stmt: ");
6175                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6176                                             pattern_def_stmt, 0);
6177                           dump_printf (MSG_NOTE, "\n");
6178                         }
6179
6180                       stmt = pattern_def_stmt;
6181                       stmt_info = pattern_def_stmt_info;
6182                     }
6183                   else
6184                     {
6185                       pattern_def_si = gsi_none ();
6186                       transform_pattern_stmt = false;
6187                     }
6188                 }
6189               else
6190                 transform_pattern_stmt = false;
6191             }
6192
6193           if (STMT_VINFO_VECTYPE (stmt_info))
6194             {
6195               unsigned int nunits
6196                 = (unsigned int)
6197                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6198               if (!STMT_SLP_TYPE (stmt_info)
6199                   && nunits != (unsigned int) vectorization_factor
6200                   && dump_enabled_p ())
6201                   /* For SLP VF is set according to unrolling factor, and not
6202                      to vector size, hence for SLP this print is not valid.  */
6203                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6204             }
6205
6206           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6207              reached.  */
6208           if (STMT_SLP_TYPE (stmt_info))
6209             {
6210               if (!slp_scheduled)
6211                 {
6212                   slp_scheduled = true;
6213
6214                   if (dump_enabled_p ())
6215                     dump_printf_loc (MSG_NOTE, vect_location,
6216                                      "=== scheduling SLP instances ===\n");
6217
6218                   vect_schedule_slp (loop_vinfo, NULL);
6219                 }
6220
6221               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6222               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6223                 {
6224                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6225                     {
6226                       pattern_def_seq = NULL;
6227                       gsi_next (&si);
6228                     }
6229                   continue;
6230                 }
6231             }
6232
6233           /* -------- vectorize statement ------------ */
6234           if (dump_enabled_p ())
6235             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6236
6237           grouped_store = false;
6238           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6239           if (is_store)
6240             {
6241               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6242                 {
6243                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6244                      interleaving chain was completed - free all the stores in
6245                      the chain.  */
6246                   gsi_next (&si);
6247                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6248                 }
6249               else
6250                 {
6251                   /* Free the attached stmt_vec_info and remove the stmt.  */
6252                   gimple store = gsi_stmt (si);
6253                   free_stmt_vec_info (store);
6254                   unlink_stmt_vdef (store);
6255                   gsi_remove (&si, true);
6256                   release_defs (store);
6257                 }
6258
6259               /* Stores can only appear at the end of pattern statements.  */
6260               gcc_assert (!transform_pattern_stmt);
6261               pattern_def_seq = NULL;
6262             }
6263           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6264             {
6265               pattern_def_seq = NULL;
6266               gsi_next (&si);
6267             }
6268         }                       /* stmts in BB */
6269     }                           /* BBs in loop */
6270
6271   slpeel_make_loop_iterate_ntimes (loop, ratio);
6272
6273   /* Reduce loop iterations by the vectorization factor.  */
6274   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6275                       expected_iterations / vectorization_factor);
6276   loop->nb_iterations_upper_bound
6277     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6278   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6279       && loop->nb_iterations_upper_bound != 0)
6280     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6281   if (loop->any_estimate)
6282     {
6283       loop->nb_iterations_estimate
6284         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6285        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6286            && loop->nb_iterations_estimate != 0)
6287          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6288     }
6289
6290   if (dump_enabled_p ())
6291     {
6292       dump_printf_loc (MSG_NOTE, vect_location,
6293                        "LOOP VECTORIZED\n");
6294       if (loop->inner)
6295         dump_printf_loc (MSG_NOTE, vect_location,
6296                          "OUTER LOOP VECTORIZED\n");
6297       dump_printf (MSG_NOTE, "\n");
6298     }
6299 }