gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "cfgloop.h"
  45 #include "params.h"
  46 #include "tree-scalar-evolution.h"
  47 #include "tree-vectorizer.h"
  48 #include "gimple-fold.h"
  49 #include "cgraph.h"
  50
  51 /* Loop Vectorization Pass.
  52
  53    This pass tries to vectorize loops.
  54
  55    For example, the vectorizer transforms the following simple loop:
  56
  57         short a[N]; short b[N]; short c[N]; int i;
  58
  59         for (i=0; i<N; i++){
  60           a[i] = b[i] + c[i];
  61         }
  62
  63    as if it was manually vectorized by rewriting the source code into:
  64
  65         typedef int __attribute__((mode(V8HI))) v8hi;
  66         short a[N];  short b[N]; short c[N];   int i;
  67         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  68         v8hi va, vb, vc;
  69
  70         for (i=0; i<N/8; i++){
  71           vb = pb[i];
  72           vc = pc[i];
  73           va = vb + vc;
  74           pa[i] = va;
  75         }
  76
  77         The main entry to this pass is vectorize_loops(), in which
  78    the vectorizer applies a set of analyses on a given set of loops,
  79    followed by the actual vectorization transformation for the loops that
  80    had successfully passed the analysis phase.
  81         Throughout this pass we make a distinction between two types of
  82    data: scalars (which are represented by SSA_NAMES), and memory references
  83    ("data-refs").  These two types of data require different handling both
  84    during analysis and transformation. The types of data-refs that the
  85    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  86    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  87    accesses are required to have a simple (consecutive) access pattern.
  88
  89    Analysis phase:
  90    ===============
  91         The driver for the analysis phase is vect_analyze_loop().
  92    It applies a set of analyses, some of which rely on the scalar evolution
  93    analyzer (scev) developed by Sebastian Pop.
  94
  95         During the analysis phase the vectorizer records some information
  96    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  97    loop, as well as general information about the loop as a whole, which is
  98    recorded in a "loop_vec_info" struct attached to each loop.
  99
 100    Transformation phase:
 101    =====================
 102         The loop transformation phase scans all the stmts in the loop, and
 103    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 104    the loop that needs to be vectorized.  It inserts the vector code sequence
 105    just before the scalar stmt S, and records a pointer to the vector code
 106    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 107    attached to S).  This pointer will be used for the vectorization of following
 108    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 109    otherwise, we rely on dead code elimination for removing it.
 110
 111         For example, say stmt S1 was vectorized into stmt VS1:
 112
 113    VS1: vb = px[i];
 114    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 115    S2:  a = b;
 116
 117    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 118    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 119    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 120    resulting sequence would be:
 121
 122    VS1: vb = px[i];
 123    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 124    VS2: va = vb;
 125    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 126
 127         Operands that are not SSA_NAMEs are data-refs that appear in
 128    load/store operations (like 'x[i]' in S1), and are handled differently.
 129
 130    Target modeling:
 131    =================
 132         Currently the only target specific information that is used is the
 133    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 134    Targets that can support different sizes of vectors, for now will need
 135    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 136    flexibility will be added in the future.
 137
 138         Since we only vectorize operations whose vector form can be
 139    expressed using existing tree codes, to verify that an operation is
 140    supported, the vectorizer checks the relevant optab at the relevant
 141    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 142    the value found is CODE_FOR_nothing, then there's no target support, and
 143    we can't vectorize the stmt.
 144
 145    For additional information on this project see:
 146    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 147 */
 148
 149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 150
 151 /* Function vect_determine_vectorization_factor
 152
 153    Determine the vectorization factor (VF).  VF is the number of data elements
 154    that are operated upon in parallel in a single iteration of the vectorized
 155    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 156    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 157    elements can fit in a single vector register.
 158
 159    VF is taken from the vector type with the largest number of units among
 160    the relevant PHIs and statements of the loop.  Analysis fails if, for a
 161    single statement, its vector type and the vector type chosen for the VF
 162    computation have different mode sizes.
 163
 164    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 165    original loop:
 166         for (i=0; i<N; i++){
 167           a[i] = b[i] + c[i];
 168         }
 169
 170    vectorized loop:
 171         for (i=0; i<N; i+=VF){
 172           a[i:VF] = b[i:VF] + c[i:VF];
 173         }

        LOOP_VINFO describes the loop being analyzed.  On success the function
        stores VF in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO), fills in
        STMT_VINFO_VECTYPE for the examined PHIs and statements that did not
        have one yet, and returns true.  It returns false (dumping the reason
        when dumping is enabled) if a statement without lhs is neither a call
        nor a MASK_STORE, a statement already has a vector mode, no vector type
        is found for a scalar type, the resulting VF is <= 1, or no mask type
        can be determined for a statement with a boolean result.
 174 */
 175
 176 static bool
 177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 178 {
 179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 180   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 181   unsigned nbbs = loop->num_nodes;
 182   unsigned int vectorization_factor = 0;
 183   tree scalar_type;
 184   gphi *phi;
 185   tree vectype;
 186   unsigned int nunits;
 187   stmt_vec_info stmt_info;
 188   unsigned i;
 189   HOST_WIDE_INT dummy;
       /* STMT is the statement currently examined.  PATTERN_STMT,
          PATTERN_DEF_SEQ, PATTERN_DEF_SI and ANALYZE_PATTERN_STMT carry state
          from one iteration of the statement loop below to the next.  */
 190   gimple *stmt, *pattern_stmt = NULL;
 191   gimple_seq pattern_def_seq = NULL;
 192   gimple_stmt_iterator pattern_def_si = gsi_none ();
 193   bool analyze_pattern_stmt = false;
 194   bool bool_result;
       /* Statements with a scalar boolean result.  Their vectype is assigned
          in the final loop of this function.  */
 195   auto_vec<stmt_vec_info> mask_producers;
 196
 197   if (dump_enabled_p ())
 198     dump_printf_loc (MSG_NOTE, vect_location,
 199                      "=== vect_determine_vectorization_factor ===\n");
 200
 201   for (i = 0; i < nbbs; i++)
 202     {
 203       basic_block bb = bbs[i];
 204
           /* Give every relevant PHI of BB a vectype based on the type of its
              result, and let it contribute to the VF.  */
 205       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 206            gsi_next (&si))
 207         {
 208           phi = si.phi ();
 209           stmt_info = vinfo_for_stmt (phi);
 210           if (dump_enabled_p ())
 211             {
 212               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 213               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 214               dump_printf (MSG_NOTE, "\n");
 215             }
 216
 217           gcc_assert (stmt_info);
 218
 219           if (STMT_VINFO_RELEVANT_P (stmt_info))
 220             {
 221               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 222               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 223
 224               if (dump_enabled_p ())
 225                 {
 226                   dump_printf_loc (MSG_NOTE, vect_location,
 227                                    "get vectype for scalar type:  ");
 228                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 229                   dump_printf (MSG_NOTE, "\n");
 230                 }
 231
 232               vectype = get_vectype_for_scalar_type (scalar_type);
 233               if (!vectype)
 234                 {
 235                   if (dump_enabled_p ())
 236                     {
 237                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 238                                        "not vectorized: unsupported "
 239                                        "data-type ");
 240                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 241                                          scalar_type);
 242                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 243                     }
 244                   return false;
 245                 }
 246               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 247
 248               if (dump_enabled_p ())
 249                 {
 250                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 251                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 252                   dump_printf (MSG_NOTE, "\n");
 253                 }
 254
 255               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 256               if (dump_enabled_p ())
 257                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 258                                  nunits);
 259
                   /* VF is the maximum NUNITS seen so far.  */
 260               if (!vectorization_factor
 261                   || (nunits > vectorization_factor))
 262                 vectorization_factor = nunits;
 263             }
 264         }
 265
           /* Walk the statements of BB.  SI is advanced explicitly inside the
              body rather than in the loop header, because one position may be
              examined more than once: for the statement itself, for its
              related pattern stmt, and for relevant or live stmts in the
              pattern's def sequence.  */
 266       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 267            !gsi_end_p (si) || analyze_pattern_stmt;)
 268         {
 269           tree vf_vectype;
 270
 271           if (analyze_pattern_stmt)
 272             stmt = pattern_stmt;
 273           else
 274             stmt = gsi_stmt (si);
 275
 276           stmt_info = vinfo_for_stmt (stmt);
 277
 278           if (dump_enabled_p ())
 279             {
 280               dump_printf_loc (MSG_NOTE, vect_location,
 281                                "==> examining statement: ");
 282               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 283               dump_printf (MSG_NOTE, "\n");
 284             }
 285
 286           gcc_assert (stmt_info);
 287
 288           /* Skip stmts which do not need to be vectorized.  */
 289           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 290                && !STMT_VINFO_LIVE_P (stmt_info))
 291               || gimple_clobber_p (stmt))
 292             {
                   /* An irrelevant stmt may still have a relevant or live
                      pattern stmt; if so, examine that one in its place.  */
 293               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 294                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 295                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 296                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 297                 {
 298                   stmt = pattern_stmt;
 299                   stmt_info = vinfo_for_stmt (pattern_stmt);
 300                   if (dump_enabled_p ())
 301                     {
 302                       dump_printf_loc (MSG_NOTE, vect_location,
 303                                        "==> examining pattern statement: ");
 304                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 305                       dump_printf (MSG_NOTE, "\n");
 306                     }
 307                 }
 308               else
 309                 {
 310                   if (dump_enabled_p ())
 311                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 312                   gsi_next (&si);
 313                   continue;
 314                 }
 315             }
               /* STMT itself needs examining and also has a relevant or live
                  pattern stmt: examine STMT now and the pattern stmt on the
                  next iteration (SI is not advanced in between).  */
 316           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 317                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 318                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 319                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 320             analyze_pattern_stmt = true;
 321
 322           /* If a pattern statement has def stmts, analyze them too.  */
 323           if (is_pattern_stmt_p (stmt_info))
 324             {
                   /* First visit of this pattern stmt: start at the head of its
                      def sequence.  Later visits: step past the def stmt that
                      was examined last time.  */
 325               if (pattern_def_seq == NULL)
 326                 {
 327                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 328                   pattern_def_si = gsi_start (pattern_def_seq);
 329                 }
 330               else if (!gsi_end_p (pattern_def_si))
 331                 gsi_next (&pattern_def_si);
 332               if (pattern_def_seq != NULL)
 333                 {
 334                   gimple *pattern_def_stmt = NULL;
 335                   stmt_vec_info pattern_def_stmt_info = NULL;
 336
                       /* Find the next def stmt that is relevant or live.  */
 337                   while (!gsi_end_p (pattern_def_si))
 338                     {
 339                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 340                       pattern_def_stmt_info
 341                         = vinfo_for_stmt (pattern_def_stmt);
 342                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 343                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 344                         break;
 345                       gsi_next (&pattern_def_si);
 346                     }
 347
 348                   if (!gsi_end_p (pattern_def_si))
 349                     {
 350                       if (dump_enabled_p ())
 351                         {
 352                           dump_printf_loc (MSG_NOTE, vect_location,
 353                                            "==> examining pattern def stmt: ");
 354                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 355                                             pattern_def_stmt, 0);
 356                           dump_printf (MSG_NOTE, "\n");
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
                           /* Def sequence exhausted: STMT is still the pattern
                              stmt, which is examined in this iteration.  */
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
                       /* Advance SI only when nothing further is pending for
                          this position: no pattern stmt is queued and the
                          def-sequence iterator is at its end.  The same test
                          guards every other place where SI is advanced after
                          a statement has been examined.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 398                 }
 399               return false;
 400             }
 401
               /* A statement whose type already has a vector mode is not
                  handled.  */
 402           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 403             {
 404               if (dump_enabled_p ())
 405                 {
 406                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 407                                    "not vectorized: vector stmt in loop:");
 408                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 409                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 410                 }
 411               return false;
 412             }
 413
               /* Set below when STMT has a scalar boolean result.  */
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
                   /* MASK_STORE has no lhs, so take the scalar type from call
                      argument 3 (presumably the stored value -- confirm
                      against the IFN_MASK_STORE definition).  */
 430               if (is_gimple_call (stmt)
 431                   && gimple_call_internal_p (stmt)
 432                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 433                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 434               else
 435                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 436
 437               /* Bool ops don't participate in vectorization factor
 438                  computation.  For comparison use compared types to
 439                  compute a factor.  */
 440               if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
 441                 {
 442                   mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (gimple_code (stmt) == GIMPLE_ASSIGN
 446                       && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 447                          == tcc_comparison
 448                       && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
 449                          != BOOLEAN_TYPE)
 450                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 451                   else
 452                     {
 453                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 454                         {
 455                           pattern_def_seq = NULL;
 456                           gsi_next (&si);
 457                         }
 458                       continue;
 459                     }
 460                 }
 461
 462               if (dump_enabled_p ())
 463                 {
 464                   dump_printf_loc (MSG_NOTE, vect_location,
 465                                    "get vectype for scalar type:  ");
 466                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 467                   dump_printf (MSG_NOTE, "\n");
 468                 }
 469               vectype = get_vectype_for_scalar_type (scalar_type);
 470               if (!vectype)
 471                 {
 472                   if (dump_enabled_p ())
 473                     {
 474                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                                        "not vectorized: unsupported "
 476                                        "data-type ");
 477                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 478                                          scalar_type);
 479                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 480                     }
 481                   return false;
 482                 }
 483
                   /* A statement with a boolean result gets its vectype in the
                      final loop of this function instead.  */
 484               if (!bool_result)
 485                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 486
 487               if (dump_enabled_p ())
 488                 {
 489                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 490                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 491                   dump_printf (MSG_NOTE, "\n");
 492                 }
 493             }
 494
 495           /* Don't try to compute VF from scalar types if the stmt
 496              produces a boolean vector.  Use result vectype instead.  */
 497           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 498             vf_vectype = vectype;
 499           else
 500             {
 501               /* The vectorization factor is according to the smallest
 502                  scalar type (or the largest vector size, but we only
 503                  support one vector size per loop).  */
 504               if (!bool_result)
 505                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 506                                                              &dummy);
 507               if (dump_enabled_p ())
 508                 {
 509                   dump_printf_loc (MSG_NOTE, vect_location,
 510                                    "get vectype for scalar type:  ");
 511                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 512                   dump_printf (MSG_NOTE, "\n");
 513                 }
 514               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 515             }
 516           if (!vf_vectype)
 517             {
 518               if (dump_enabled_p ())
 519                 {
 520                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 521                                    "not vectorized: unsupported data-type ");
 522                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 523                                      scalar_type);
 524                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 525                 }
 526               return false;
 527             }
 528
               /* The statement's vector type and the vector type used for the
                  VF computation must have the same mode size.  */
 529           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 530                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 531             {
 532               if (dump_enabled_p ())
 533                 {
 534                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 535                                    "not vectorized: different sized vector "
 536                                    "types in statement, ");
 537                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 538                                      vectype);
 539                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 540                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 541                                      vf_vectype);
 542                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 543                 }
 544               return false;
 545             }
 546
 547           if (dump_enabled_p ())
 548             {
 549               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 550               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 551               dump_printf (MSG_NOTE, "\n");
 552             }
 553
 554           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 555           if (dump_enabled_p ())
 556             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
               /* VF is the maximum NUNITS seen so far.  */
 557           if (!vectorization_factor
 558               || (nunits > vectorization_factor))
 559             vectorization_factor = nunits;
 560
 561           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 562             {
 563               pattern_def_seq = NULL;
 564               gsi_next (&si);
 565             }
 566         }
 567     }
 568
 569   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 570   if (dump_enabled_p ())
 571     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 572                      vectorization_factor);
       /* A factor of 0 (no PHI or statement contributed a vector type) or of 1
          is rejected.  */
 573   if (vectorization_factor <= 1)
 574     {
 575       if (dump_enabled_p ())
 576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 577                          "not vectorized: unsupported data-type\n");
 578       return false;
 579     }
 580   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 581
       /* Assign a vectype to every statement queued in MASK_PRODUCERS.  */
 582   for (i = 0; i < mask_producers.length (); i++)
 583     {
 584       tree mask_type = NULL;
 585
 586       stmt = STMT_VINFO_STMT (mask_producers[i]);
 587
           /* Comparison of non-boolean operands: the mask type follows from
              the type of the compared operands.  */
 588       if (gimple_code (stmt) == GIMPLE_ASSIGN
 589           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 590           && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
 591         {
 592           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 593           mask_type = get_mask_type_for_scalar_type (scalar_type);
 594
 595           if (!mask_type)
 596             {
 597               if (dump_enabled_p ())
 598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 599                                  "not vectorized: unsupported mask\n");
 600               return false;
 601             }
 602         }
 603       else
 604         {
 605           tree rhs;
 606           ssa_op_iter iter;
 607           gimple *def_stmt;
 608           enum vect_def_type dt;
 609
               /* Otherwise take the mask type from the vectypes of the SSA
                  operands.  All operands that have a vectype must agree in
                  number of units and in being a mask or a non-mask type.  */
 610           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 611             {
 612               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 613                                        &def_stmt, &dt, &vectype))
 614                 {
 615                   if (dump_enabled_p ())
 616                     {
 617                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                                        "not vectorized: can't compute mask type "
 619                                        "for statement, ");
 620                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 621                                         0);
 622                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 623                     }
 624                   return false;
 625                 }
 626
 627               /* No vectype probably means external definition.
 628                  Allow it in case there is another operand which
 629                  allows to determine mask type.  */
 630               if (!vectype)
 631                 continue;
 632
 633               if (!mask_type)
 634                 mask_type = vectype;
 635               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 636                        != TYPE_VECTOR_SUBPARTS (vectype))
 637                 {
 638                   if (dump_enabled_p ())
 639                     {
 640                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 641                                        "not vectorized: different sized masks "
 642                                        "types in statement, ");
 643                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 644                                          mask_type);
 645                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          vectype);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 649                     }
 650                   return false;
 651                 }
 652               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 653                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 654                 {
 655                   if (dump_enabled_p ())
 656                     {
 657                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 658                                        "not vectorized: mixed mask and "
 659                                        "nonmask vector types in statement, ");
 660                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 661                                          mask_type);
 662                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          vectype);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 666                     }
 667                   return false;
 668                 }
 669             }
 670
 671           /* We may compare boolean value loaded as vector of integers.
 672              Fix mask_type in such case.  */
 673           if (mask_type
 674               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 675               && gimple_code (stmt) == GIMPLE_ASSIGN
 676               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 677             mask_type = build_same_sized_truth_vector_type (mask_type);
 678         }
 679
 680       /* No mask_type should mean loop invariant predicate.
 681          This is probably a subject for optimization in
 682          if-conversion.  */
 683       if (!mask_type)
 684         {
 685           if (dump_enabled_p ())
 686             {
 687               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 688                                "not vectorized: can't compute mask type "
 689                                "for statement, ");
 690               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 691                                 0);
 692               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 693             }
 694           return false;
 695         }
 696
 697       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 698     }
 699
 700   return true;
 701 }
 702
 703
 704 /* Function vect_is_simple_iv_evolution.
 705
 706    FORNOW: A simple evolution of an induction variable in the loop is
 707    considered a polynomial evolution.

        ACCESS_FN is the access function to examine and LOOP_NB the number of
        the loop in which its evolution is taken.  Return false when ACCESS_FN
        has no evolution part in that loop or when the evolution part is itself
        a chrec.  Otherwise store the (unshared) initial condition in *INIT and
        the evolution part in *STEP, and return true only if the step is an
        INTEGER_CST, a REAL_CST with flag_associative_math set, or an SSA_NAME
        whose defining statement is not in a block inside the loop and whose
        type is integral (or scalar floating point with flag_associative_math
        set).  Note that *INIT and *STEP are written even when the step is
        rejected as "unknown".  */
 708
 709 static bool
 710 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 711                              tree * step)
 712 {
 713   tree init_expr;
 714   tree step_expr;
 715   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 716   basic_block bb;
 717
 718   /* When there is no evolution in this loop, the evolution function
 719      is not "simple".  */
 720   if (evolution_part == NULL_TREE)
 721     return false;
 722
 723   /* When the evolution is a polynomial of degree >= 2
 724      the evolution function is not "simple".  */
 725   if (tree_is_chrec (evolution_part))
 726     return false;
 727
 728   step_expr = evolution_part;
 729   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 730
 731   if (dump_enabled_p ())
 732     {
 733       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 734       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 735       dump_printf (MSG_NOTE, ",  init: ");
 736       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 737       dump_printf (MSG_NOTE, "\n");
 738     }
 739
       /* The outputs are stored before the step is validated below.  */
 740   *init = init_expr;
 741   *step = step_expr;
 742
       /* Reject the step unless it is an INTEGER_CST, an SSA_NAME of a
          suitable type that is not defined inside the loop, or a REAL_CST
          with flag_associative_math set.  BB is only a temporary for the
          block of the SSA_NAME's defining statement.  */
 743   if (TREE_CODE (step_expr) != INTEGER_CST
 744       && (TREE_CODE (step_expr) != SSA_NAME
 745           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 746               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 747           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 748               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 749                   || !flag_associative_math)))
 750       && (TREE_CODE (step_expr) != REAL_CST
 751           || !flag_associative_math))
 752     {
 753       if (dump_enabled_p ())
 754         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 755                          "step unknown.\n");
 756       return false;
 757     }
 758
 759   return true;
 760 }
 761
 762 /* Function vect_analyze_scalar_cycles_1.
 763
 764    Examine the cross iteration def-use cycles of scalar variables
 765    in LOOP.  LOOP_VINFO represents the loop that is now being
 766    considered for vectorization (can be LOOP, or an outer-loop
 767    enclosing LOOP).

        The non-virtual PHIs in the header of LOOP are classified in two
        passes.  The first pass marks as vect_induction_def every PHI whose
        access function passes vect_is_simple_iv_evolution (and, when LOOP is
        not the loop of LOOP_VINFO, whose step is an INTEGER_CST); all other
        PHIs are queued.  The second pass hands each queued PHI to
        vect_force_simple_reduction and marks it, together with the returned
        reduction statement, as vect_double_reduction_def, vect_nested_cycle
        or vect_reduction_def.  Plain reductions are also recorded in
        LOOP_VINFO_REDUCTIONS.  PHIs for which no reduction statement is
        found keep vect_unknown_def_type.  */
 768
 769 static void
 770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 771 {
 772   basic_block bb = loop->header;
 773   tree init, step;
 774   auto_vec<gimple *, 64> worklist;
 775   gphi_iterator gsi;
 776   bool double_reduc;
 777
 778   if (dump_enabled_p ())
 779     dump_printf_loc (MSG_NOTE, vect_location,
 780                      "=== vect_analyze_scalar_cycles ===\n");
 781
 782   /* First - identify all inductions.  Reduction detection assumes that all the
 783      inductions have been identified, therefore, this order must not be
 784      changed.  */
 785   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 786     {
 787       gphi *phi = gsi.phi ();
 788       tree access_fn = NULL;
 789       tree def = PHI_RESULT (phi);
 790       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 791
 792       if (dump_enabled_p ())
 793         {
 794           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 795           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 796           dump_printf (MSG_NOTE, "\n");
 797         }
 798
 799       /* Skip virtual phi's.  The data dependences that are associated with
 800          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 801       if (virtual_operand_p (def))
 802         continue;
 803
 804       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 805
 806       /* Analyze the evolution function.  */
 807       access_fn = analyze_scalar_evolution (loop, def);
 808       if (access_fn)
 809         {
 810           STRIP_NOPS (access_fn);
 811           if (dump_enabled_p ())
 812             {
 813               dump_printf_loc (MSG_NOTE, vect_location,
 814                                "Access function of PHI: ");
 815               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 816               dump_printf (MSG_NOTE, "\n");
 817             }
 818           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 819             = evolution_part_in_loop_num (access_fn, loop->num);
 820         }
 821
           /* PHIs that are not simple inductions (or, when LOOP is not the loop
              of LOOP_VINFO, whose step is not an INTEGER_CST) are queued for
              the reduction analysis in the second pass.  */
 822       if (!access_fn
 823           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 824           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 825               && TREE_CODE (step) != INTEGER_CST))
 826         {
 827           worklist.safe_push (phi);
 828           continue;
 829         }
 830
 831       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 832
 833       if (dump_enabled_p ())
 834         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 835       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 836     }
 837
 838
 839   /* Second - identify all reductions and nested cycles.  */
 840   while (worklist.length () > 0)
 841     {
 842       gimple *phi = worklist.pop ();
 843       tree def = PHI_RESULT (phi);
 844       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 845       gimple *reduc_stmt;
 846       bool nested_cycle;
 847
 848       if (dump_enabled_p ())
 849         {
 850           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 851           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 852           dump_printf (MSG_NOTE, "\n");
 853         }
 854
 855       gcc_assert (!virtual_operand_p (def)
 856                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 857
           /* LOOP differs from the loop of LOOP_VINFO when it is analyzed on
              behalf of an enclosing loop; the cycle is then a nested cycle.  */
 858       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 859       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 860                                                 &double_reduc, false);
 861       if (reduc_stmt)
 862         {
 863           if (double_reduc)
 864             {
 865               if (dump_enabled_p ())
 866                 dump_printf_loc (MSG_NOTE, vect_location,
 867                                  "Detected double reduction.\n");
 868
 869               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 870               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 871                                                     vect_double_reduction_def;
 872             }
 873           else
 874             {
 875               if (nested_cycle)
 876                 {
 877                   if (dump_enabled_p ())
 878                     dump_printf_loc (MSG_NOTE, vect_location,
 879                                      "Detected vectorizable nested cycle.\n");
 880
 881                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 882                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 883                                                              vect_nested_cycle;
 884                 }
 885               else
 886                 {
 887                   if (dump_enabled_p ())
 888                     dump_printf_loc (MSG_NOTE, vect_location,
 889                                      "Detected reduction.\n");
 890
 891                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 892                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 893                                                            vect_reduction_def;
 894                   /* Store the reduction cycles for possible vectorization in
 895                      loop-aware SLP.  */
 896                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 897                 }
 898             }
 899         }
 900       else
 901         if (dump_enabled_p ())
 902           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 903                            "Unknown def-use cycle pattern.\n");
 904     }
 905 }
 906
 907
 908 /* Function vect_analyze_scalar_cycles.
 909
 910    Classify the cross-iteration def-use cycles of scalar variables by
 911    looking at the scalar PHIs in the loop header.  Every cycle ends up
 912    as one of: invariant, induction, reduction, unknown.  The loop
 913    described by LOOP_VINFO is analyzed first and then, if it has one,
 914    its inner-loop.
 915    Two typical scalar cycles:
 916
 917    Example1: reduction:
 918
 919               loop1:
 920               for (i=0; i<N; i++)
 921                  sum += a[i];
 922
 923    Example2: induction:
 924
 925               loop2:
 926               for (i=0; i<N; i++)
 927                  a[i] = i;  */
 928
 929 static void
 930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 931 {
 932   struct loop *outer = LOOP_VINFO_LOOP (loop_vinfo);
 933
 934   vect_analyze_scalar_cycles_1 (loop_vinfo, outer);
 935
 936   /* In outer-loop vectorization the inner-loop still runs sequentially,
 937      so its reductions are not like those of the vectorized nest:
 938      1. Once vectorized they execute in the original scalar order, which
 939         means the order of computation must not be changed when
 940         vectorizing them.
 941      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 942         current checks are too strict.  */
 943
 944   struct loop *nested = outer->inner;
 945   if (nested != NULL)
 946     vect_analyze_scalar_cycles_1 (loop_vinfo, nested);
 947 }
 948
 949 /* Transfer group and reduction information from STMT to its pattern stmt.

        STMT must already belong to a group (its GROUP_FIRST_ELEMENT is set)
        while its related statement must not; both are asserted.  The group
        size is copied to the related statement of STMT.  Then the
        GROUP_NEXT_ELEMENT links are followed starting at STMT: the related
        statement of each member gets GROUP_FIRST_ELEMENT set to the related
        statement of STMT and is linked to the related statement of the next
        member.  Finally the related statement of the last member visited is
        marked vect_reduction_def.  */
 950
 951 static void
 952 vect_fixup_reduc_chain (gimple *stmt)
 953 {
 954   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 955   gimple *stmtp;
 956   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 957               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 958   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 959   do
 960     {
           /* STMTP is the related statement of the current group member.  */
 961       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 962       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
           /* Advance to the next member and mirror the link between the
              related statements.  */
 963       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 964       if (stmt)
 965         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 966           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967     }
 968   while (stmt);
       /* STMTP still refers to the related statement of the last member.  */
 969   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 970 }
 971
 972 /* Redirect reduction chains whose head stmt is now part of a pattern.  */
 973
 974 static void
 975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 976 {
 977   gimple *head;
 978   unsigned ix;
 979   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), ix, head)
 980     {
 981       /* Chains whose head is not in a pattern are left alone.  */
 982       if (!STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (head)))
 983         continue;
 984       vect_fixup_reduc_chain (head);
 985       LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[ix]
 986         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (head));
 987     }
 988 }
 988
 989 /* Function vect_get_loop_niters.
 990
 991    Compute the iteration counts of LOOP.  The number of latch
 992    executions is stored in *NUMBER_OF_ITERATIONSM1 and the number of
 993    header executions in *NUMBER_OF_ITERATIONS.
 994
 995    The return value is the exit condition of LOOP.  */
 996
 997
 998 static gcond *
 999 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1000                       tree *number_of_iterationsm1)
1001 {
1002   if (dump_enabled_p ())
1003     dump_printf_loc (MSG_NOTE, vect_location, "=== get_loop_niters ===\n");
1004
1005   tree latch_count = number_of_latch_executions (loop);
1006   *number_of_iterationsm1 = latch_count;
1007
1008   /* The header runs once more than the latch; add one if that is known.
1009      ???  For UINT_MAX latch executions this number overflows to zero
1010      for loops like do { n++; } while (n != 0);  */
1011   tree header_count = latch_count;
1012   if (latch_count != NULL_TREE && !chrec_contains_undetermined (latch_count))
1013     {
1014       tree count_type = TREE_TYPE (latch_count);
1015       header_count = fold_build2 (PLUS_EXPR, count_type,
1016                                   unshare_expr (latch_count),
1017                                   build_int_cst (count_type, 1));
1018     }
1019   *number_of_iterations = header_count;
1020   return get_loop_exit_condition (loop);
1021 }
1022
1023
1024 /* Function bb_in_loop_p
1025
1026    Predicate for the dfs walk over the loop bbs: is BB inside loop DATA?  */
1027
1028 static bool
1029 bb_in_loop_p (const_basic_block bb, const void *data)
1030 {
1031   /* DATA is the loop that was handed to the traversal by the caller.  */
1032   const struct loop *const enclosing = (const struct loop *) data;
1033
1034   return flow_bb_inside_loop_p (enclosing, bb);
1035 }
1036
1037
1038 /* Function new_loop_vec_info.
1039
1040    Create and initialize a new loop_vec_info struct for LOOP, as well as
1041    stmt_vec_info structs for all the stmts in LOOP.

        The returned structure is allocated with xcalloc.  The uid of every
        PHI and statement in LOOP is reset to 0 before a fresh stmt_vec_info
        is attached to it.  LOOP_VINFO_BBS of the result holds the blocks of
        LOOP as enumerated by dfs_enumerate_from starting at the loop header;
        the remaining fields are set to their initial values below.  */
1042
1043 static loop_vec_info
1044 new_loop_vec_info (struct loop *loop)
1045 {
1046   loop_vec_info res;
1047   basic_block *bbs;
1048   gimple_stmt_iterator si;
1049   unsigned int i, nbbs;
1050
1051   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1052   res->kind = vec_info::loop;
1053   LOOP_VINFO_LOOP (res) = loop;
1054
1055   bbs = get_loop_body (loop);
1056
1057   /* Create/Update stmt_info for all stmts in the loop.  */
1058   for (i = 0; i < loop->num_nodes; i++)
1059     {
1060       basic_block bb = bbs[i];
1061
1062       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1063         {
1064           gimple *phi = gsi_stmt (si);
1065           gimple_set_uid (phi, 0);
1066           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1067         }
1068
1069       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1070         {
1071           gimple *stmt = gsi_stmt (si);
1072           gimple_set_uid (stmt, 0);
1073           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1074         }
1075     }
1076
1077   /* CHECKME: We want to visit all BBs before their successors (except for
1078      latch blocks, for which this assertion wouldn't hold).  In the simple
1079      case of the loop forms we allow, a dfs order of the BBs would be the same
1080      as reversed postorder traversal, so we are safe.  */
1081
        /* The array from get_loop_body is discarded and replaced by a dfs
           enumeration restricted to LOOP, which must find every block.  */
1082    free (bbs);
1083    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1084    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1085                               bbs, loop->num_nodes, loop);
1086    gcc_assert (nbbs == loop->num_nodes);
1087
1088   LOOP_VINFO_BBS (res) = bbs;
1089   LOOP_VINFO_NITERSM1 (res) = NULL;
1090   LOOP_VINFO_NITERS (res) = NULL;
1091   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1092   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1093   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1094   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1095   LOOP_VINFO_VECT_FACTOR (res) = 0;
1096   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1097   LOOP_VINFO_DATAREFS (res) = vNULL;
1098   LOOP_VINFO_DDRS (res) = vNULL;
1099   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1100   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1101   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1102   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1103   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1104   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1105   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1106   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1107   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1108   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1109   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1110   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1111
1112   return res;
1113 }
1114
1115
1116 /* Function destroy_loop_vec_info.
1117
1118    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1119    stmts in the loop.

        Nothing is done when LOOP_VINFO is NULL.  The per-statement work
        (freeing each stmt_vec_info and, if operands were swapped, moving a
        constant first operand of a PLUS_EXPR, POINTER_PLUS_EXPR or MULT_EXPR
        back to the second position) is only performed when CLEAN_STMTS is
        true.  The other data owned by LOOP_VINFO is always released, the
        structure itself is freed and the aux field of the loop is cleared.  */
1120
1121 void
1122 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1123 {
1124   struct loop *loop;
1125   basic_block *bbs;
1126   int nbbs;
1127   gimple_stmt_iterator si;
1128   int j;
1129   vec<slp_instance> slp_instances;
1130   slp_instance instance;
1131   bool swapped;
1132
1133   if (!loop_vinfo)
1134     return;
1135
1136   loop = LOOP_VINFO_LOOP (loop_vinfo);
1137
1138   bbs = LOOP_VINFO_BBS (loop_vinfo);
       /* With !CLEAN_STMTS the block walk below is skipped entirely.  */
1139   nbbs = clean_stmts ? loop->num_nodes : 0;
1140   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1141
1142   for (j = 0; j < nbbs; j++)
1143     {
1144       basic_block bb = bbs[j];
1145       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1146         free_stmt_vec_info (gsi_stmt (si));
1147
1148       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1149         {
1150           gimple *stmt = gsi_stmt (si);
1151
1152           /* We may have broken canonical form by moving a constant
1153              into RHS1 of a commutative op.  Fix such occurrences.  */
1154           if (swapped && is_gimple_assign (stmt))
1155             {
1156               enum tree_code code = gimple_assign_rhs_code (stmt);
1157
1158               if ((code == PLUS_EXPR
1159                    || code == POINTER_PLUS_EXPR
1160                    || code == MULT_EXPR)
1161                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1162                 swap_ssa_operands (stmt,
1163                                    gimple_assign_rhs1_ptr (stmt),
1164                                    gimple_assign_rhs2_ptr (stmt));
1165             }
1166
1167           /* Free stmt_vec_info.  */
1168           free_stmt_vec_info (stmt);
1169           gsi_next (&si);
1170         }
1171     }
1172
       /* Release everything else that hangs off LOOP_VINFO.  */
1173   free (LOOP_VINFO_BBS (loop_vinfo));
1174   vect_destroy_datarefs (loop_vinfo);
1175   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1176   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1177   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1178   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1179   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1180   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1181     vect_free_slp_instance (instance);
1182
1183   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1184   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1185   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1186   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1187
1188   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1189   loop_vinfo->scalar_cost_vec.release ();
1190
1191   free (loop_vinfo);
1192   loop->aux = NULL;
1193 }
1194
1195
1196 /* Calculate the cost of one scalar iteration of the loop.
1197
1198    Every costed statement is recorded in
1199    LOOP_VINFO_SCALAR_ITERATION_COST and the accumulated cost is stored in
1200    LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST of LOOP_VINFO.  */
1201 static void
1202 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1203 {
1204   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1205   basic_block *blocks = LOOP_VINFO_BBS (loop_vinfo);
1206   int n_blocks = loop->num_nodes;
1207   int total_cost = 0;
1208
1209   /* For now the scalar cost of one iteration is obtained by counting the
1210      statements of the scalar loop.
1211
1212      TODO: Add outer loop support.
1213
1214      TODO: Consider assigning different costs to different scalar
1215      statements.  */
1216
1217   /* FORNOW.  Statements of an inner loop are weighted by a fixed guess
1218      of how often that loop iterates.  */
1219   int inner_weight = loop->inner ? 50 /* FIXME */ : 1;
1220
1221   for (int bb_ix = 0; bb_ix < n_blocks; bb_ix++)
1222     {
1223       basic_block bb = blocks[bb_ix];
1224       /* Only blocks that belong to the inner loop get its weight.  */
1225       int weight = (bb->loop_father == loop->inner) ? inner_weight : 1;
1226
1227       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1228            gsi_next (&gsi))
1229         {
1230           gimple *stmt = gsi_stmt (gsi);
1231           stmt_vec_info info = vinfo_for_stmt (stmt);
1232
1233           /* Only assignments and calls contribute to the cost.  */
1234           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1235             continue;
1236
1237           /* Skip stmts that are not vectorized inside the loop.  */
1238           if (info
1239               && !STMT_VINFO_RELEVANT_P (info)
1240               && !(STMT_VINFO_LIVE_P (info)
1241                    && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (info)))
1242               && !STMT_VINFO_IN_PATTERN_P (info))
1243             continue;
1244
1245           /* A statement with a data reference is costed as a load or a
1246              store, anything else as a plain scalar statement.  */
1247           vect_cost_for_stmt kind = scalar_stmt;
1248           if (STMT_VINFO_DATA_REF (info))
1249             kind = (DR_IS_READ (STMT_VINFO_DATA_REF (info))
1250                     ? scalar_load
1251                     : scalar_store);
1252
1253           total_cost
1254             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1255                                  weight, kind, NULL, 0, vect_prologue);
1256         }
1257     }
1258
1259   /* Publish the accumulated cost.  */
1260   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1261     = total_cost;
1262 }
1263
1264
1265 /* Function vect_analyze_loop_form_1.
1266
1267    Verify that certain CFG restrictions hold, including:
1268    - the loop has a pre-header
1269    - the loop has a single entry and exit
1270    - the loop exit condition is simple enough, and the number of iterations
1271      can be analyzed (a countable loop).

        On success return true with *LOOP_COND set to the exit condition of
        LOOP and *NUMBER_OF_ITERATIONS / *NUMBER_OF_ITERATIONSM1 set as
        computed by vect_get_loop_niters.  When LOOP has an inner loop, the
        function first calls itself on that inner loop, storing the inner exit
        condition through INNER_LOOP_COND; that recursive call passes NULL for
        its own INNER_LOOP_COND argument.  On failure return false, after
        reporting the reason to the dump file if dumping is enabled.

        Side effect: if the destination of the single exit edge has more than
        one predecessor and the edge is not abnormal, the exit edge is
        split.  */
1272
1273 bool
1274 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1275                           tree *number_of_iterationsm1,
1276                           tree *number_of_iterations, gcond **inner_loop_cond)
1277 {
1278   if (dump_enabled_p ())
1279     dump_printf_loc (MSG_NOTE, vect_location,
1280                      "=== vect_analyze_loop_form ===\n");
1281
1282   /* Different restrictions apply when we are considering an inner-most loop,
1283      vs. an outer (nested) loop.
1284      (FORNOW. May want to relax some of these restrictions in the future).  */
1285
1286   if (!loop->inner)
1287     {
1288       /* Inner-most loop.  We currently require that the number of BBs is
1289          exactly 2 (the header and latch).  Vectorizable inner-most loops
1290          look like this:
1291
1292                         (pre-header)
1293                            |
1294                           header <--------+
1295                            | |            |
1296                            | +--> latch --+
1297                            |
1298                         (exit-bb)  */
1299
1300       if (loop->num_nodes != 2)
1301         {
1302           if (dump_enabled_p ())
1303             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1304                              "not vectorized: control flow in loop.\n");
1305           return false;
1306         }
1307
1308       if (empty_block_p (loop->header))
1309         {
1310           if (dump_enabled_p ())
1311             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1312                              "not vectorized: empty loop.\n");
1313           return false;
1314         }
1315     }
1316   else
1317     {
1318       struct loop *innerloop = loop->inner;
1319       edge entryedge;
1320
1321       /* Nested loop. We currently require that the loop is doubly-nested,
1322          contains a single inner loop, and the number of BBs is exactly 5.
1323          Vectorizable outer-loops look like this:
1324
1325                         (pre-header)
1326                            |
1327                           header <---+
1328                            |         |
1329                           inner-loop |
1330                            |         |
1331                           tail ------+
1332                            |
1333                         (exit-bb)
1334
1335          The inner-loop has the properties expected of inner-most loops
1336          as described above.  */
1337
1338       if ((loop->inner)->inner || (loop->inner)->next)
1339         {
1340           if (dump_enabled_p ())
1341             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1342                              "not vectorized: multiple nested loops.\n");
1343           return false;
1344         }
1345
1346       if (loop->num_nodes != 5)
1347         {
1348           if (dump_enabled_p ())
1349             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350                              "not vectorized: control flow in loop.\n");
1351           return false;
1352         }
1353
           /* The inner loop must be entered from the outer header and must
              leave, through a single exit, into the source block of the first
              predecessor edge of the outer latch.  */
1354       entryedge = loop_preheader_edge (innerloop);
1355       if (entryedge->src != loop->header
1356           || !single_exit (innerloop)
1357           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1358         {
1359           if (dump_enabled_p ())
1360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1361                              "not vectorized: unsupported outerloop form.\n");
1362           return false;
1363         }
1364
1365       /* Analyze the inner-loop.  */
1366       tree inner_niterm1, inner_niter;
1367       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1368                                       &inner_niterm1, &inner_niter, NULL))
1369         {
1370           if (dump_enabled_p ())
1371             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372                              "not vectorized: Bad inner loop.\n");
1373           return false;
1374         }
1375
1376       if (!expr_invariant_in_loop_p (loop, inner_niter))
1377         {
1378           if (dump_enabled_p ())
1379             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1380                              "not vectorized: inner-loop count not"
1381                              " invariant.\n");
1382           return false;
1383         }
1384
1385       if (dump_enabled_p ())
1386         dump_printf_loc (MSG_NOTE, vect_location,
1387                          "Considering outer-loop vectorization.\n");
1388     }
1389
1390   if (!single_exit (loop)
1391       || EDGE_COUNT (loop->header->preds) != 2)
1392     {
1393       if (dump_enabled_p ())
1394         {
1395           if (!single_exit (loop))
1396             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397                              "not vectorized: multiple exits.\n");
1398           else if (EDGE_COUNT (loop->header->preds) != 2)
1399             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400                              "not vectorized: too many incoming edges.\n");
1401         }
1402       return false;
1403     }
1404
1405   /* We assume that the loop exit condition is at the end of the loop. i.e,
1406      that the loop is represented as a do-while (with a proper if-guard
1407      before the loop if needed), where the loop header contains all the
1408      executable statements, and the latch is empty.  */
1409   if (!empty_block_p (loop->latch)
1410       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1411     {
1412       if (dump_enabled_p ())
1413         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1414                          "not vectorized: latch block not empty.\n");
1415       return false;
1416     }
1417
1418   /* Make sure there exists a single-predecessor exit bb:  */
1419   if (!single_pred_p (single_exit (loop)->dest))
1420     {
1421       edge e = single_exit (loop);
           /* Only a non-abnormal exit edge is split; an abnormal one makes
              the loop unsupported.  */
1422       if (!(e->flags & EDGE_ABNORMAL))
1423         {
1424           split_loop_exit_edge (e);
1425           if (dump_enabled_p ())
1426             dump_printf (MSG_NOTE, "split exit edge.\n");
1427         }
1428       else
1429         {
1430           if (dump_enabled_p ())
1431             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1432                              "not vectorized: abnormal loop exit edge.\n");
1433           return false;
1434         }
1435     }
1436
       /* Fetch the exit condition and both iteration counts; each of the
          checks below rejects one way in which they can be unusable.  */
1437   *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1438                                      number_of_iterationsm1);
1439   if (!*loop_cond)
1440     {
1441       if (dump_enabled_p ())
1442         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443                          "not vectorized: complicated exit condition.\n");
1444       return false;
1445     }
1446
1447   if (!*number_of_iterations
1448       || chrec_contains_undetermined (*number_of_iterations))
1449     {
1450       if (dump_enabled_p ())
1451         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452                          "not vectorized: number of iterations cannot be "
1453                          "computed.\n");
1454       return false;
1455     }
1456
1457   if (integer_zerop (*number_of_iterations))
1458     {
1459       if (dump_enabled_p ())
1460         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461                          "not vectorized: number of iterations = 0.\n");
1462       return false;
1463     }
1464
1465   return true;
1466 }
1467
1468 /* Return a new loop_vec_info for LOOP if its form is suitable, else NULL.  */
1469
1470 loop_vec_info
1471 vect_analyze_loop_form (struct loop *loop)
1472 {
1473   gcond *exit_cond, *inner_exit_cond = NULL;
1474   tree niters, nitersm1;
1475
1476   bool form_ok = vect_analyze_loop_form_1 (loop, &exit_cond, &nitersm1,
1477                                            &niters, &inner_exit_cond);
1478   if (!form_ok)
1479     return NULL;
1480
1481   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1482   LOOP_VINFO_NITERSM1 (loop_vinfo) = nitersm1;
1483   LOOP_VINFO_NITERS (loop_vinfo) = niters;
1484   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
1485
1486   /* Mention in the dump when the iteration count is only symbolic.  */
1487   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1488     {
1489       dump_printf_loc (MSG_NOTE, vect_location,
1490                        "Symbolic number of iterations is ");
1491       dump_generic_expr (MSG_NOTE, TDF_DETAILS, niters);
1492       dump_printf (MSG_NOTE, "\n");
1493     }
1494
1495   /* Tag the exit test of the loop and of its inner loop, if any.  */
1496   STMT_VINFO_TYPE (vinfo_for_stmt (exit_cond)) = loop_exit_ctrl_vec_info_type;
1497   if (inner_exit_cond != NULL)
1498     STMT_VINFO_TYPE (vinfo_for_stmt (inner_exit_cond))
1499       = loop_exit_ctrl_vec_info_type;
1500
1501   /* LOOP must not have a loop_vec_info attached yet.  */
1502   gcc_assert (!loop->aux);
1503   loop->aux = loop_vinfo;
1504   return loop_vinfo;
1505 }
1505
1506
1507
1508 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1509    statements update the vectorization factor.

        The vectorization factor of LOOP_VINFO must be nonzero on entry.  If
        no statement in the loop is both (relevant or a vectorizable cycle
        def) and not a pure SLP statement, the factor is replaced by the SLP
        unrolling factor of LOOP_VINFO.  Otherwise it becomes the least common
        multiple of the current factor and the SLP unrolling factor.  For a
        statement that is part of a pattern and has a related statement, the
        related statement is examined instead.  */
1510
1511 static void
1512 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1513 {
1514   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1515   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1516   int nbbs = loop->num_nodes;
1517   unsigned int vectorization_factor;
1518   int i;
1519
1520   if (dump_enabled_p ())
1521     dump_printf_loc (MSG_NOTE, vect_location,
1522                      "=== vect_update_vf_for_slp ===\n");
1523
1524   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1525   gcc_assert (vectorization_factor != 0);
1526
1527   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1528      vectorization factor of the loop is the unrolling factor required by
1529      the SLP instances.  If that unrolling factor is 1, we say, that we
1530      perform pure SLP on loop - cross iteration parallelism is not
1531      exploited.  */
1532   bool only_slp_in_loop = true;
1533   for (i = 0; i < nbbs; i++)
1534     {
1535       basic_block bb = bbs[i];
1536       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1537            gsi_next (&si))
1538         {
1539           gimple *stmt = gsi_stmt (si);
1540           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
               /* Look at the pattern statement when STMT was replaced by one.  */
1541           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1542               && STMT_VINFO_RELATED_STMT (stmt_info))
1543             {
1544               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1545               stmt_info = vinfo_for_stmt (stmt);
1546             }
1547           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1548                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1549               && !PURE_SLP_STMT (stmt_info))
1550             /* STMT needs both SLP and loop-based vectorization.  */
1551             only_slp_in_loop = false;
1552         }
1553     }
1554
1555   if (only_slp_in_loop)
1556     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1557   else
1558     vectorization_factor
1559       = least_common_multiple (vectorization_factor,
1560                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1561
1562   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
       /* VECTORIZATION_FACTOR is an unsigned int, so it is printed with %u.  */
1563   if (dump_enabled_p ())
1564     dump_printf_loc (MSG_NOTE, vect_location,
1565                      "Updating vectorization factor to %u\n",
1566                      vectorization_factor);
1567 }
1568
1569 /* Function vect_analyze_loop_operations.
1570
1571    Scan the loop stmts and make sure they are all vectorizable.

        For every basic block of the loop described by LOOP_VINFO the phi
        nodes are examined first, then every non-clobber statement is handed
        to vect_analyze_stmt.

        Return false as soon as an unsupported phi or statement is found, or
        when nothing in the loop was flagged as needing vectorization;
        return true otherwise.  */
1572
1573 static bool
1574 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1575 {
1576   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1577   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1578   int nbbs = loop->num_nodes;
1579   int i;
1580   stmt_vec_info stmt_info;
1581   bool need_to_vectorize = false;
1582   bool ok;
1583
1584   if (dump_enabled_p ())
1585     dump_printf_loc (MSG_NOTE, vect_location,
1586                      "=== vect_analyze_loop_operations ===\n");
1587
1588   for (i = 0; i < nbbs; i++)
1589     {
1590       basic_block bb = bbs[i];
1591
1592       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1593            gsi_next (&si))
1594         {
1595           gphi *phi = si.phi ();
1596           ok = true;
1597
1598           stmt_info = vinfo_for_stmt (phi);
1599           if (dump_enabled_p ())
1600             {
1601               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1602               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1603               dump_printf (MSG_NOTE, "\n");
1604             }
               /* Phis whose result is a virtual operand are not checked
                  here.  */
1605           if (virtual_operand_p (gimple_phi_result (phi)))
1606             continue;
1607
1608           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1609              (i.e., a phi in the tail of the outer-loop).  */
1610           if (! is_loop_header_bb_p (bb))
1611             {
1612               /* FORNOW: we currently don't support the case that these phis
1613                  are not used in the outerloop (unless it is double reduction,
1614                  i.e., this phi is vect_reduction_def), cause this case
1615                  requires to actually do something here.  */
1616               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1617                    || STMT_VINFO_LIVE_P (stmt_info))
1618                   && STMT_VINFO_DEF_TYPE (stmt_info)
1619                      != vect_double_reduction_def)
1620                 {
1621                   if (dump_enabled_p ())
1622                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623                                      "Unsupported loop-closed phi in "
1624                                      "outer-loop.\n");
1625                   return false;
1626                 }
1627
1628               /* If PHI is used in the outer loop, we check that its operand
1629                  is defined in the inner loop.  */
1630               if (STMT_VINFO_RELEVANT_P (stmt_info))
1631                 {
1632                   tree phi_op;
1633                   gimple *op_def_stmt;
1634
                       /* Only phis with exactly one argument, which must be
                          an SSA name, are accepted.  */
1635                   if (gimple_phi_num_args (phi) != 1)
1636                     return false;
1637
1638                   phi_op = PHI_ARG_DEF (phi, 0);
1639                   if (TREE_CODE (phi_op) != SSA_NAME)
1640                     return false;
1641
                       /* The argument must be defined by a real statement
                          inside LOOP that has a stmt_vec_info.  */
1642                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1643                   if (gimple_nop_p (op_def_stmt)
1644                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1645                       || !vinfo_for_stmt (op_def_stmt))
1646                     return false;
1647
                       /* Reject unless the defining statement's relevance is
                          vect_used_in_outer or
                          vect_used_in_outer_by_reduction.  */
1648                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1649                         != vect_used_in_outer
1650                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1651                            != vect_used_in_outer_by_reduction)
1652                     return false;
1653                 }
1654
1655               continue;
1656             }
1657
               /* From here on PHI is in a loop-header block.  */
1658           gcc_assert (stmt_info);
1659
1660           if (STMT_VINFO_LIVE_P (stmt_info))
1661             {
1662               /* FORNOW: not yet supported.  */
1663               if (dump_enabled_p ())
1664                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665                                  "not vectorized: value used after loop.\n");
1666               return false;
1667             }
1668
1669           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1670               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1671             {
1672               /* A scalar-dependence cycle that we don't support.  */
1673               if (dump_enabled_p ())
1674                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675                                  "not vectorized: scalar dependence cycle.\n");
1676               return false;
1677             }
1678
               /* A relevant phi means there is work for the vectorizer.  Only
                  induction phis are checked further here; OK stays true for
                  any other relevant phi.  NOTE(review): the NULL arguments
                  presumably request analysis only -- confirm against
                  vectorizable_induction.  */
1679           if (STMT_VINFO_RELEVANT_P (stmt_info))
1680             {
1681               need_to_vectorize = true;
1682               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1683                 ok = vectorizable_induction (phi, NULL, NULL);
1684             }
1685
1686           if (!ok)
1687             {
1688               if (dump_enabled_p ())
1689                 {
1690                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691                                    "not vectorized: relevant phi not "
1692                                    "supported: ");
1693                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1694                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1695                 }
1696               return false;
1697             }
1698         }
1699
           /* Clobber statements are skipped.  Every other statement must pass
              vect_analyze_stmt, which is also given the address of
              NEED_TO_VECTORIZE.  */
1700       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1701            gsi_next (&si))
1702         {
1703           gimple *stmt = gsi_stmt (si);
1704           if (!gimple_clobber_p (stmt)
1705               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1706             return false;
1707         }
1708     } /* bbs */
1709
1710   /* All operations in the loop are either irrelevant (deal with loop
1711      control, or dead), or only used outside the loop and can be moved
1712      out of the loop (e.g. invariants, inductions).  The loop can be
1713      optimized away by scalar optimizations.  We're better off not
1714      touching this loop.  */
1715   if (!need_to_vectorize)
1716     {
1717       if (dump_enabled_p ())
1718         dump_printf_loc (MSG_NOTE, vect_location,
1719                          "All the computation can be taken out of the loop.\n");
1720       if (dump_enabled_p ())
1721         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722                          "not vectorized: redundant loop. no profit to "
1723                          "vectorize.\n");
1724       return false;
1725     }
1726
1727   return true;
1728 }
1729
1730
1731 /* Function vect_analyze_loop_2.
1732
1733    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1734    different analyses will record information in the loop_vec_info
1735    struct.

        FATAL is set to true while the checks that are independent of the
        vector size are running, and to false once all of them have passed.

        Return true if every analysis succeeded, false otherwise.  */
1736 static bool
1737 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1738 {
1739   bool ok;
1740   int max_vf = MAX_VECTORIZATION_FACTOR;
1741   int min_vf = 2;
1742   unsigned int n_stmts = 0;
1743
1744   /* The first group of checks is independent of the vector size.  */
1745   fatal = true;
1746
1747   /* Find all data references in the loop (which correspond to vdefs/vuses)
1748      and analyze their evolution in the loop.  */
1749
1750   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1751
1752   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1753   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1754     {
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1757                          "not vectorized: loop contains function calls"
1758                          " or data references that cannot be analyzed\n");
1759       return false;
1760     }
1761
       /* Collect the data references of every non-debug statement, counting
          those statements in N_STMTS as we go.  */
1762   for (unsigned i = 0; i < loop->num_nodes; i++)
1763     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1764          !gsi_end_p (gsi); gsi_next (&gsi))
1765       {
1766         gimple *stmt = gsi_stmt (gsi);
1767         if (is_gimple_debug (stmt))
1768           continue;
1769         ++n_stmts;
1770         if (!find_data_references_in_stmt (loop, stmt,
1771                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1772           {
                 /* A call for which data-reference discovery failed is still
                    tolerated when the loop has a safelen and the callee has
                    simd clones, provided neither an argument nor the lhs is
                    a decl or a reference with a base address.  */
1773             if (is_gimple_call (stmt) && loop->safelen)
1774               {
1775                 tree fndecl = gimple_call_fndecl (stmt), op;
1776                 if (fndecl != NULL_TREE)
1777                   {
1778                     cgraph_node *node = cgraph_node::get (fndecl);
1779                     if (node != NULL && node->simd_clones != NULL)
1780                       {
1781                         unsigned int j, n = gimple_call_num_args (stmt);
                             /* Stop at the first offending argument; J == N
                                afterwards means none was found.  */
1782                         for (j = 0; j < n; j++)
1783                           {
1784                             op = gimple_call_arg (stmt, j);
1785                             if (DECL_P (op)
1786                                 || (REFERENCE_CLASS_P (op)
1787                                     && get_base_address (op)))
1788                               break;
1789                           }
1790                         op = gimple_call_lhs (stmt);
1791                         /* Ignore #pragma omp declare simd functions
1792                            if they don't have data references in the
1793                            call stmt itself.  */
1794                         if (j == n
1795                             && !(op
1796                                  && (DECL_P (op)
1797                                      || (REFERENCE_CLASS_P (op)
1798                                          && get_base_address (op)))))
1799                           continue;
1800                       }
1801                   }
1802               }
1803             if (dump_enabled_p ())
1804               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805                                "not vectorized: loop contains function "
1806                                "calls or data references that cannot "
1807                                "be analyzed\n");
1808             return false;
1809           }
1810       }
1811
1812   /* Analyze the data references and also adjust the minimal
1813      vectorization factor according to the loads and stores.  */
1814
1815   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1816   if (!ok)
1817     {
1818       if (dump_enabled_p ())
1819         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1820                          "bad data references.\n");
1821       return false;
1822     }
1823
1824   /* Classify all cross-iteration scalar data-flow cycles.
1825      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1826   vect_analyze_scalar_cycles (loop_vinfo);
1827
1828   vect_pattern_recog (loop_vinfo);
1829
1830   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1831
1832   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1833      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1834
1835   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1836   if (!ok)
1837     {
1838       if (dump_enabled_p ())
1839         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840                          "bad data access.\n");
1841       return false;
1842     }
1843
1844   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1845
1846   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1847   if (!ok)
1848     {
1849       if (dump_enabled_p ())
1850         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851                          "unexpected pattern.\n");
1852       return false;
1853     }
1854
1855   /* The rest of the analysis below depends on the vector size in some
          way, so failures from here on are not flagged as fatal.  */
1856   fatal = false;
1857
1858   /* Analyze data dependences between the data-refs in the loop
1859      and adjust the maximum vectorization factor according to
1860      the dependences.
1861      FORNOW: fail at the first data dependence that we encounter.  */
1862
1863   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1864   if (!ok
1865       || max_vf < min_vf)
1866     {
1867       if (dump_enabled_p ())
1868             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869                              "bad data dependence.\n");
1870       return false;
1871     }
1872
1873   ok = vect_determine_vectorization_factor (loop_vinfo);
1874   if (!ok)
1875     {
1876       if (dump_enabled_p ())
1877         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878                          "can't determine vectorization factor.\n");
1879       return false;
1880     }
       /* The factor just determined must not exceed the maximum allowed by
          the dependence analysis above.  */
1881   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1882     {
1883       if (dump_enabled_p ())
1884         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885                          "bad data dependence.\n");
1886       return false;
1887     }
1888
1889   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1890   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1891   if (!ok)
1892     return false;
1893
1894   /* If there are any SLP instances mark them as pure_slp.  */
1895   bool slp = vect_make_slp_decision (loop_vinfo);
1896   if (slp)
1897     {
1898       /* Find stmts that need to be both vectorized and SLPed.  */
1899       vect_detect_hybrid_slp (loop_vinfo);
1900
1901       /* Update the vectorization factor based on the SLP decision.  */
1902       vect_update_vf_for_slp (loop_vinfo);
1903     }
1904
1905   /* Now the vectorization factor is final.  */
1906   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907   gcc_assert (vectorization_factor != 0);
1908
1909   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1910     dump_printf_loc (MSG_NOTE, vect_location,
1911                      "vectorization_factor = %d, niters = "
1912                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1913                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1914
       /* Reject the loop when either the known iteration count or the
          maximum statement-execution count (if one is available, i.e. not
          -1) is below the vectorization factor.  */
1915   HOST_WIDE_INT max_niter
1916     = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1917   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1918        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1919       || (max_niter != -1
1920           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1921     {
1922       if (dump_enabled_p ())
1923         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924                          "not vectorized: iteration count too small.\n");
1925       if (dump_enabled_p ())
1926         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1927                          "not vectorized: iteration count smaller than "
1928                          "vectorization factor.\n");
1929       return false;
1930     }
1931
1932   /* Analyze the alignment of the data-refs in the loop.
1933      Fail if a data reference is found that cannot be vectorized.  */
1934
1935   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1936   if (!ok)
1937     {
1938       if (dump_enabled_p ())
1939         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940                          "bad data alignment.\n");
1941       return false;
1942     }
1943
1944   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1945      It is important to call pruning after vect_analyze_data_ref_accesses,
1946      since we use grouping information gathered by interleaving analysis.  */
1947   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1948   if (!ok)
1949     {
1950       if (dump_enabled_p ())
1951         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1952                          "number of versioning for alias "
1953                          "run-time tests exceeds %d "
1954                          "(--param vect-max-version-for-alias-checks)\n",
1955                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1956       return false;
1957     }
1958
1959   /* Compute the scalar iteration cost.  */
1960   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1961
1962   /* This pass will decide on using loop versioning and/or loop peeling in
1963      order to enhance the alignment of data references in the loop.  */
1964
1965   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1966   if (!ok)
1967     {
1968       if (dump_enabled_p ())
1969         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1970                          "bad data alignment.\n");
1971       return false;
1972     }
1973
1974   if (slp)
1975     {
1976       /* Analyze operations in the SLP instances.  Note this may
1977          remove unsupported SLP instances which makes the above
1978          SLP kind detection invalid.  */
1979       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1980       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1981                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
           /* Give up if the number of SLP instances changed.  */
1982       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1983         return false;
1984     }
1985
1986   /* Scan all the remaining operations in the loop that are not subject
1987      to SLP and make sure they are vectorizable.  */
1988   ok = vect_analyze_loop_operations (loop_vinfo);
1989   if (!ok)
1990     {
1991       if (dump_enabled_p ())
1992         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1993                          "bad operation or unsupported loop bound.\n");
1994       return false;
1995     }
1996
1997   /* Analyze cost.  Decide if worth while to vectorize.  */
1998   int min_profitable_estimate, min_profitable_iters;
1999   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2000                                       &min_profitable_estimate);
2001
       /* A negative result is treated as "never profitable".  */
2002   if (min_profitable_iters < 0)
2003     {
2004       if (dump_enabled_p ())
2005         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2006                          "not vectorized: vectorization not profitable.\n");
2007       if (dump_enabled_p ())
2008         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009                          "not vectorized: vector version will never be "
2010                          "profitable.\n");
2011       return false;
2012     }
2013
2014   int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2015                                 * vectorization_factor) - 1);
2016
2017   /* Use the cost model only if it is more conservative than user specified
2018      threshold.  */
2019   unsigned th = (unsigned) min_scalar_loop_bound;
2020   if (min_profitable_iters
2021       && (!min_scalar_loop_bound
2022           || min_profitable_iters > min_scalar_loop_bound))
2023     th = (unsigned) min_profitable_iters;
2024
2025   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2026
2027   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2028       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2029     {
2030       if (dump_enabled_p ())
2031         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032                          "not vectorized: vectorization not profitable.\n");
2033       if (dump_enabled_p ())
2034         dump_printf_loc (MSG_NOTE, vect_location,
2035                          "not vectorized: iteration count smaller than user "
2036                          "specified loop bound parameter or minimum profitable "
2037                          "iterations (whichever is more conservative).\n");
2038       return false;
2039     }
2040
       /* Apply the same kind of test to the estimated statement-execution
          count, when one is available (i.e. not -1).  */
2041   HOST_WIDE_INT estimated_niter
2042     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2043   if (estimated_niter != -1
2044       && ((unsigned HOST_WIDE_INT) estimated_niter
2045           <= MAX (th, (unsigned)min_profitable_estimate)))
2046     {
2047       if (dump_enabled_p ())
2048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2049                          "not vectorized: estimated iteration count too "
2050                          "small.\n");
2051       if (dump_enabled_p ())
2052         dump_printf_loc (MSG_NOTE, vect_location,
2053                          "not vectorized: estimated iteration count smaller "
2054                          "than specified loop bound parameter or minimum "
2055                          "profitable iterations (whichever is more "
2056                          "conservative).\n");
2057       return false;
2058     }
2059
2060   /* Decide whether we need to create an epilogue loop to handle
2061      remaining scalar iterations.  TH is recomputed as the cost-model
          threshold plus one, rounded down to a multiple of the
          vectorization factor.  */
2062   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2063         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2064        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2065
       /* With a known iteration count and alignment peeling, an epilogue is
          needed when the iterations left after peeling have fewer trailing
          zero bits than log2 of the vectorization factor, i.e. are not a
          multiple of it (assuming the factor is a power of two).  */
2066   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2067       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2068     {
2069       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2070                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2071           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2072         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2073     }
2074   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2075            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2076                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2077                /* In case of versioning, check if the maximum number of
2078                   iterations is greater than th.  If they are identical,
2079                   the epilogue is unnecessary.  */
2080                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2081                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2082                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2083     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2084
2085   /* If an epilogue loop is required make sure we can create one.  */
2086   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2087       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2088     {
2089       if (dump_enabled_p ())
2090         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2091       if (!vect_can_advance_ivs_p (loop_vinfo)
2092           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2093                                            single_exit (LOOP_VINFO_LOOP
2094                                                          (loop_vinfo))))
2095         {
2096           if (dump_enabled_p ())
2097             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                              "not vectorized: can't create required "
2099                              "epilog loop\n");
2100           return false;
2101         }
2102     }
2103
       /* The factor read after the SLP decision must still match the one
          recorded in LOOP_VINFO.  */
2104   gcc_assert (vectorization_factor
2105               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2106
2107   return true;
2108 }
2109
2110 /* Function vect_analyze_loop.
2111
2112    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2113    for it.  The different analyses will record information in the
2114    loop_vec_info struct.

        Return the loop_vec_info, marked vectorizable, when the analysis
        succeeds.  Return NULL when the enclosing loop is already marked
        vectorizable, when the loop form is rejected, or when analysis fails
        and no retry is possible.  After a non-fatal failure the analysis is
        repeated with the largest remaining vector size from the target's
        autovectorize_vector_sizes mask.  */
2115 loop_vec_info
2116 vect_analyze_loop (struct loop *loop)
2117 {
2118   loop_vec_info loop_vinfo;
2119   unsigned int vector_sizes;
2120
2121   /* Autodetect first vector size we try.  */
2122   current_vector_size = 0;
2123   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2124
2125   if (dump_enabled_p ())
2126     dump_printf_loc (MSG_NOTE, vect_location,
2127                      "===== analyze_loop_nest =====\n");
2128
       /* Nothing to do for a loop whose outer loop already has a
          loop_vec_info marked vectorizable.  */
2129   if (loop_outer (loop)
2130       && loop_vec_info_for_loop (loop_outer (loop))
2131       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2132     {
2133       if (dump_enabled_p ())
2134         dump_printf_loc (MSG_NOTE, vect_location,
2135                          "outer-loop already vectorized.\n");
2136       return NULL;
2137     }
2138
2139   while (1)
2140     {
2141       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2142       loop_vinfo = vect_analyze_loop_form (loop);
2143       if (!loop_vinfo)
2144         {
2145           if (dump_enabled_p ())
2146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                              "bad loop form.\n");
2148           return NULL;
2149         }
2150
2151       bool fatal = false;
2152       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2153         {
2154           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2155
2156           return loop_vinfo;
2157         }
2158
           /* Analysis failed: discard this attempt's loop_vec_info before
              deciding whether to retry.  */
2159       destroy_loop_vec_info (loop_vinfo, true);
2160
           /* Remove the size just tried from the candidate mask.  Stop when
              the failure was fatal, no candidate sizes remain, or
              current_vector_size is still 0 (NOTE(review): presumably no
              vector size was chosen during analysis -- confirm).  */
2161       vector_sizes &= ~current_vector_size;
2162       if (fatal
2163           || vector_sizes == 0
2164           || current_vector_size == 0)
2165         return NULL;
2166
2167       /* Try the next biggest vector size.  */
2168       current_vector_size = 1 << floor_log2 (vector_sizes);
2169       if (dump_enabled_p ())
2170         dump_printf_loc (MSG_NOTE, vect_location,
2171                          "***** Re-trying analysis with "
2172                          "vector size %d\n", current_vector_size);
2173     }
2174 }
2175
2176
2177 /* Function reduction_code_for_scalar_code
2178
2179    Input:
2180    CODE - tree_code of a reduction operation.
2181
2182    Output:
2183    REDUC_CODE - the corresponding tree-code to be used to reduce the
2184       vector of partial results into a single scalar result, or ERROR_MARK
2185       if the operation is a supported reduction operation, but does not have
2186       such a tree-code.  *REDUC_CODE is left untouched when FALSE is
           returned.
2187
2188    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2189
2190 static bool
2191 reduction_code_for_scalar_code (enum tree_code code,
2192                                 enum tree_code *reduc_code)
2193 {
2194   switch (code)
2195     {
2196       case MAX_EXPR:
2197         *reduc_code = REDUC_MAX_EXPR;
2198         return true;
2199
2200       case MIN_EXPR:
2201         *reduc_code = REDUC_MIN_EXPR;
2202         return true;
2203
2204       case PLUS_EXPR:
2205         *reduc_code = REDUC_PLUS_EXPR;
2206         return true;
2207
           /* Accepted as reductions, but reported with ERROR_MARK because no
              reduction tree code is assigned for them here.  */
2208       case MULT_EXPR:
2209       case MINUS_EXPR:
2210       case BIT_IOR_EXPR:
2211       case BIT_XOR_EXPR:
2212       case BIT_AND_EXPR:
2213         *reduc_code = ERROR_MARK;
2214         return true;
2215
2216       default:
2217        return false;
2218     }
2219 }
2220
2221
2222 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2223    STMT is printed with a message MSG.

        The output is MSG at vect_location, then STMT dumped with TDF_SLIM,
        then a newline; MSG_TYPE is passed as the dump kind to each of the
        three dump calls.  */
2224
2225 static void
2226 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2227 {
2228   dump_printf_loc (msg_type, vect_location, "%s", msg);
2229   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2230   dump_printf (msg_type, "\n");
2231 }
2232
2233
2234 /* Detect SLP reduction of the form:
2235
2236    #a1 = phi <a5, a0>
2237    a2 = operation (a1)
2238    a3 = operation (a2)
2239    a4 = operation (a3)
2240    a5 = operation (a4)
2241
2242    #a = phi <a5>
2243
2244    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2245    FIRST_STMT is the first reduction stmt in the chain
2246    (a2 = operation (a1)).
2247
2248    Return TRUE if a reduction chain was detected.  */
2249
2250 static bool
2251 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2252                        gimple *first_stmt)
2253 {
2254   struct loop *loop = (gimple_bb (phi))->loop_father;
2255   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2256   enum tree_code code;
2257   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2258   stmt_vec_info use_stmt_info, current_stmt_info;
2259   tree lhs;
2260   imm_use_iterator imm_iter;
2261   use_operand_p use_p;
2262   int nloop_uses, size = 0, n_out_of_loop_uses;
2263   bool found = false;
2264
2265   if (loop != vect_loop)
2266     return false;
2267
2268   lhs = PHI_RESULT (phi);
2269   code = gimple_assign_rhs_code (first_stmt);
2270   while (1)
2271     {
2272       nloop_uses = 0;
2273       n_out_of_loop_uses = 0;
2274       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2275         {
2276           gimple *use_stmt = USE_STMT (use_p);
2277           if (is_gimple_debug (use_stmt))
2278             continue;
2279
2280           /* Check if we got back to the reduction phi.  */
2281           if (use_stmt == phi)
2282             {
2283               loop_use_stmt = use_stmt;
2284               found = true;
2285               break;
2286             }
2287
2288           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2289             {
2290               loop_use_stmt = use_stmt;
2291               nloop_uses++;
2292             }
2293            else
2294              n_out_of_loop_uses++;
2295
2296            /* There can be either a single use in the loop or two uses in
2297               phi nodes.  */
2298            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2299              return false;
2300         }
2301
2302       if (found)
2303         break;
2304
2305       /* We reached a statement with no loop uses.  */
2306       if (nloop_uses == 0)
2307         return false;
2308
2309       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2310       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2311         return false;
2312
2313       if (!is_gimple_assign (loop_use_stmt)
2314           || code != gimple_assign_rhs_code (loop_use_stmt)
2315           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2316         return false;
2317
2318       /* Insert USE_STMT into reduction chain.  */
2319       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2320       if (current_stmt)
2321         {
2322           current_stmt_info = vinfo_for_stmt (current_stmt);
2323           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2324           GROUP_FIRST_ELEMENT (use_stmt_info)
2325             = GROUP_FIRST_ELEMENT (current_stmt_info);
2326         }
2327       else
2328         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2329
2330       lhs = gimple_assign_lhs (loop_use_stmt);
2331       current_stmt = loop_use_stmt;
2332       size++;
2333    }
2334
2335   if (!found || loop_use_stmt != phi || size < 2)
2336     return false;
2337
2338   /* Swap the operands, if needed, to make the reduction operand be the second
2339      operand.  */
2340   lhs = PHI_RESULT (phi);
2341   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2342   while (next_stmt)
2343     {
2344       if (gimple_assign_rhs2 (next_stmt) == lhs)
2345         {
2346           tree op = gimple_assign_rhs1 (next_stmt);
2347           gimple *def_stmt = NULL;
2348
2349           if (TREE_CODE (op) == SSA_NAME)
2350             def_stmt = SSA_NAME_DEF_STMT (op);
2351
2352           /* Check that the other def is either defined in the loop
2353              ("vect_internal_def"), or it's an induction (defined by a
2354              loop-header phi-node).  */
2355           if (def_stmt
2356               && gimple_bb (def_stmt)
2357               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2358               && (is_gimple_assign (def_stmt)
2359                   || is_gimple_call (def_stmt)
2360                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2361                            == vect_induction_def
2362                   || (gimple_code (def_stmt) == GIMPLE_PHI
2363                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2364                                   == vect_internal_def
2365                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2366             {
2367               lhs = gimple_assign_lhs (next_stmt);
2368               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2369               continue;
2370             }
2371
2372           return false;
2373         }
2374       else
2375         {
2376           tree op = gimple_assign_rhs2 (next_stmt);
2377           gimple *def_stmt = NULL;
2378
2379           if (TREE_CODE (op) == SSA_NAME)
2380             def_stmt = SSA_NAME_DEF_STMT (op);
2381
2382           /* Check that the other def is either defined in the loop
2383             ("vect_internal_def"), or it's an induction (defined by a
2384             loop-header phi-node).  */
2385           if (def_stmt
2386               && gimple_bb (def_stmt)
2387               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2388               && (is_gimple_assign (def_stmt)
2389                   || is_gimple_call (def_stmt)
2390                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2391                               == vect_induction_def
2392                   || (gimple_code (def_stmt) == GIMPLE_PHI
2393                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2394                                   == vect_internal_def
2395                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2396             {
2397               if (dump_enabled_p ())
2398                 {
2399                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2400                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2401                   dump_printf (MSG_NOTE, "\n");
2402                 }
2403
2404               swap_ssa_operands (next_stmt,
2405                                  gimple_assign_rhs1_ptr (next_stmt),
2406                                  gimple_assign_rhs2_ptr (next_stmt));
2407               update_stmt (next_stmt);
2408
2409               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2410                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2411             }
2412           else
2413             return false;
2414         }
2415
2416       lhs = gimple_assign_lhs (next_stmt);
2417       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2418     }
2419
2420   /* Save the chain for further analysis in SLP detection.  */
2421   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2422   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2423   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2424
2425   return true;
2426 }
2427
2428
2429 /* Function vect_is_simple_reduction
2430
2431    (1) Detect a cross-iteration def-use cycle that represents a simple
2432    reduction computation.  We look for the following pattern:
2433
2434    loop_header:
2435      a1 = phi < a0, a2 >
2436      a3 = ...
2437      a2 = operation (a3, a1)
2438
2439    or
2440
2441    a3 = ...
2442    loop_header:
2443      a1 = phi < a0, a2 >
2444      a2 = operation (a3, a1)
2445
2446    such that:
2447    1. operation is commutative and associative and it is safe to
2448       change the order of the computation (if CHECK_REDUCTION is true)
2449    2. no uses for a2 in the loop (a2 is used out of the loop)
2450    3. no uses of a1 in the loop besides the reduction operation
2451    4. no uses of a1 outside the loop.
2452
2453    Conditions 1,4 are tested here.
2454    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2455
2456    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2457    nested cycles, if CHECK_REDUCTION is false.
2458
2459    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2460    reductions:
2461
2462      a1 = phi < a0, a2 >
2463      inner loop (def of a3)
2464      a2 = phi < a3 >
2465
2466    (4) Detect condition expressions, i.e.:
2467      for (int i = 0; i < N; i++)
2468        if (a[i] < val)
2469         ret_val = a[i];
2470

        Arguments:
        LOOP_INFO is the loop_vec_info of the loop being vectorized.
        PHI is the phi node whose latch argument is analyzed; the loop
          examined is the one containing PHI.
        CHECK_REDUCTION must be true exactly when that loop is the loop being
          vectorized, and false when it is nested inside it (asserted below).
        *DOUBLE_REDUC is cleared on entry and set when a double reduction (3)
          is detected.
        NEED_WRAPPING_INTEGRAL_OVERFLOW, together with CHECK_REDUCTION, makes
          integral reductions fail when the type does not wrap on overflow and
          the operation can overflow.
        *V_REDUC_TYPE is set to TREE_CODE_REDUCTION on entry and to
          COND_REDUCTION when CHECK_REDUCTION is set and the operation is a
          COND_EXPR.

        Returns the statement defining the latch argument of PHI when one of
        the patterns above is recognized, NULL otherwise.  The operands of that
        statement may be swapped in place so that the PHI result becomes the
        second operand.

2471 */
2472
2473 static gimple *
2474 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2475                           bool check_reduction, bool *double_reduc,
2476                           bool need_wrapping_integral_overflow,
2477                           enum vect_reduction_type *v_reduc_type)
2478 {
2479   struct loop *loop = (gimple_bb (phi))->loop_father;
2480   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2481   edge latch_e = loop_latch_edge (loop);
2482   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2483   gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2484   enum tree_code orig_code, code;
2485   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2486   tree type;
2487   int nloop_uses;
2488   tree name;
2489   imm_use_iterator imm_iter;
2490   use_operand_p use_p;
2491   bool phi_def;
2492
2493   *double_reduc = false;
2494   *v_reduc_type = TREE_CODE_REDUCTION;
2495
2496   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2497      otherwise, we assume outer loop vectorization.  */
2498   gcc_assert ((check_reduction && loop == vect_loop)
2499               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2500
2501   name = PHI_RESULT (phi);
2502   /* ???  If there are no uses of the PHI result the inner loop reduction
2503      won't be detected as possibly double-reduction by vectorizable_reduction
2504      because that tries to walk the PHI arg from the preheader edge which
2505      can be constant.  See PR60382.  */
2506   if (has_zero_uses (name))
2507     return NULL;
       /* Every non-debug use of the PHI result must be inside LOOP, and at
          most one such use is allowed.  */
2508   nloop_uses = 0;
2509   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2510     {
2511       gimple *use_stmt = USE_STMT (use_p);
2512       if (is_gimple_debug (use_stmt))
2513         continue;
2514
2515       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2516         {
2517           if (dump_enabled_p ())
2518             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2519                              "intermediate value used outside loop.\n");
2520
2521           return NULL;
2522         }
2523
2524       nloop_uses++;
2525       if (nloop_uses > 1)
2526         {
2527           if (dump_enabled_p ())
2528             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2529                              "reduction used in loop.\n");
2530           return NULL;
2531         }
2532     }
2533
2534   if (TREE_CODE (loop_arg) != SSA_NAME)
2535     {
2536       if (dump_enabled_p ())
2537         {
2538           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2539                            "reduction: not ssa_name: ");
2540           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2541           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2542         }
2543       return NULL;
2544     }
2545
2546   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2547   if (!def_stmt)
2548     {
2549       if (dump_enabled_p ())
2550         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551                          "reduction: no def_stmt.\n");
2552       return NULL;
2553     }
2554
2555   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2556     {
2557       if (dump_enabled_p ())
2558         {
2559           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2560           dump_printf (MSG_NOTE, "\n");
2561         }
2562       return NULL;
2563     }
2564
2565   if (is_gimple_assign (def_stmt))
2566     {
2567       name = gimple_assign_lhs (def_stmt);
2568       phi_def = false;
2569     }
2570   else
2571     {
2572       name = PHI_RESULT (def_stmt);
2573       phi_def = true;
2574     }
2575
       /* NAME is now the value defined by DEF_STMT, i.e. the one flowing around
          the latch edge.  It may have at most one non-debug use inside LOOP;
          uses outside LOOP are not counted here.  */
2576   nloop_uses = 0;
2577   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2578     {
2579       gimple *use_stmt = USE_STMT (use_p);
2580       if (is_gimple_debug (use_stmt))
2581         continue;
2582       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2583         nloop_uses++;
2584       if (nloop_uses > 1)
2585         {
2586           if (dump_enabled_p ())
2587             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2588                              "reduction used in loop.\n");
2589           return NULL;
2590         }
2591     }
2592
2593   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2594      defined in the inner loop.  */
2595   if (phi_def)
2596     {
2597       op1 = PHI_ARG_DEF (def_stmt, 0);
2598
2599       if (gimple_phi_num_args (def_stmt) != 1
2600           || TREE_CODE (op1) != SSA_NAME)
2601         {
2602           if (dump_enabled_p ())
2603             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2604                              "unsupported phi node definition.\n");
2605
2606           return NULL;
2607         }
2608
2609       def1 = SSA_NAME_DEF_STMT (op1);
2610       if (gimple_bb (def1)
2611           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2612           && loop->inner
2613           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2614           && is_gimple_assign (def1))
2615         {
2616           if (dump_enabled_p ())
2617             report_vect_op (MSG_NOTE, def_stmt,
2618                             "detected double reduction: ");
2619
2620           *double_reduc = true;
2621           return def_stmt;
2622         }
2623
2624       return NULL;
2625     }
2626
2627   code = orig_code = gimple_assign_rhs_code (def_stmt);
2628
2629   /* We can handle "res -= x[i]", which is non-associative by
2630      simply rewriting this into "res += -x[i]".  Avoid changing
2631      gimple instruction for the first simple tests and only do this
2632      if we're allowed to change code at all.  */
2633   if (code == MINUS_EXPR
2634       && (op1 = gimple_assign_rhs1 (def_stmt))
2635       && TREE_CODE (op1) == SSA_NAME
2636       && SSA_NAME_DEF_STMT (op1) == phi)
2637     code = PLUS_EXPR;
2638
2639   if (check_reduction)
2640     {
2641       if (code == COND_EXPR)
2642         *v_reduc_type = COND_REDUCTION;
2643       else if (!commutative_tree_code (code) || !associative_tree_code (code))
2644         {
2645           if (dump_enabled_p ())
2646             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2647                             "reduction: not commutative/associative: ");
2648           return NULL;
2649         }
2650     }
2651
       /* Collect the operands.  COND_EXPR is the only non-binary operation
          accepted: the operands of its condition go to OP3/OP4 (OP4 only when
          the condition is a comparison) and its two value arms to OP1/OP2.  */
2652   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2653     {
2654       if (code != COND_EXPR)
2655         {
2656           if (dump_enabled_p ())
2657             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2658                             "reduction: not binary operation: ");
2659
2660           return NULL;
2661         }
2662
2663       op3 = gimple_assign_rhs1 (def_stmt);
2664       if (COMPARISON_CLASS_P (op3))
2665         {
2666           op4 = TREE_OPERAND (op3, 1);
2667           op3 = TREE_OPERAND (op3, 0);
2668         }
2669
2670       op1 = gimple_assign_rhs2 (def_stmt);
2671       op2 = gimple_assign_rhs3 (def_stmt);
2672
2673       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2674         {
2675           if (dump_enabled_p ())
2676             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2677                             "reduction: uses not ssa_names: ");
2678
2679           return NULL;
2680         }
2681     }
2682   else
2683     {
2684       op1 = gimple_assign_rhs1 (def_stmt);
2685       op2 = gimple_assign_rhs2 (def_stmt);
2686
2687       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2688         {
2689           if (dump_enabled_p ())
2690             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2691                             "reduction: uses not ssa_names: ");
2692
2693           return NULL;
2694         }
2695    }
2696
2697   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2698   if ((TREE_CODE (op1) == SSA_NAME
2699        && !types_compatible_p (type,TREE_TYPE (op1)))
2700       || (TREE_CODE (op2) == SSA_NAME
2701           && !types_compatible_p (type, TREE_TYPE (op2)))
2702       || (op3 && TREE_CODE (op3) == SSA_NAME
2703           && !types_compatible_p (type, TREE_TYPE (op3)))
2704       || (op4 && TREE_CODE (op4) == SSA_NAME
2705           && !types_compatible_p (type, TREE_TYPE (op4))))
2706     {
2707       if (dump_enabled_p ())
2708         {
2709           dump_printf_loc (MSG_NOTE, vect_location,
2710                            "reduction: multiple types: operation type: ");
2711           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2712           dump_printf (MSG_NOTE, ", operands types: ");
2713           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2714                              TREE_TYPE (op1));
2715           dump_printf (MSG_NOTE, ",");
2716           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2717                              TREE_TYPE (op2));
2718           if (op3)
2719             {
2720               dump_printf (MSG_NOTE, ",");
2721               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2722                                  TREE_TYPE (op3));
2723             }
2724
2725           if (op4)
2726             {
2727               dump_printf (MSG_NOTE, ",");
2728               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2729                                  TREE_TYPE (op4));
2730             }
2731           dump_printf (MSG_NOTE, "\n");
2732         }
2733
2734       return NULL;
2735     }
2736
2737   /* Check that it's ok to change the order of the computation.
2738      Generally, when vectorizing a reduction we change the order of the
2739      computation.  This may change the behavior of the program in some
2740      cases, so we need to check that this is ok.  One exception is when
2741      vectorizing an outer-loop: the inner-loop is executed sequentially,
2742      and therefore vectorizing reductions in the inner-loop during
2743      outer-loop vectorization is safe.  */
2744
2745   if (*v_reduc_type != COND_REDUCTION)
2746     {
2747       /* CHECKME: check for !flag_finite_math_only too?  */
2748       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2749           && check_reduction)
2750         {
2751           /* Changing the order of operations changes the semantics.  */
2752           if (dump_enabled_p ())
2753             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2754                         "reduction: unsafe fp math optimization: ");
2755           return NULL;
2756         }
2757       else if (INTEGRAL_TYPE_P (type) && check_reduction)
2758         {
2759           if (!operation_no_trapping_overflow (type, code))
2760             {
2761               /* Changing the order of operations changes the semantics.  */
2762               if (dump_enabled_p ())
2763                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2764                                 "reduction: unsafe int math optimization"
2765                                 " (overflow traps): ");
2766               return NULL;
2767             }
2768           if (need_wrapping_integral_overflow
2769               && !TYPE_OVERFLOW_WRAPS (type)
2770               && operation_can_overflow (code))
2771             {
2772               /* Changing the order of operations changes the semantics.  */
2773               if (dump_enabled_p ())
2774                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2775                                 "reduction: unsafe int math optimization"
2776                                 " (overflow doesn't wrap): ");
2777               return NULL;
2778             }
2779         }
2780       else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2781         {
2782           /* Changing the order of operations changes the semantics.  */
2783           if (dump_enabled_p ())
2784           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2785                           "reduction: unsafe fixed-point math optimization: ");
2786           return NULL;
2787         }
2788     }
2789
2790   /* Reduction is safe. We're dealing with one of the following:
2791      1) integer arithmetic and no trapv
2792      2) floating point arithmetic, and special flags permit this optimization
2793      3) nested cycle (i.e., outer loop vectorization).  */
2794   if (TREE_CODE (op1) == SSA_NAME)
2795     def1 = SSA_NAME_DEF_STMT (op1);
2796
2797   if (TREE_CODE (op2) == SSA_NAME)
2798     def2 = SSA_NAME_DEF_STMT (op2);
2799
2800   if (code != COND_EXPR
2801       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2802     {
2803       if (dump_enabled_p ())
2804         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2805       return NULL;
2806     }
2807
2808   /* Check that one def is the reduction def, defined by PHI,
2809      the other def is either defined in the loop ("vect_internal_def"),
2810      or it's an induction (defined by a loop-header phi-node).  */
2811
2812   if (def2 && def2 == phi
2813       && (code == COND_EXPR
2814           || !def1 || gimple_nop_p (def1)
2815           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2816           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2817               && (is_gimple_assign (def1)
2818                   || is_gimple_call (def1)
2819                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2820                       == vect_induction_def
2821                   || (gimple_code (def1) == GIMPLE_PHI
2822                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2823                           == vect_internal_def
2824                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2825     {
2826       if (dump_enabled_p ())
2827         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2828       return def_stmt;
2829     }
2830
2831   if (def1 && def1 == phi
2832       && (code == COND_EXPR
2833           || !def2 || gimple_nop_p (def2)
2834           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2835           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2836               && (is_gimple_assign (def2)
2837                   || is_gimple_call (def2)
2838                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2839                       == vect_induction_def
2840                   || (gimple_code (def2) == GIMPLE_PHI
2841                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2842                           == vect_internal_def
2843                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2844     {
           /* Here the PHI result is the first operand.  The operands are
              swapped only when CHECK_REDUCTION is set and the statement was
              not originally a MINUS_EXPR.  */
2845       if (check_reduction
2846           && orig_code != MINUS_EXPR)
2847         {
2848           if (code == COND_EXPR)
2849             {
2850               /* No current known use where this case would be useful.  */
2851               if (dump_enabled_p ())
2852                 report_vect_op (MSG_NOTE, def_stmt,
2853                                 "detected reduction: cannot currently swap "
2854                                 "operands for cond_expr");
2855               return NULL;
2856             }
2857
2858           /* Swap operands (just for simplicity - so that the rest of the code
2859              can assume that the reduction variable is always the last (second)
2860              argument).  */
2861           if (dump_enabled_p ())
2862             report_vect_op (MSG_NOTE, def_stmt,
2863                             "detected reduction: need to swap operands: ");
2864
2865           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2866                              gimple_assign_rhs2_ptr (def_stmt));
2867
2868           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2869             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2870         }
2871       else
2872         {
2873           if (dump_enabled_p ())
2874             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2875         }
2876
2877       return def_stmt;
2878     }
2879
2880   /* Try to find SLP reduction chain.  */
2881   if (check_reduction && code != COND_EXPR
2882       && vect_is_slp_reduction (loop_info, phi, def_stmt))
2883     {
2884       if (dump_enabled_p ())
2885         report_vect_op (MSG_NOTE, def_stmt,
2886                         "reduction: detected reduction chain: ");
2887
2888       return def_stmt;
2889     }
2890
2891   if (dump_enabled_p ())
2892     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2893                     "reduction: unknown pattern: ");
2894
2895   return NULL;
2896 }
2897
2898 /* Externally visible front end to vect_is_simple_reduction (which may
2899    swap statement operands in place).  All arguments are forwarded; the
2900    reduction kind reported by the callee is received and discarded.  */
2901
2902 gimple *
2903 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2904                              bool check_reduction, bool *double_reduc,
2905                              bool need_wrapping_integral_overflow)
2906 {
2907   enum vect_reduction_type unused_kind;
2908   gimple *reduc_stmt
2909     = vect_is_simple_reduction (loop_info, phi, check_reduction, double_reduc,
2910                                 need_wrapping_integral_overflow, &unused_kind);
2911   return reduc_stmt;
2912 }
2913
2914 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.

        When the iteration count of LOOP_VINFO is not known at compile time,
        *PEEL_ITERS_EPILOGUE is set to VF/2 and one taken branch is recorded
        for the prologue and one for the epilogue.  Otherwise
        PEEL_ITERS_PROLOGUE is clamped to the iteration count and
        *PEEL_ITERS_EPILOGUE is the number of left-over iterations modulo VF
        (or VF itself when peeling for gaps is required and that remainder
        is zero).

        Each entry of SCALAR_COST_VEC is then recorded, scaled by the
        respective number of peeled iterations, in PROLOGUE_COST_VEC and
        EPILOGUE_COST_VEC.

        Returns the sum of the values returned by record_stmt_cost for
        everything recorded here.  */
2915 int
2916 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2917                              int *peel_iters_epilogue,
2918                              stmt_vector_for_cost *scalar_cost_vec,
2919                              stmt_vector_for_cost *prologue_cost_vec,
2920                              stmt_vector_for_cost *epilogue_cost_vec)
2921 {
2922   int retval = 0;
2923   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2924
2925   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2926     {
2927       *peel_iters_epilogue = vf/2;
2928       if (dump_enabled_p ())
2929         dump_printf_loc (MSG_NOTE, vect_location,
2930                          "cost model: epilogue peel iters set to vf/2 "
2931                          "because loop iterations are unknown .\n");
2932
2933       /* If peeled iterations are known but number of scalar loop
2934          iterations are unknown, count a taken branch per peeled loop.
              The epilogue branch is recorded in EPILOGUE_COST_VEC and its
              cost is added to, rather than replacing, the prologue one.  */
2935       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2936                                  NULL, 0, vect_prologue);
2937       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
2938                                   NULL, 0, vect_epilogue);
2939     }
2940   else
2941     {
2942       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
           /* Never peel more prologue iterations than the loop executes.  */
2943       peel_iters_prologue = niters < peel_iters_prologue ?
2944                             niters : peel_iters_prologue;
2945       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2946       /* If we need to peel for gaps, but no peeling is required, we have to
2947          peel VF iterations.  */
2948       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2949         *peel_iters_epilogue = vf;
2950     }
2951
       /* Charge one copy of the scalar iteration cost per peeled iteration.  */
2952   stmt_info_for_cost *si;
2953   int j;
2954   if (peel_iters_prologue)
2955     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2956       retval += record_stmt_cost (prologue_cost_vec,
2957                                   si->count * peel_iters_prologue,
2958                                   si->kind, NULL, si->misalign,
2959                                   vect_prologue);
2960   if (*peel_iters_epilogue)
2961     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2962       retval += record_stmt_cost (epilogue_cost_vec,
2963                                   si->count * *peel_iters_epilogue,
2964                                   si->kind, NULL, si->misalign,
2965                                   vect_epilogue);
2966
2967   return retval;
2968 }
2969
2970 /* Function vect_estimate_min_profitable_iters
2971
2972    Return the number of iterations required for the vector version of the
2973    loop to be profitable relative to the cost of the scalar version of the
2974    loop.  */
2975
2976 static void
2977 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2978                                     int *ret_min_profitable_niters,
2979                                     int *ret_min_profitable_estimate)
2980 {
2981   int min_profitable_iters;
2982   int min_profitable_estimate;
2983   int peel_iters_prologue;
2984   int peel_iters_epilogue;
2985   unsigned vec_inside_cost = 0;
2986   int vec_outside_cost = 0;
2987   unsigned vec_prologue_cost = 0;
2988   unsigned vec_epilogue_cost = 0;
2989   int scalar_single_iter_cost = 0;
2990   int scalar_outside_cost = 0;
2991   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2992   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2993   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2994
2995   /* Cost model disabled.  */
2996   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2997     {
2998       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2999       *ret_min_profitable_niters = 0;
3000       *ret_min_profitable_estimate = 0;
3001       return;
3002     }
3003
3004   /* Requires loop versioning tests to handle misalignment.  */
3005   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3006     {
3007       /*  FIXME: Make cost depend on complexity of individual check.  */
3008       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3009       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3010                             vect_prologue);
3011       dump_printf (MSG_NOTE,
3012                    "cost model: Adding cost of checks for loop "
3013                    "versioning to treat misalignment.\n");
3014     }
3015
3016   /* Requires loop versioning with alias checks.  */
3017   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3018     {
3019       /*  FIXME: Make cost depend on complexity of individual check.  */
3020       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3021       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3022                             vect_prologue);
3023       dump_printf (MSG_NOTE,
3024                    "cost model: Adding cost of checks for loop "
3025                    "versioning aliasing.\n");
3026     }
3027
3028   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3029       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3030     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3031                           vect_prologue);
3032
3033   /* Count statements in scalar loop.  Using this as scalar cost for a single
3034      iteration for now.
3035
3036      TODO: Add outer loop support.
3037
3038      TODO: Consider assigning different costs to different scalar
3039      statements.  */
3040
3041   scalar_single_iter_cost
3042     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3043
3044   /* Add additional cost for the peeled instructions in prologue and epilogue
3045      loop.
3046
3047      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3048      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3049
3050      TODO: Build an expression that represents peel_iters for prologue and
3051      epilogue to be used in a run-time test.  */
3052
3053   if (npeel  < 0)
3054     {
3055       peel_iters_prologue = vf/2;
3056       dump_printf (MSG_NOTE, "cost model: "
3057                    "prologue peel iters set to vf/2.\n");
3058
3059       /* If peeling for alignment is unknown, loop bound of main loop becomes
3060          unknown.  */
3061       peel_iters_epilogue = vf/2;
3062       dump_printf (MSG_NOTE, "cost model: "
3063                    "epilogue peel iters set to vf/2 because "
3064                    "peeling for alignment is unknown.\n");
3065
3066       /* If peeled iterations are unknown, count a taken branch and a not taken
3067          branch per peeled loop. Even if scalar loop iterations are known,
3068          vector iterations are not known since peeled prologue iterations are
3069          not known. Hence guards remain the same.  */
3070       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3071                             NULL, 0, vect_prologue);
3072       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3073                             NULL, 0, vect_prologue);
3074       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3075                             NULL, 0, vect_epilogue);
3076       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3077                             NULL, 0, vect_epilogue);
3078       stmt_info_for_cost *si;
3079       int j;
3080       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3081         {
3082           struct _stmt_vec_info *stmt_info
3083             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3084           (void) add_stmt_cost (target_cost_data,
3085                                 si->count * peel_iters_prologue,
3086                                 si->kind, stmt_info, si->misalign,
3087                                 vect_prologue);
3088           (void) add_stmt_cost (target_cost_data,
3089                                 si->count * peel_iters_epilogue,
3090                                 si->kind, stmt_info, si->misalign,
3091                                 vect_epilogue);
3092         }
3093     }
3094   else
3095     {
3096       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3097       stmt_info_for_cost *si;
3098       int j;
3099       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3100
3101       prologue_cost_vec.create (2);
3102       epilogue_cost_vec.create (2);
3103       peel_iters_prologue = npeel;
3104
3105       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3106                                           &peel_iters_epilogue,
3107                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3108                                             (loop_vinfo),
3109                                           &prologue_cost_vec,
3110                                           &epilogue_cost_vec);
3111
3112       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3113         {
3114           struct _stmt_vec_info *stmt_info
3115             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3116           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3117                                 si->misalign, vect_prologue);
3118         }
3119
3120       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3121         {
3122           struct _stmt_vec_info *stmt_info
3123             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3124           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3125                                 si->misalign, vect_epilogue);
3126         }
3127
3128       prologue_cost_vec.release ();
3129       epilogue_cost_vec.release ();
3130     }
3131
3132   /* FORNOW: The scalar outside cost is incremented in one of the
3133      following ways:
3134
3135      1. The vectorizer checks for alignment and aliasing and generates
3136      a condition that allows dynamic vectorization.  A cost model
3137      check is ANDED with the versioning condition.  Hence scalar code
3138      path now has the added cost of the versioning check.
3139
3140        if (cost > th & versioning_check)
3141          jmp to vector code
3142
3143      Hence run-time scalar is incremented by not-taken branch cost.
3144
3145      2. The vectorizer then checks if a prologue is required.  If the
3146      cost model check was not done before during versioning, it has to
3147      be done before the prologue check.
3148
3149        if (cost <= th)
3150          prologue = scalar_iters
3151        if (prologue == 0)
3152          jmp to vector code
3153        else
3154          execute prologue
3155        if (prologue == num_iters)
3156          go to exit
3157
3158      Hence the run-time scalar cost is incremented by a taken branch,
3159      plus a not-taken branch, plus a taken branch cost.
3160
3161      3. The vectorizer then checks if an epilogue is required.  If the
3162      cost model check was not done before during prologue check, it
3163      has to be done with the epilogue check.
3164
3165        if (prologue == 0)
3166          jmp to vector code
3167        else
3168          execute prologue
3169        if (prologue == num_iters)
3170          go to exit
3171        vector code:
3172          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3173            jmp to epilogue
3174
3175      Hence the run-time scalar cost should be incremented by 2 taken
3176      branches.
3177
3178      TODO: The back end may reorder the BBS's differently and reverse
3179      conditions/branch directions.  Change the estimates below to
3180      something more reasonable.  */
3181
3182   /* If the number of iterations is known and we do not do versioning, we can
3183      decide whether to vectorize at compile time.  Hence the scalar version
3184      do not carry cost model guard costs.  */
3185   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3186       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3187       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3188     {
3189       /* Cost model check occurs at versioning.  */
3190       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3191           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3192         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3193       else
3194         {
3195           /* Cost model check occurs at prologue generation.  */
3196           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3197             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3198               + vect_get_stmt_cost (cond_branch_not_taken);
3199           /* Cost model check occurs at epilogue generation.  */
3200           else
3201             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3202         }
3203     }
3204
3205   /* Complete the target-specific cost calculations.  */
3206   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3207                &vec_inside_cost, &vec_epilogue_cost);
3208
3209   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3210
3211   if (dump_enabled_p ())
3212     {
3213       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3214       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3215                    vec_inside_cost);
3216       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3217                    vec_prologue_cost);
3218       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3219                    vec_epilogue_cost);
3220       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3221                    scalar_single_iter_cost);
3222       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3223                    scalar_outside_cost);
3224       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3225                    vec_outside_cost);
3226       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3227                    peel_iters_prologue);
3228       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3229                    peel_iters_epilogue);
3230     }
3231
3232   /* Calculate number of iterations required to make the vector version
3233      profitable, relative to the loop bodies only.  The following condition
3234      must hold true:
3235      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3236      where
3237      SIC = scalar iteration cost, VIC = vector iteration cost,
3238      VOC = vector outside cost, VF = vectorization factor,
3239      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3240      SOC = scalar outside cost for run time cost model check.  */
3241
3242   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3243     {
3244       if (vec_outside_cost <= 0)
3245         min_profitable_iters = 1;
3246       else
3247         {
3248           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3249                                   - vec_inside_cost * peel_iters_prologue
3250                                   - vec_inside_cost * peel_iters_epilogue)
3251                                  / ((scalar_single_iter_cost * vf)
3252                                     - vec_inside_cost);
3253
3254           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3255               <= (((int) vec_inside_cost * min_profitable_iters)
3256                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3257             min_profitable_iters++;
3258         }
3259     }
3260   /* vector version will never be profitable.  */
3261   else
3262     {
3263       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3264         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3265                     "did not happen for a simd loop");
3266
3267       if (dump_enabled_p ())
3268         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3269                          "cost model: the vector iteration cost = %d "
3270                          "divided by the scalar iteration cost = %d "
3271                          "is greater or equal to the vectorization factor = %d"
3272                          ".\n",
3273                          vec_inside_cost, scalar_single_iter_cost, vf);
3274       *ret_min_profitable_niters = -1;
3275       *ret_min_profitable_estimate = -1;
3276       return;
3277     }
3278
3279   dump_printf (MSG_NOTE,
3280                "  Calculated minimum iters for profitability: %d\n",
3281                min_profitable_iters);
3282
3283   min_profitable_iters =
3284         min_profitable_iters < vf ? vf : min_profitable_iters;
3285
3286   /* Because the condition we create is:
3287      if (niters <= min_profitable_iters)
3288        then skip the vectorized loop.  */
3289   min_profitable_iters--;
3290
3291   if (dump_enabled_p ())
3292     dump_printf_loc (MSG_NOTE, vect_location,
3293                      "  Runtime profitability threshold = %d\n",
3294                      min_profitable_iters);
3295
3296   *ret_min_profitable_niters = min_profitable_iters;
3297
3298   /* Calculate number of iterations required to make the vector version
3299      profitable, relative to the loop bodies only.
3300
3301      Non-vectorized variant is SIC * niters and it must win over vector
3302      variant on the expected loop trip count.  The following condition must hold true:
3303      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3304
3305   if (vec_outside_cost <= 0)
3306     min_profitable_estimate = 1;
3307   else
3308     {
3309       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3310                                  - vec_inside_cost * peel_iters_prologue
3311                                  - vec_inside_cost * peel_iters_epilogue)
3312                                  / ((scalar_single_iter_cost * vf)
3313                                    - vec_inside_cost);
3314     }
3315   min_profitable_estimate --;
3316   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3317   if (dump_enabled_p ())
3318     dump_printf_loc (MSG_NOTE, vect_location,
3319                      "  Static estimate profitability threshold = %d\n",
3320                       min_profitable_iters);
3321
3322   *ret_min_profitable_estimate = min_profitable_estimate;
3323 }
3324
3325 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3326    vector elements (not bits) for a vector of mode MODE.  */
3327 static void
3328 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3329                               unsigned char *sel)
3330 {
3331   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3332
3333   for (i = 0; i < nelt; i++)
3334     sel[i] = (i + offset) & (2*nelt - 1);
3335 }
3336
3337 /* Checks whether the target supports whole-vector shifts for vectors of mode
3338    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3339    it supports vec_perm_const with masks for all necessary shift amounts.  */
3340 static bool
3341 have_whole_vector_shift (enum machine_mode mode)
3342 {
3343   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3344     return true;
3345
3346   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3347     return false;
3348
3349   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3350   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3351
3352   for (i = nelt/2; i >= 1; i/=2)
3353     {
3354       calc_vec_perm_mask_for_shift (mode, i, sel);
3355       if (!can_vec_perm_p (mode, false, sel))
3356         return false;
3357     }
3358   return true;
3359 }
3360
3361 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3362
3363 static tree
3364 get_reduction_op (gimple *stmt, int reduc_index)
3365 {
3366   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3367     {
3368     case GIMPLE_SINGLE_RHS:
3369       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3370                   == ternary_op);
3371       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3372     case GIMPLE_UNARY_RHS:
3373       return gimple_assign_rhs1 (stmt);
3374     case GIMPLE_BINARY_RHS:
3375       return (reduc_index
3376               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3377     case GIMPLE_TERNARY_RHS:
3378       return gimple_op (stmt, reduc_index + 1);
3379     default:
3380       gcc_unreachable ();
3381     }
3382 }
3383
3384 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3385    functions. Design better to avoid maintenance issues.  */
3386
3387 /* Function vect_model_reduction_cost.
3388
3389    Models cost for a reduction operation, including the vector ops
3390    generated within the strip-mine loop, the initial definition before
3391    the loop, and the epilogue code that must be generated.  */
3392
3393 static bool
3394 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3395                            int ncopies, int reduc_index)
3396 {
3397   int prologue_cost = 0, epilogue_cost = 0;
3398   enum tree_code code;
3399   optab optab;
3400   tree vectype;
3401   gimple *stmt, *orig_stmt;
3402   tree reduction_op;
3403   machine_mode mode;
3404   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3405   struct loop *loop = NULL;
3406   void *target_cost_data;
3407
3408   if (loop_vinfo)
3409     {
3410       loop = LOOP_VINFO_LOOP (loop_vinfo);
3411       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3412     }
3413   else
3414     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3415
3416   /* Condition reductions generate two reductions in the loop.  */
3417   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3418     ncopies *= 2;
3419
3420   /* Cost of reduction op inside loop.  */
3421   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3422                                         stmt_info, 0, vect_body);
3423   stmt = STMT_VINFO_STMT (stmt_info);
3424
3425   reduction_op = get_reduction_op (stmt, reduc_index);
3426
3427   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3428   if (!vectype)
3429     {
3430       if (dump_enabled_p ())
3431         {
3432           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3433                            "unsupported data-type ");
3434           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3435                              TREE_TYPE (reduction_op));
3436           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3437         }
3438       return false;
3439    }
3440
3441   mode = TYPE_MODE (vectype);
3442   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3443
3444   if (!orig_stmt)
3445     orig_stmt = STMT_VINFO_STMT (stmt_info);
3446
3447   code = gimple_assign_rhs_code (orig_stmt);
3448
3449   /* Add in cost for initial definition.
3450      For cond reduction we have four vectors: initial index, step, initial
3451      result of the data reduction, initial value of the index reduction.  */
3452   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3453                        == COND_REDUCTION ? 4 : 1;
3454   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3455                                   scalar_to_vec, stmt_info, 0,
3456                                   vect_prologue);
3457
3458   /* Determine cost of epilogue code.
3459
3460      We have a reduction operator that will reduce the vector in one statement.
3461      Also requires scalar extract.  */
3462
3463   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3464     {
3465       if (reduc_code != ERROR_MARK)
3466         {
3467           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3468             {
3469               /* An EQ stmt and an COND_EXPR stmt.  */
3470               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3471                                               vector_stmt, stmt_info, 0,
3472                                               vect_epilogue);
3473               /* Reduction of the max index and a reduction of the found
3474                  values.  */
3475               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3476                                               vec_to_scalar, stmt_info, 0,
3477                                               vect_epilogue);
3478               /* A broadcast of the max value.  */
3479               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3480                                               scalar_to_vec, stmt_info, 0,
3481                                               vect_epilogue);
3482             }
3483           else
3484             {
3485               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3486                                               stmt_info, 0, vect_epilogue);
3487               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3488                                               vec_to_scalar, stmt_info, 0,
3489                                               vect_epilogue);
3490             }
3491         }
3492       else
3493         {
3494           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3495           tree bitsize =
3496             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3497           int element_bitsize = tree_to_uhwi (bitsize);
3498           int nelements = vec_size_in_bits / element_bitsize;
3499
3500           optab = optab_for_tree_code (code, vectype, optab_default);
3501
3502           /* We have a whole vector shift available.  */
3503           if (VECTOR_MODE_P (mode)
3504               && optab_handler (optab, mode) != CODE_FOR_nothing
3505               && have_whole_vector_shift (mode))
3506             {
3507               /* Final reduction via vector shifts and the reduction operator.
3508                  Also requires scalar extract.  */
3509               epilogue_cost += add_stmt_cost (target_cost_data,
3510                                               exact_log2 (nelements) * 2,
3511                                               vector_stmt, stmt_info, 0,
3512                                               vect_epilogue);
3513               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3514                                               vec_to_scalar, stmt_info, 0,
3515                                               vect_epilogue);
3516             }
3517           else
3518             /* Use extracts and reduction op for final reduction.  For N
3519                elements, we have N extracts and N-1 reduction ops.  */
3520             epilogue_cost += add_stmt_cost (target_cost_data,
3521                                             nelements + nelements - 1,
3522                                             vector_stmt, stmt_info, 0,
3523                                             vect_epilogue);
3524         }
3525     }
3526
3527   if (dump_enabled_p ())
3528     dump_printf (MSG_NOTE,
3529                  "vect_model_reduction_cost: inside_cost = %d, "
3530                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3531                  prologue_cost, epilogue_cost);
3532
3533   return true;
3534 }
3535
3536
3537 /* Function vect_model_induction_cost.
3538
3539    Models cost for induction operations.  */
3540
3541 static void
3542 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3543 {
3544   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3545   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3546   unsigned inside_cost, prologue_cost;
3547
3548   /* loop cost for vec_loop.  */
3549   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3550                                stmt_info, 0, vect_body);
3551
3552   /* prologue cost for vec_init and vec_step.  */
3553   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3554                                  stmt_info, 0, vect_prologue);
3555
3556   if (dump_enabled_p ())
3557     dump_printf_loc (MSG_NOTE, vect_location,
3558                      "vect_model_induction_cost: inside_cost = %d, "
3559                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3560 }
3561
3562
3563 /* Function get_initial_def_for_induction
3564
3565    Input:
3566    STMT - a stmt that performs an induction operation in the loop.
3567    IV_PHI - the initial value of the induction variable
3568
3569    Output:
3570    Return a vector variable, initialized with the first VF values of
3571    the induction variable.  E.g., for an iv with IV_PHI='X' and
3572    evolution S, for a vector of 4 units, we want to return:
3573    [X, X + S, X + 2*S, X + 3*S].  */
3574
3575 static tree
3576 get_initial_def_for_induction (gimple *iv_phi)
3577 {
3578   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3579   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3580   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3581   tree vectype;
3582   int nunits;
3583   edge pe = loop_preheader_edge (loop);
3584   struct loop *iv_loop;
3585   basic_block new_bb;
3586   tree new_vec, vec_init, vec_step, t;
3587   tree new_name;
3588   gimple *new_stmt;
3589   gphi *induction_phi;
3590   tree induc_def, vec_def, vec_dest;
3591   tree init_expr, step_expr;
3592   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3593   int i;
3594   int ncopies;
3595   tree expr;
3596   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3597   bool nested_in_vect_loop = false;
3598   gimple_seq stmts;
3599   imm_use_iterator imm_iter;
3600   use_operand_p use_p;
3601   gimple *exit_phi;
3602   edge latch_e;
3603   tree loop_arg;
3604   gimple_stmt_iterator si;
3605   basic_block bb = gimple_bb (iv_phi);
3606   tree stepvectype;
3607   tree resvectype;
3608
3609   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3610   if (nested_in_vect_loop_p (loop, iv_phi))
3611     {
3612       nested_in_vect_loop = true;
3613       iv_loop = loop->inner;
3614     }
3615   else
3616     iv_loop = loop;
3617   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3618
3619   latch_e = loop_latch_edge (iv_loop);
3620   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3621
3622   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3623   gcc_assert (step_expr != NULL_TREE);
3624
3625   pe = loop_preheader_edge (iv_loop);
3626   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3627                                      loop_preheader_edge (iv_loop));
3628
3629   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3630   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3631   gcc_assert (vectype);
3632   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3633   ncopies = vf / nunits;
3634
3635   gcc_assert (phi_info);
3636   gcc_assert (ncopies >= 1);
3637
3638   /* Convert the step to the desired type.  */
3639   stmts = NULL;
3640   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3641   if (stmts)
3642     {
3643       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3644       gcc_assert (!new_bb);
3645     }
3646
3647   /* Find the first insertion point in the BB.  */
3648   si = gsi_after_labels (bb);
3649
3650   /* Create the vector that holds the initial_value of the induction.  */
3651   if (nested_in_vect_loop)
3652     {
3653       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3654          been created during vectorization of previous stmts.  We obtain it
3655          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3656       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3657       /* If the initial value is not of proper type, convert it.  */
3658       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3659         {
3660           new_stmt
3661             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3662                                                           vect_simple_var,
3663                                                           "vec_iv_"),
3664                                    VIEW_CONVERT_EXPR,
3665                                    build1 (VIEW_CONVERT_EXPR, vectype,
3666                                            vec_init));
3667           vec_init = gimple_assign_lhs (new_stmt);
3668           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3669                                                  new_stmt);
3670           gcc_assert (!new_bb);
3671           set_vinfo_for_stmt (new_stmt,
3672                               new_stmt_vec_info (new_stmt, loop_vinfo));
3673         }
3674     }
3675   else
3676     {
3677       vec<constructor_elt, va_gc> *v;
3678
3679       /* iv_loop is the loop to be vectorized. Create:
3680          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3681       stmts = NULL;
3682       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3683
3684       vec_alloc (v, nunits);
3685       bool constant_p = is_gimple_min_invariant (new_name);
3686       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3687       for (i = 1; i < nunits; i++)
3688         {
3689           /* Create: new_name_i = new_name + step_expr  */
3690           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3691                                    new_name, step_expr);
3692           if (!is_gimple_min_invariant (new_name))
3693             constant_p = false;
3694           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3695         }
3696       if (stmts)
3697         {
3698           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3699           gcc_assert (!new_bb);
3700         }
3701
3702       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3703       if (constant_p)
3704         new_vec = build_vector_from_ctor (vectype, v);
3705       else
3706         new_vec = build_constructor (vectype, v);
3707       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3708     }
3709
3710
3711   /* Create the vector that holds the step of the induction.  */
3712   if (nested_in_vect_loop)
3713     /* iv_loop is nested in the loop to be vectorized. Generate:
3714        vec_step = [S, S, S, S]  */
3715     new_name = step_expr;
3716   else
3717     {
3718       /* iv_loop is the loop to be vectorized. Generate:
3719           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3720       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3721         {
3722           expr = build_int_cst (integer_type_node, vf);
3723           expr = fold_convert (TREE_TYPE (step_expr), expr);
3724         }
3725       else
3726         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3727       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3728                               expr, step_expr);
3729       if (TREE_CODE (step_expr) == SSA_NAME)
3730         new_name = vect_init_vector (iv_phi, new_name,
3731                                      TREE_TYPE (step_expr), NULL);
3732     }
3733
3734   t = unshare_expr (new_name);
3735   gcc_assert (CONSTANT_CLASS_P (new_name)
3736               || TREE_CODE (new_name) == SSA_NAME);
3737   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3738   gcc_assert (stepvectype);
3739   new_vec = build_vector_from_val (stepvectype, t);
3740   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3741
3742
3743   /* Create the following def-use cycle:
3744      loop prolog:
3745          vec_init = ...
3746          vec_step = ...
3747      loop:
3748          vec_iv = PHI <vec_init, vec_loop>
3749          ...
3750          STMT
3751          ...
3752          vec_loop = vec_iv + vec_step;  */
3753
3754   /* Create the induction-phi that defines the induction-operand.  */
3755   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3756   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3757   set_vinfo_for_stmt (induction_phi,
3758                       new_stmt_vec_info (induction_phi, loop_vinfo));
3759   induc_def = PHI_RESULT (induction_phi);
3760
3761   /* Create the iv update inside the loop  */
3762   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3763   vec_def = make_ssa_name (vec_dest, new_stmt);
3764   gimple_assign_set_lhs (new_stmt, vec_def);
3765   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3766   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3767
3768   /* Set the arguments of the phi node:  */
3769   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3770   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3771                UNKNOWN_LOCATION);
3772
3773
3774   /* In case that vectorization factor (VF) is bigger than the number
3775      of elements that we can fit in a vectype (nunits), we have to generate
3776      more than one vector stmt - i.e - we need to "unroll" the
3777      vector stmt by a factor VF/nunits.  For more details see documentation
3778      in vectorizable_operation.  */
3779
3780   if (ncopies > 1)
3781     {
3782       stmt_vec_info prev_stmt_vinfo;
3783       /* FORNOW. This restriction should be relaxed.  */
3784       gcc_assert (!nested_in_vect_loop);
3785
3786       /* Create the vector that holds the step of the induction.  */
3787       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3788         {
3789           expr = build_int_cst (integer_type_node, nunits);
3790           expr = fold_convert (TREE_TYPE (step_expr), expr);
3791         }
3792       else
3793         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3794       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3795                               expr, step_expr);
3796       if (TREE_CODE (step_expr) == SSA_NAME)
3797         new_name = vect_init_vector (iv_phi, new_name,
3798                                      TREE_TYPE (step_expr), NULL);
3799       t = unshare_expr (new_name);
3800       gcc_assert (CONSTANT_CLASS_P (new_name)
3801                   || TREE_CODE (new_name) == SSA_NAME);
3802       new_vec = build_vector_from_val (stepvectype, t);
3803       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3804
3805       vec_def = induc_def;
3806       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3807       for (i = 1; i < ncopies; i++)
3808         {
3809           /* vec_i = vec_prev + vec_step  */
3810           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3811                                           vec_def, vec_step);
3812           vec_def = make_ssa_name (vec_dest, new_stmt);
3813           gimple_assign_set_lhs (new_stmt, vec_def);
3814
3815           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3816           if (!useless_type_conversion_p (resvectype, vectype))
3817             {
3818               new_stmt
3819                 = gimple_build_assign
3820                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3821                                                 "vec_iv_"),
3822                          VIEW_CONVERT_EXPR,
3823                          build1 (VIEW_CONVERT_EXPR, resvectype,
3824                                  gimple_assign_lhs (new_stmt)));
3825               gimple_assign_set_lhs (new_stmt,
3826                                      make_ssa_name
3827                                        (gimple_assign_lhs (new_stmt), new_stmt));
3828               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3829             }
3830           set_vinfo_for_stmt (new_stmt,
3831                               new_stmt_vec_info (new_stmt, loop_vinfo));
3832           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3833           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3834         }
3835     }
3836
3837   if (nested_in_vect_loop)
3838     {
3839       /* Find the loop-closed exit-phi of the induction, and record
3840          the final vector of induction results:  */
3841       exit_phi = NULL;
3842       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3843         {
3844           gimple *use_stmt = USE_STMT (use_p);
3845           if (is_gimple_debug (use_stmt))
3846             continue;
3847
3848           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3849             {
3850               exit_phi = use_stmt;
3851               break;
3852             }
3853         }
3854       if (exit_phi)
3855         {
3856           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3857           /* FORNOW. Currently not supporting the case that an inner-loop induction
3858              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3859           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3860                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3861
3862           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3863           if (dump_enabled_p ())
3864             {
3865               dump_printf_loc (MSG_NOTE, vect_location,
3866                                "vector of inductions after inner-loop:");
3867               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3868               dump_printf (MSG_NOTE, "\n");
3869             }
3870         }
3871     }
3872
3873
3874   if (dump_enabled_p ())
3875     {
3876       dump_printf_loc (MSG_NOTE, vect_location,
3877                        "transform induction: created def-use cycle: ");
3878       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3879       dump_printf (MSG_NOTE, "\n");
3880       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3881                         SSA_NAME_DEF_STMT (vec_def), 0);
3882       dump_printf (MSG_NOTE, "\n");
3883     }
3884
3885   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3886   if (!useless_type_conversion_p (resvectype, vectype))
3887     {
3888       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3889                                                              vect_simple_var,
3890                                                              "vec_iv_"),
3891                                       VIEW_CONVERT_EXPR,
3892                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3893                                               induc_def));
3894       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3895       gimple_assign_set_lhs (new_stmt, induc_def);
3896       si = gsi_after_labels (bb);
3897       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3898       set_vinfo_for_stmt (new_stmt,
3899                           new_stmt_vec_info (new_stmt, loop_vinfo));
3900       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3901         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3902     }
3903
3904   return induc_def;
3905 }
3906
3907
3908 /* Function get_initial_def_for_reduction
3909
3910    Input:
3911    STMT - a stmt that performs a reduction operation in the loop.
3912    INIT_VAL - the initial value of the reduction variable
3913
3914    Output:
3915    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3916         of the reduction (used for adjusting the epilog - see below).
3917    Return a vector variable, initialized according to the operation that STMT
3918         performs. This vector will be used as the initial value of the
3919         vector of partial results.
3920
3921    Option1 (adjust in epilog): Initialize the vector as follows:
3922      add/bit or/xor:    [0,0,...,0,0]
3923      mult/bit and:      [1,1,...,1,1]
3924      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3925    and when necessary (e.g. add/mult case) let the caller know
3926    that it needs to adjust the result by init_val.
3927
3928    Option2: Initialize the vector as follows:
3929      add/bit or/xor:    [init_val,0,0,...,0]
3930      mult/bit and:      [init_val,1,1,...,1]
3931      min/max/cond_expr: [init_val,init_val,...,init_val]
3932    and no adjustments are needed.
3933
3934    For example, for the following code:
3935
3936    s = init_val;
3937    for (i=0;i<n;i++)
3938      s = s + a[i];
3939
3940    STMT is 's = s + a[i]', and the reduction variable is 's'.
3941    For a vector of 4 units, we want to return either [0,0,0,init_val],
3942    or [0,0,0,0] and let the caller know that it needs to adjust
3943    the result at the end by 'init_val'.
3944
3945    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3946    initialization vector is simpler (same element in all entries), if
3947    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3948
3949    A cost model should help decide between these two schemes.  */
3950
3951 tree
3952 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3953                                tree *adjustment_def)
3954 {
3955   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3956   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3957   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3958   tree scalar_type = TREE_TYPE (init_val);
3959   tree vectype = get_vectype_for_scalar_type (scalar_type);
3960   int nunits;
3961   enum tree_code code = gimple_assign_rhs_code (stmt);
3962   tree def_for_init;
3963   tree init_def;
3964   tree *elts;
3965   int i;
3966   bool nested_in_vect_loop = false;
3967   tree init_value;
3968   REAL_VALUE_TYPE real_init_val = dconst0;
3969   int int_init_val = 0;
3970   gimple *def_stmt = NULL;
3971
3972   gcc_assert (vectype);
3973   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3974
3975   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3976               || SCALAR_FLOAT_TYPE_P (scalar_type));
3977
3978   if (nested_in_vect_loop_p (loop, stmt))
3979     nested_in_vect_loop = true;
3980   else
3981     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3982
3983   /* In case of double reduction we only create a vector variable to be put
3984      in the reduction phi node.  The actual statement creation is done in
3985      vect_create_epilog_for_reduction.  */
3986   if (adjustment_def && nested_in_vect_loop
3987       && TREE_CODE (init_val) == SSA_NAME
3988       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3989       && gimple_code (def_stmt) == GIMPLE_PHI
3990       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3991       && vinfo_for_stmt (def_stmt)
3992       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3993           == vect_double_reduction_def)
3994     {
3995       *adjustment_def = NULL;
3996       return vect_create_destination_var (init_val, vectype);
3997     }
3998
3999   if (TREE_CONSTANT (init_val))
4000     {
4001       if (SCALAR_FLOAT_TYPE_P (scalar_type))
4002         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
4003       else
4004         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
4005     }
4006   else
4007     init_value = init_val;
4008
4009   switch (code)
4010     {
4011       case WIDEN_SUM_EXPR:
4012       case DOT_PROD_EXPR:
4013       case SAD_EXPR:
4014       case PLUS_EXPR:
4015       case MINUS_EXPR:
4016       case BIT_IOR_EXPR:
4017       case BIT_XOR_EXPR:
4018       case MULT_EXPR:
4019       case BIT_AND_EXPR:
4020         /* ADJUSMENT_DEF is NULL when called from
4021            vect_create_epilog_for_reduction to vectorize double reduction.  */
4022         if (adjustment_def)
4023           {
4024             if (nested_in_vect_loop)
4025               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
4026             else
4027               *adjustment_def = init_val;
4028           }
4029
4030         if (code == MULT_EXPR)
4031           {
4032             real_init_val = dconst1;
4033             int_init_val = 1;
4034           }
4035
4036         if (code == BIT_AND_EXPR)
4037           int_init_val = -1;
4038
4039         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4040           def_for_init = build_real (scalar_type, real_init_val);
4041         else
4042           def_for_init = build_int_cst (scalar_type, int_init_val);
4043
4044         /* Create a vector of '0' or '1' except the first element.  */
4045         elts = XALLOCAVEC (tree, nunits);
4046         for (i = nunits - 2; i >= 0; --i)
4047           elts[i + 1] = def_for_init;
4048
4049         /* Option1: the first element is '0' or '1' as well.  */
4050         if (adjustment_def)
4051           {
4052             elts[0] = def_for_init;
4053             init_def = build_vector (vectype, elts);
4054             break;
4055           }
4056
4057         /* Option2: the first element is INIT_VAL.  */
4058         elts[0] = init_val;
4059         if (TREE_CONSTANT (init_val))
4060           init_def = build_vector (vectype, elts);
4061         else
4062           {
4063             vec<constructor_elt, va_gc> *v;
4064             vec_alloc (v, nunits);
4065             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4066             for (i = 1; i < nunits; ++i)
4067               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4068             init_def = build_constructor (vectype, v);
4069           }
4070
4071         break;
4072
4073       case MIN_EXPR:
4074       case MAX_EXPR:
4075       case COND_EXPR:
4076         if (adjustment_def)
4077           {
4078             *adjustment_def = NULL_TREE;
4079             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4080               {
4081                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4082                 break;
4083               }
4084           }
4085         init_def = build_vector_from_val (vectype, init_value);
4086         break;
4087
4088       default:
4089         gcc_unreachable ();
4090     }
4091
4092   return init_def;
4093 }
4094
4095 /* Function vect_create_epilog_for_reduction
4096
4097    Create code at the loop-epilog to finalize the result of a reduction
4098    computation.
4099
   VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
     reduction statements.
4102    STMT is the scalar reduction stmt that is being vectorized.
4103    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4104      number of elements that we can fit in a vectype (nunits).  In this case
4105      we have to generate more than one vector stmt - i.e - we need to "unroll"
4106      the vector stmt by a factor VF/nunits.  For more details see documentation
4107      in vectorizable_operation.
4108    REDUC_CODE is the tree-code for the epilog reduction.
4109    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4110      computation.
4111    REDUC_INDEX is the index of the operand in the right hand side of the
4112      statement that is defined by REDUCTION_PHI.
4113    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4114    SLP_NODE is an SLP node containing a group of reduction statements. The
4115      first one in this group is STMT.
4116    INDUCTION_INDEX is the index of the loop for condition reductions.
4117      Otherwise it is undefined.
4118
4119    This function:
4120    1. Creates the reduction def-use cycles: sets the arguments for
4121       REDUCTION_PHIS:
4122       The loop-entry argument is the vectorized initial-value of the reduction.
4123       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4124       sums.
4125    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4126       by applying the operation specified by REDUC_CODE if available, or by
4127       other means (whole-vector shifts or a scalar loop).
4128       The function also creates a new phi node at the loop exit to preserve
4129       loop-closed form, as illustrated below.
4130
4131      The flow at the entry to this function:
4132
4133         loop:
4134           vec_def = phi <null, null>            # REDUCTION_PHI
4135           VECT_DEF = vector_stmt                # vectorized form of STMT
4136           s_loop = scalar_stmt                  # (scalar) STMT
4137         loop_exit:
4138           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4139           use <s_out0>
4140           use <s_out0>
4141
4142      The above is transformed by this function into:
4143
4144         loop:
4145           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4146           VECT_DEF = vector_stmt                # vectorized form of STMT
4147           s_loop = scalar_stmt                  # (scalar) STMT
4148         loop_exit:
4149           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4150           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4151           v_out2 = reduce <v_out1>
4152           s_out3 = extract_field <v_out2, 0>
4153           s_out4 = adjust_result <s_out3>
4154           use <s_out4>
4155           use <s_out4>
4156 */
4157
4158 static void
4159 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4160                                   int ncopies, enum tree_code reduc_code,
4161                                   vec<gimple *> reduction_phis,
4162                                   int reduc_index, bool double_reduc,
4163                                   slp_tree slp_node, tree induction_index)
4164 {
4165   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4166   stmt_vec_info prev_phi_info;
4167   tree vectype;
4168   machine_mode mode;
4169   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4170   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4171   basic_block exit_bb;
4172   tree scalar_dest;
4173   tree scalar_type;
4174   gimple *new_phi = NULL, *phi;
4175   gimple_stmt_iterator exit_gsi;
4176   tree vec_dest;
4177   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4178   gimple *epilog_stmt = NULL;
4179   enum tree_code code = gimple_assign_rhs_code (stmt);
4180   gimple *exit_phi;
4181   tree bitsize;
4182   tree adjustment_def = NULL;
4183   tree vec_initial_def = NULL;
4184   tree reduction_op, expr, def, initial_def = NULL;
4185   tree orig_name, scalar_result;
4186   imm_use_iterator imm_iter, phi_imm_iter;
4187   use_operand_p use_p, phi_use_p;
4188   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4189   bool nested_in_vect_loop = false;
4190   auto_vec<gimple *> new_phis;
4191   auto_vec<gimple *> inner_phis;
4192   enum vect_def_type dt = vect_unknown_def_type;
4193   int j, i;
4194   auto_vec<tree> scalar_results;
4195   unsigned int group_size = 1, k, ratio;
4196   auto_vec<tree> vec_initial_defs;
4197   auto_vec<gimple *> phis;
4198   bool slp_reduc = false;
4199   tree new_phi_result;
4200   gimple *inner_phi = NULL;
4201
4202   if (slp_node)
4203     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4204
4205   if (nested_in_vect_loop_p (loop, stmt))
4206     {
4207       outer_loop = loop;
4208       loop = loop->inner;
4209       nested_in_vect_loop = true;
4210       gcc_assert (!slp_node);
4211     }
4212
4213   reduction_op = get_reduction_op (stmt, reduc_index);
4214
4215   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4216   gcc_assert (vectype);
4217   mode = TYPE_MODE (vectype);
4218
4219   /* 1. Create the reduction def-use cycle:
4220      Set the arguments of REDUCTION_PHIS, i.e., transform
4221
4222         loop:
4223           vec_def = phi <null, null>            # REDUCTION_PHI
4224           VECT_DEF = vector_stmt                # vectorized form of STMT
4225           ...
4226
4227      into:
4228
4229         loop:
4230           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4231           VECT_DEF = vector_stmt                # vectorized form of STMT
4232           ...
4233
4234      (in case of SLP, do it for all the phis). */
4235
4236   /* Get the loop-entry arguments.  */
4237   if (slp_node)
4238     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4239                        NULL, slp_node, reduc_index);
4240   else
4241     {
4242       /* Get at the scalar def before the loop, that defines the initial value
4243          of the reduction variable.  */
4244       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4245       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4246                                            loop_preheader_edge (loop));
4247       vec_initial_defs.create (1);
4248       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4249                                                        &adjustment_def);
4250       vec_initial_defs.quick_push (vec_initial_def);
4251     }
4252
4253   /* Set phi nodes arguments.  */
4254   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4255     {
4256       tree vec_init_def, def;
4257       gimple_seq stmts;
4258       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4259                                            true, NULL_TREE);
4260       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4261       def = vect_defs[i];
4262       for (j = 0; j < ncopies; j++)
4263         {
4264           /* Set the loop-entry arg of the reduction-phi.  */
4265
4266           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4267               == INTEGER_INDUC_COND_REDUCTION)
4268             {
              /* Initialise the reduction phi to zero.  This prevents non-zero
                 initial values from interfering with the reduction op.  */
4271               gcc_assert (ncopies == 1);
4272               gcc_assert (i == 0);
4273
4274               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4275               tree zero_vec = build_zero_cst (vec_init_def_type);
4276
4277               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4278                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4279             }
4280           else
4281             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4282                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4283
4284           /* Set the loop-latch arg for the reduction-phi.  */
4285           if (j > 0)
4286             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4287
4288           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4289                        UNKNOWN_LOCATION);
4290
4291           if (dump_enabled_p ())
4292             {
4293               dump_printf_loc (MSG_NOTE, vect_location,
4294                                "transform reduction: created def-use cycle: ");
4295               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4296               dump_printf (MSG_NOTE, "\n");
4297               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4298               dump_printf (MSG_NOTE, "\n");
4299             }
4300
4301           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4302         }
4303     }
4304
4305   /* 2. Create epilog code.
4306         The reduction epilog code operates across the elements of the vector
4307         of partial results computed by the vectorized loop.
4308         The reduction epilog code consists of:
4309
4310         step 1: compute the scalar result in a vector (v_out2)
4311         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4312         step 3: adjust the scalar result (s_out3) if needed.
4313
        Step 1 can be accomplished using one of the following three schemes:
4315           (scheme 1) using reduc_code, if available.
4316           (scheme 2) using whole-vector shifts, if available.
4317           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4318                      combined.
4319
4320           The overall epilog code looks like this:
4321
4322           s_out0 = phi <s_loop>         # original EXIT_PHI
4323           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4324           v_out2 = reduce <v_out1>              # step 1
4325           s_out3 = extract_field <v_out2, 0>    # step 2
4326           s_out4 = adjust_result <s_out3>       # step 3
4327
4328           (step 3 is optional, and steps 1 and 2 may be combined).
4329           Lastly, the uses of s_out0 are replaced by s_out4.  */
4330
4331
4332   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4333          v_out1 = phi <VECT_DEF>
4334          Store them in NEW_PHIS.  */
4335
4336   exit_bb = single_exit (loop)->dest;
4337   prev_phi_info = NULL;
4338   new_phis.create (vect_defs.length ());
4339   FOR_EACH_VEC_ELT (vect_defs, i, def)
4340     {
4341       for (j = 0; j < ncopies; j++)
4342         {
4343           tree new_def = copy_ssa_name (def);
4344           phi = create_phi_node (new_def, exit_bb);
4345           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4346           if (j == 0)
4347             new_phis.quick_push (phi);
4348           else
4349             {
4350               def = vect_get_vec_def_for_stmt_copy (dt, def);
4351               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4352             }
4353
4354           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4355           prev_phi_info = vinfo_for_stmt (phi);
4356         }
4357     }
4358
4359   /* The epilogue is created for the outer-loop, i.e., for the loop being
4360      vectorized.  Create exit phis for the outer loop.  */
4361   if (double_reduc)
4362     {
4363       loop = outer_loop;
4364       exit_bb = single_exit (loop)->dest;
4365       inner_phis.create (vect_defs.length ());
4366       FOR_EACH_VEC_ELT (new_phis, i, phi)
4367         {
4368           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4369           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4370           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4371                            PHI_RESULT (phi));
4372           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4373                                                             loop_vinfo));
4374           inner_phis.quick_push (phi);
4375           new_phis[i] = outer_phi;
4376           prev_phi_info = vinfo_for_stmt (outer_phi);
4377           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4378             {
4379               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4380               new_result = copy_ssa_name (PHI_RESULT (phi));
4381               outer_phi = create_phi_node (new_result, exit_bb);
4382               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4383                                PHI_RESULT (phi));
4384               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4385                                                                 loop_vinfo));
4386               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4387               prev_phi_info = vinfo_for_stmt (outer_phi);
4388             }
4389         }
4390     }
4391
4392   exit_gsi = gsi_after_labels (exit_bb);
4393
4394   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4395          (i.e. when reduc_code is not available) and in the final adjustment
4396          code (if needed).  Also get the original scalar reduction variable as
4397          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4398          represents a reduction pattern), the tree-code and scalar-def are
4399          taken from the original stmt that the pattern-stmt (STMT) replaces.
4400          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4401          are taken from STMT.  */
4402
4403   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4404   if (!orig_stmt)
4405     {
4406       /* Regular reduction  */
4407       orig_stmt = stmt;
4408     }
4409   else
4410     {
4411       /* Reduction pattern  */
4412       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4413       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4414       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4415     }
4416
4417   code = gimple_assign_rhs_code (orig_stmt);
4418   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4419      partial results are added and not subtracted.  */
4420   if (code == MINUS_EXPR)
4421     code = PLUS_EXPR;
4422
4423   scalar_dest = gimple_assign_lhs (orig_stmt);
4424   scalar_type = TREE_TYPE (scalar_dest);
4425   scalar_results.create (group_size);
4426   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4427   bitsize = TYPE_SIZE (scalar_type);
4428
4429   /* In case this is a reduction in an inner-loop while vectorizing an outer
4430      loop - we don't need to extract a single scalar result at the end of the
4431      inner-loop (unless it is double reduction, i.e., the use of reduction is
4432      outside the outer-loop).  The final vector of partial results will be used
4433      in the vectorized outer-loop, or reduced to a scalar result at the end of
4434      the outer-loop.  */
4435   if (nested_in_vect_loop && !double_reduc)
4436     goto vect_finalize_reduction;
4437
4438   /* SLP reduction without reduction chain, e.g.,
4439      # a1 = phi <a2, a0>
4440      # b1 = phi <b2, b0>
4441      a2 = operation (a1)
4442      b2 = operation (b1)  */
4443   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4444
4445   /* In case of reduction chain, e.g.,
4446      # a1 = phi <a3, a0>
4447      a2 = operation (a1)
4448      a3 = operation (a2),
4449
4450      we may end up with more than one vector result.  Here we reduce them to
4451      one vector.  */
4452   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4453     {
4454       tree first_vect = PHI_RESULT (new_phis[0]);
4455       tree tmp;
4456       gassign *new_vec_stmt = NULL;
4457
4458       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4459       for (k = 1; k < new_phis.length (); k++)
4460         {
4461           gimple *next_phi = new_phis[k];
4462           tree second_vect = PHI_RESULT (next_phi);
4463
4464           tmp = build2 (code, vectype,  first_vect, second_vect);
4465           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4466           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4467           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4468           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4469         }
4470
4471       new_phi_result = first_vect;
4472       if (new_vec_stmt)
4473         {
4474           new_phis.truncate (0);
4475           new_phis.safe_push (new_vec_stmt);
4476         }
4477     }
4478   else
4479     new_phi_result = PHI_RESULT (new_phis[0]);
4480
4481   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482     {
4483       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4484          various data values where the condition matched and another vector
4485          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4486          need to extract the last matching index (which will be the index with
4487          highest value) and use this to index into the data vector.
4488          For the case where there were no matches, the data vector will contain
4489          all default values and the index vector will be all zeros.  */
4490
4491       /* Get various versions of the type of the vector of indexes.  */
4492       tree index_vec_type = TREE_TYPE (induction_index);
4493       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4494       tree index_scalar_type = TREE_TYPE (index_vec_type);
4495       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4496         (index_vec_type);
4497
4498       /* Get an unsigned integer version of the type of the data vector.  */
4499       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4500       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4501       tree vectype_unsigned = build_vector_type
4502         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4503
4504       /* First we need to create a vector (ZERO_VEC) of zeros and another
4505          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4506          can create using a MAX reduction and then expanding.
4507          In the case where the loop never made any matches, the max index will
4508          be zero.  */
4509
4510       /* Vector of {0, 0, 0,...}.  */
4511       tree zero_vec = make_ssa_name (vectype);
4512       tree zero_vec_rhs = build_zero_cst (vectype);
4513       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4514       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4515
4516       /* Find maximum value from the vector of found indexes.  */
4517       tree max_index = make_ssa_name (index_scalar_type);
4518       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4519                                                     induction_index);
4520       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4521
4522       /* Vector of {max_index, max_index, max_index,...}.  */
4523       tree max_index_vec = make_ssa_name (index_vec_type);
4524       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4525                                                       max_index);
4526       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4527                                                         max_index_vec_rhs);
4528       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4529
4530       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4531          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4532          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4533          otherwise.  Only one value should match, resulting in a vector
4534          (VEC_COND) with one data value and the rest zeros.
4535          In the case where the loop never made any matches, every index will
4536          match, resulting in a vector with all data values (which will all be
4537          the default value).  */
4538
4539       /* Compare the max index vector to the vector of found indexes to find
4540          the position of the max value.  */
4541       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4542       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4543                                                       induction_index,
4544                                                       max_index_vec);
4545       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4546
4547       /* Use the compare to choose either values from the data vector or
4548          zero.  */
4549       tree vec_cond = make_ssa_name (vectype);
4550       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4551                                                    vec_compare, new_phi_result,
4552                                                    zero_vec);
4553       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4554
4555       /* Finally we need to extract the data value from the vector (VEC_COND)
4556          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4557          reduction, but because this doesn't exist, we can use a MAX reduction
4558          instead.  The data value might be signed or a float so we need to cast
4559          it first.
4560          In the case where the loop never made any matches, the data values are
4561          all identical, and so will reduce down correctly.  */
4562
4563       /* Make the matched data values unsigned.  */
4564       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4565       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4566                                        vec_cond);
4567       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4568                                                         VIEW_CONVERT_EXPR,
4569                                                         vec_cond_cast_rhs);
4570       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4571
4572       /* Reduce down to a scalar value.  */
4573       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4574       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4575                                       optab_default);
4576       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4577                   != CODE_FOR_nothing);
4578       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4579                                                      REDUC_MAX_EXPR,
4580                                                      vec_cond_cast);
4581       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4582
4583       /* Convert the reduced value back to the result type and set as the
4584          result.  */
4585       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4586                                      data_reduc);
4587       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4588       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4589       gimple_assign_set_lhs (epilog_stmt, new_temp);
4590       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4591       scalar_results.safe_push (new_temp);
4592     }
4593
4594   /* 2.3 Create the reduction code, using one of the three schemes described
4595          above. In SLP we simply need to extract all the elements from the
4596          vector (without reducing them), so we use scalar shifts.  */
4597   else if (reduc_code != ERROR_MARK && !slp_reduc)
4598     {
4599       tree tmp;
4600       tree vec_elem_type;
4601
4602       /*** Case 1:  Create:
4603            v_out2 = reduc_expr <v_out1>  */
4604
4605       if (dump_enabled_p ())
4606         dump_printf_loc (MSG_NOTE, vect_location,
4607                          "Reduce using direct vector reduction.\n");
4608
4609       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4610       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4611         {
4612           tree tmp_dest =
4613               vect_create_destination_var (scalar_dest, vec_elem_type);
4614           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4615           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4616           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4617           gimple_assign_set_lhs (epilog_stmt, new_temp);
4618           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4619
4620           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4621         }
4622       else
4623         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4624
4625       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4626       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4627       gimple_assign_set_lhs (epilog_stmt, new_temp);
4628       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4629
4630       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4631           == INTEGER_INDUC_COND_REDUCTION)
4632         {
4633           /* Earlier we set the initial value to be zero.  Check the result
4634              and if it is zero then replace with the original initial
4635              value.  */
4636           tree zero = build_zero_cst (scalar_type);
4637           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4638
4639           tmp = make_ssa_name (new_scalar_dest);
4640           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4641                                              initial_def, new_temp);
4642           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4643           new_temp = tmp;
4644         }
4645
4646       scalar_results.safe_push (new_temp);
4647     }
4648   else
4649     {
4650       bool reduce_with_shift = have_whole_vector_shift (mode);
4651       int element_bitsize = tree_to_uhwi (bitsize);
4652       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4653       tree vec_temp;
4654
4655       /* Regardless of whether we have a whole vector shift, if we're
4656          emulating the operation via tree-vect-generic, we don't want
4657          to use it.  Only the first round of the reduction is likely
4658          to still be profitable via emulation.  */
4659       /* ??? It might be better to emit a reduction tree code here, so that
4660          tree-vect-generic can expand the first round via bit tricks.  */
4661       if (!VECTOR_MODE_P (mode))
4662         reduce_with_shift = false;
4663       else
4664         {
4665           optab optab = optab_for_tree_code (code, vectype, optab_default);
4666           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4667             reduce_with_shift = false;
4668         }
4669
4670       if (reduce_with_shift && !slp_reduc)
4671         {
4672           int nelements = vec_size_in_bits / element_bitsize;
4673           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4674
4675           int elt_offset;
4676
4677           tree zero_vec = build_zero_cst (vectype);
4678           /*** Case 2: Create:
4679              for (offset = nelements/2; offset >= 1; offset/=2)
4680                 {
4681                   Create:  va' = vec_shift <va, offset>
4682                   Create:  va = vop <va, va'>
4683                 }  */
4684
4685           tree rhs;
4686
4687           if (dump_enabled_p ())
4688             dump_printf_loc (MSG_NOTE, vect_location,
4689                              "Reduce using vector shifts\n");
4690
4691           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4692           new_temp = new_phi_result;
4693           for (elt_offset = nelements / 2;
4694                elt_offset >= 1;
4695                elt_offset /= 2)
4696             {
4697               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4698               tree mask = vect_gen_perm_mask_any (vectype, sel);
4699               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4700                                                  new_temp, zero_vec, mask);
4701               new_name = make_ssa_name (vec_dest, epilog_stmt);
4702               gimple_assign_set_lhs (epilog_stmt, new_name);
4703               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4704
4705               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4706                                                  new_temp);
4707               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4708               gimple_assign_set_lhs (epilog_stmt, new_temp);
4709               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4710             }
4711
4712           /* 2.4  Extract the final scalar result.  Create:
4713              s_out3 = extract_field <v_out2, bitpos>  */
4714
4715           if (dump_enabled_p ())
4716             dump_printf_loc (MSG_NOTE, vect_location,
4717                              "extract scalar result\n");
4718
4719           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4720                         bitsize, bitsize_zero_node);
4721           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4722           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4723           gimple_assign_set_lhs (epilog_stmt, new_temp);
4724           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4725           scalar_results.safe_push (new_temp);
4726         }
4727       else
4728         {
4729           /*** Case 3: Create:
4730              s = extract_field <v_out2, 0>
4731              for (offset = element_size;
4732                   offset < vector_size;
4733                   offset += element_size;)
4734                {
4735                  Create:  s' = extract_field <v_out2, offset>
4736                  Create:  s = op <s, s'>  // For non SLP cases
4737                }  */
4738
4739           if (dump_enabled_p ())
4740             dump_printf_loc (MSG_NOTE, vect_location,
4741                              "Reduce using scalar code.\n");
4742
4743           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4744           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4745             {
4746               int bit_offset;
4747               if (gimple_code (new_phi) == GIMPLE_PHI)
4748                 vec_temp = PHI_RESULT (new_phi);
4749               else
4750                 vec_temp = gimple_assign_lhs (new_phi);
4751               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4752                             bitsize_zero_node);
4753               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4754               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4755               gimple_assign_set_lhs (epilog_stmt, new_temp);
4756               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4757
4758               /* In SLP we don't need to apply reduction operation, so we just
4759                  collect s' values in SCALAR_RESULTS.  */
4760               if (slp_reduc)
4761                 scalar_results.safe_push (new_temp);
4762
4763               for (bit_offset = element_bitsize;
4764                    bit_offset < vec_size_in_bits;
4765                    bit_offset += element_bitsize)
4766                 {
4767                   tree bitpos = bitsize_int (bit_offset);
4768                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4769                                      bitsize, bitpos);
4770
4771                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4772                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4773                   gimple_assign_set_lhs (epilog_stmt, new_name);
4774                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4775
4776                   if (slp_reduc)
4777                     {
4778                       /* In SLP we don't need to apply reduction operation, so
4779                          we just collect s' values in SCALAR_RESULTS.  */
4780                       new_temp = new_name;
4781                       scalar_results.safe_push (new_name);
4782                     }
4783                   else
4784                     {
4785                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4786                                                          new_name, new_temp);
4787                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4788                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4789                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4790                     }
4791                 }
4792             }
4793
4794           /* The only case where we need to reduce scalar results in SLP, is
4795              unrolling.  If the size of SCALAR_RESULTS is greater than
4796              GROUP_SIZE, we reduce them combining elements modulo
4797              GROUP_SIZE.  */
4798           if (slp_reduc)
4799             {
4800               tree res, first_res, new_res;
4801               gimple *new_stmt;
4802
4803               /* Reduce multiple scalar results in case of SLP unrolling.  */
4804               for (j = group_size; scalar_results.iterate (j, &res);
4805                    j++)
4806                 {
4807                   first_res = scalar_results[j % group_size];
4808                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4809                                                   first_res, res);
4810                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4811                   gimple_assign_set_lhs (new_stmt, new_res);
4812                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4813                   scalar_results[j % group_size] = new_res;
4814                 }
4815             }
4816           else
4817             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4818             scalar_results.safe_push (new_temp);
4819         }
4820     }
4821
4822 vect_finalize_reduction:
4823
4824   if (double_reduc)
4825     loop = loop->inner;
4826
4827   /* 2.5 Adjust the final result by the initial value of the reduction
4828          variable. (When such adjustment is not needed, then
4829          'adjustment_def' is zero).  For example, if code is PLUS we create:
4830          new_temp = loop_exit_def + adjustment_def  */
4831
4832   if (adjustment_def)
4833     {
4834       gcc_assert (!slp_reduc);
4835       if (nested_in_vect_loop)
4836         {
4837           new_phi = new_phis[0];
4838           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4839           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4840           new_dest = vect_create_destination_var (scalar_dest, vectype);
4841         }
4842       else
4843         {
4844           new_temp = scalar_results[0];
4845           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4846           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4847           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4848         }
4849
4850       epilog_stmt = gimple_build_assign (new_dest, expr);
4851       new_temp = make_ssa_name (new_dest, epilog_stmt);
4852       gimple_assign_set_lhs (epilog_stmt, new_temp);
4853       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4854       if (nested_in_vect_loop)
4855         {
4856           set_vinfo_for_stmt (epilog_stmt,
4857                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
4858           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4859                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4860
4861           if (!double_reduc)
4862             scalar_results.quick_push (new_temp);
4863           else
4864             scalar_results[0] = new_temp;
4865         }
4866       else
4867         scalar_results[0] = new_temp;
4868
4869       new_phis[0] = epilog_stmt;
4870     }
4871
4872   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4873           phis with new adjusted scalar results, i.e., replace use <s_out0>
4874           with use <s_out4>.
4875
4876      Transform:
4877         loop_exit:
4878           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4879           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4880           v_out2 = reduce <v_out1>
4881           s_out3 = extract_field <v_out2, 0>
4882           s_out4 = adjust_result <s_out3>
4883           use <s_out0>
4884           use <s_out0>
4885
4886      into:
4887
4888         loop_exit:
4889           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4890           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4891           v_out2 = reduce <v_out1>
4892           s_out3 = extract_field <v_out2, 0>
4893           s_out4 = adjust_result <s_out3>
4894           use <s_out4>
4895           use <s_out4> */
4896
4897
4898   /* In SLP reduction chain we reduce vector results into one vector if
4899      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4900      the last stmt in the reduction chain, since we are looking for the loop
4901      exit phi node.  */
4902   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4903     {
4904       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4905       /* Handle reduction patterns.  */
4906       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4907         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4908
4909       scalar_dest = gimple_assign_lhs (dest_stmt);
4910       group_size = 1;
4911     }
4912
4913   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4914      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4915      need to match SCALAR_RESULTS with corresponding statements.  The first
4916      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4917      the first vector stmt, etc.
4918      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4919   if (group_size > new_phis.length ())
4920     {
4921       ratio = group_size / new_phis.length ();
4922       gcc_assert (!(group_size % new_phis.length ()));
4923     }
4924   else
4925     ratio = 1;
4926
4927   for (k = 0; k < group_size; k++)
4928     {
4929       if (k % ratio == 0)
4930         {
4931           epilog_stmt = new_phis[k / ratio];
4932           reduction_phi = reduction_phis[k / ratio];
4933           if (double_reduc)
4934             inner_phi = inner_phis[k / ratio];
4935         }
4936
4937       if (slp_reduc)
4938         {
4939           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4940
4941           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4942           /* SLP statements can't participate in patterns.  */
4943           gcc_assert (!orig_stmt);
4944           scalar_dest = gimple_assign_lhs (current_stmt);
4945         }
4946
4947       phis.create (3);
4948       /* Find the loop-closed-use at the loop exit of the original scalar
4949          result.  (The reduction result is expected to have two immediate uses -
4950          one at the latch block, and one at the loop exit).  */
4951       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4952         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4953             && !is_gimple_debug (USE_STMT (use_p)))
4954           phis.safe_push (USE_STMT (use_p));
4955
4956       /* While we expect to have found an exit_phi because of loop-closed-ssa
4957          form we can end up without one if the scalar cycle is dead.  */
4958
4959       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4960         {
4961           if (outer_loop)
4962             {
4963               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4964               gphi *vect_phi;
4965
4966               /* FORNOW. Currently not supporting the case that an inner-loop
4967                  reduction is not used in the outer-loop (but only outside the
4968                  outer-loop), unless it is double reduction.  */
4969               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4970                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4971                           || double_reduc);
4972
4973               if (double_reduc)
4974                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4975               else
4976                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4977               if (!double_reduc
4978                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4979                       != vect_double_reduction_def)
4980                 continue;
4981
4982               /* Handle double reduction:
4983
4984                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4985                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4986                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4987                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4988
4989                  At that point the regular reduction (stmt2 and stmt3) is
4990                  already vectorized, as well as the exit phi node, stmt4.
4991                  Here we vectorize the phi node of double reduction, stmt1, and
4992                  update all relevant statements.  */
4993
4994               /* Go through all the uses of s2 to find double reduction phi
4995                  node, i.e., stmt1 above.  */
4996               orig_name = PHI_RESULT (exit_phi);
4997               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4998                 {
4999                   stmt_vec_info use_stmt_vinfo;
5000                   stmt_vec_info new_phi_vinfo;
5001                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5002                   basic_block bb = gimple_bb (use_stmt);
5003                   gimple *use;
5004
5005                   /* Check that USE_STMT is really double reduction phi
5006                      node.  */
5007                   if (gimple_code (use_stmt) != GIMPLE_PHI
5008                       || gimple_phi_num_args (use_stmt) != 2
5009                       || bb->loop_father != outer_loop)
5010                     continue;
5011                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5012                   if (!use_stmt_vinfo
5013                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5014                           != vect_double_reduction_def)
5015                     continue;
5016
5017                   /* Create vector phi node for double reduction:
5018                      vs1 = phi <vs0, vs2>
5019                      vs1 was created previously in this function by a call to
5020                        vect_get_vec_def_for_operand and is stored in
5021                        vec_initial_def;
5022                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5023                      vs0 is created here.  */
5024
5025                   /* Create vector phi node.  */
5026                   vect_phi = create_phi_node (vec_initial_def, bb);
5027                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5028                                     loop_vec_info_for_loop (outer_loop));
5029                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5030
5031                   /* Create vs0 - initial def of the double reduction phi.  */
5032                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5033                                              loop_preheader_edge (outer_loop));
5034                   init_def = get_initial_def_for_reduction (stmt,
5035                                                           preheader_arg, NULL);
5036                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5037                                                     vectype, NULL);
5038
5039                   /* Update phi node arguments with vs0 and vs2.  */
5040                   add_phi_arg (vect_phi, vect_phi_init,
5041                                loop_preheader_edge (outer_loop),
5042                                UNKNOWN_LOCATION);
5043                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5044                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5045                   if (dump_enabled_p ())
5046                     {
5047                       dump_printf_loc (MSG_NOTE, vect_location,
5048                                        "created double reduction phi node: ");
5049                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5050                       dump_printf (MSG_NOTE, "\n");
5051                     }
5052
5053                   vect_phi_res = PHI_RESULT (vect_phi);
5054
5055                   /* Replace the use, i.e., set the correct vs1 in the regular
5056                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5057                      loop is redundant.  */
5058                   use = reduction_phi;
5059                   for (j = 0; j < ncopies; j++)
5060                     {
5061                       edge pr_edge = loop_preheader_edge (loop);
5062                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5063                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5064                     }
5065                 }
5066             }
5067         }
5068
5069       phis.release ();
5070       if (nested_in_vect_loop)
5071         {
5072           if (double_reduc)
5073             loop = outer_loop;
5074           else
5075             continue;
5076         }
5077
5078       phis.create (3);
5079       /* Find the loop-closed-use at the loop exit of the original scalar
5080          result.  (The reduction result is expected to have two immediate uses,
5081          one at the latch block, and one at the loop exit).  For double
5082          reductions we are looking for exit phis of the outer loop.  */
5083       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5084         {
5085           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5086             {
5087               if (!is_gimple_debug (USE_STMT (use_p)))
5088                 phis.safe_push (USE_STMT (use_p));
5089             }
5090           else
5091             {
5092               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5093                 {
5094                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5095
5096                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5097                     {
5098                       if (!flow_bb_inside_loop_p (loop,
5099                                              gimple_bb (USE_STMT (phi_use_p)))
5100                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5101                         phis.safe_push (USE_STMT (phi_use_p));
5102                     }
5103                 }
5104             }
5105         }
5106
5107       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5108         {
5109           /* Replace the uses:  */
5110           orig_name = PHI_RESULT (exit_phi);
5111           scalar_result = scalar_results[k];
5112           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5113             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5114               SET_USE (use_p, scalar_result);
5115         }
5116
5117       phis.release ();
5118     }
5119 }
5120
5121
5122 /* Function is_nonwrapping_integer_induction.
5123
5124    Return true if the PHI STMT of LOOP has an INTEGER_CST base and a
5125    non-negative INTEGER_CST step, and is known not to wrap in its type.  */
5126
5127 static bool
5128 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5129 {
5130   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5131   tree base = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
5132   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5133   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5134   widest_int ni, max_loop_value;
5135   bool overflow = false;
5136
5137   /* The PHI's preheader value and the step must be integer constants.  */
5138   if (TREE_CODE (base) != INTEGER_CST || TREE_CODE (step) != INTEGER_CST)
5139     return false;
5140
5141   /* Reject decrementing inductions (negative step).  */
5142   if (tree_int_cst_sgn (step) == -1)
5143     return false;
5144
5145   /* Where overflow is undefined for LHS_TYPE, assume no wrapping.  */
5146   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5147     return true;
5148
5149   /* Otherwise require BASE + STEP * NI to fit in LHS_TYPE, where NI is
5150      the bound on executions provided by max_stmt_executions.  */
5151   if (! max_stmt_executions (loop, &ni))
5152     return false;
5153
5154   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5155                             &overflow);
5156   if (overflow)
5157     return false;
5158
5159   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5160                             TYPE_SIGN (lhs_type), &overflow);
5161   if (overflow)
5162     return false;
5163
5164   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5165           <= TYPE_PRECISION (lhs_type));
5166 }
5167
5168 /* Function vectorizable_reduction.
5169
5170    Check if STMT performs a reduction operation that can be vectorized.
5171    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5172    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5173    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5174
5175    This function also handles reduction idioms (patterns) that have been
5176    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5177    of this form:
5178      X = pattern_expr (arg0, arg1, ..., X)
5179    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5180    sequence that had been detected and replaced by the pattern-stmt (STMT).
5181
5182    This function also handles reduction of condition expressions, for example:
5183      for (int i = 0; i < N; i++)
5184        if (a[i] < value)
5185          last = a[i];
5186    This is handled by vectorising the loop and creating an additional vector
5187    containing the loop indexes for which "a[i] < value" was true.  In the
5188    function epilogue this is reduced to a single max value and then used to
5189    index into the vector of results.
5190
5191    In some cases of reduction patterns, the type of the reduction variable X is
5192    different than the type of the other arguments of STMT.
5193    In such cases, the vectype that is used when transforming STMT into a vector
5194    stmt is different than the vectype that is used to determine the
5195    vectorization factor, because it consists of a different number of elements
5196    than the actual number of elements that are being operated upon in parallel.
5197
5198    For example, consider an accumulation of shorts into an int accumulator.
5199    On some targets it's possible to vectorize this pattern operating on 8
5200    shorts at a time (hence, the vectype for purposes of determining the
5201    vectorization factor should be V8HI); on the other hand, the vectype that
5202    is used to create the vector form is actually V4SI (the type of the result).
5203
5204    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5205    indicates what is the actual level of parallelism (V8HI in the example), so
5206    that the right vectorization factor would be derived.  This vectype
5207    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5208    be used to create the vectorized stmt.  The right vectype for the vectorized
5209    stmt is obtained from the type of the result X:
5210         get_vectype_for_scalar_type (TREE_TYPE (X))
5211
5212    This means that, contrary to "regular" reductions (or "regular" stmts in
5213    general), the following equation:
5214       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5215    does *NOT* necessarily hold for reduction patterns.  */
5216
5217 bool
5218 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5219                         gimple **vec_stmt, slp_tree slp_node)
5220 {
5221   tree vec_dest;
5222   tree scalar_dest;
5223   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5224   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5225   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5226   tree vectype_in = NULL_TREE;
5227   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5228   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5229   enum tree_code code, orig_code, epilog_reduc_code;
5230   machine_mode vec_mode;
5231   int op_type;
5232   optab optab, reduc_optab;
5233   tree new_temp = NULL_TREE;
5234   gimple *def_stmt;
5235   enum vect_def_type dt;
5236   gphi *new_phi = NULL;
5237   tree scalar_type;
5238   bool is_simple_use;
5239   gimple *orig_stmt;
5240   stmt_vec_info orig_stmt_info;
5241   tree expr = NULL_TREE;
5242   int i;
5243   int ncopies;
5244   int epilog_copies;
5245   stmt_vec_info prev_stmt_info, prev_phi_info;
5246   bool single_defuse_cycle = false;
5247   tree reduc_def = NULL_TREE;
5248   gimple *new_stmt = NULL;
5249   int j;
5250   tree ops[3];
5251   bool nested_cycle = false, found_nested_cycle_def = false;
5252   gimple *reduc_def_stmt = NULL;
5253   bool double_reduc = false, dummy;
5254   basic_block def_bb;
5255   struct loop * def_stmt_loop, *outer_loop = NULL;
5256   tree def_arg;
5257   gimple *def_arg_stmt;
5258   auto_vec<tree> vec_oprnds0;
5259   auto_vec<tree> vec_oprnds1;
5260   auto_vec<tree> vect_defs;
5261   auto_vec<gimple *> phis;
5262   int vec_num;
5263   tree def0, def1, tem, op0, op1 = NULL_TREE;
5264   bool first_p = true;
5265   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5266   bool cond_expr_is_nonwrapping_integer_induction = false;
5267
5268   /* In case of reduction chain we switch to the first stmt in the chain, but
5269      we don't update STMT_INFO, since only the last stmt is marked as reduction
5270      and has reduction properties.  */
5271   if (GROUP_FIRST_ELEMENT (stmt_info)
5272       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5273     {
5274       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5275       first_p = false;
5276     }
5277
5278   if (nested_in_vect_loop_p (loop, stmt))
5279     {
5280       outer_loop = loop;
5281       loop = loop->inner;
5282       nested_cycle = true;
5283     }
5284
5285   /* 1. Is vectorizable reduction?  */
5286   /* Not supportable if the reduction variable is used in the loop, unless
5287      it's a reduction chain.  */
5288   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5289       && !GROUP_FIRST_ELEMENT (stmt_info))
5290     return false;
5291
5292   /* Reductions that are not used even in an enclosing outer-loop,
5293      are expected to be "live" (used out of the loop).  */
5294   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5295       && !STMT_VINFO_LIVE_P (stmt_info))
5296     return false;
5297
5298   /* Make sure it was already recognized as a reduction computation.  */
5299   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5300       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5301     return false;
5302
5303   /* 2. Has this been recognized as a reduction pattern?
5304
5305      Check if STMT represents a pattern that has been recognized
5306      in earlier analysis stages.  For stmts that represent a pattern,
5307      the STMT_VINFO_RELATED_STMT field records the last stmt in
5308      the original sequence that constitutes the pattern.  */
5309
5310   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5311   if (orig_stmt)
5312     {
5313       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5314       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5315       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5316     }
5317
5318   /* 3. Check the operands of the operation.  The first operands are defined
5319         inside the loop body. The last operand is the reduction variable,
5320         which is defined by the loop-header-phi.  */
5321
5322   gcc_assert (is_gimple_assign (stmt));
5323
5324   /* Flatten RHS.  */
5325   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5326     {
5327     case GIMPLE_SINGLE_RHS:
5328       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5329       if (op_type == ternary_op)
5330         {
5331           tree rhs = gimple_assign_rhs1 (stmt);
5332           ops[0] = TREE_OPERAND (rhs, 0);
5333           ops[1] = TREE_OPERAND (rhs, 1);
5334           ops[2] = TREE_OPERAND (rhs, 2);
5335           code = TREE_CODE (rhs);
5336         }
5337       else
5338         return false;
5339       break;
5340
5341     case GIMPLE_BINARY_RHS:
5342       code = gimple_assign_rhs_code (stmt);
5343       op_type = TREE_CODE_LENGTH (code);
5344       gcc_assert (op_type == binary_op);
5345       ops[0] = gimple_assign_rhs1 (stmt);
5346       ops[1] = gimple_assign_rhs2 (stmt);
5347       break;
5348
5349     case GIMPLE_TERNARY_RHS:
5350       code = gimple_assign_rhs_code (stmt);
5351       op_type = TREE_CODE_LENGTH (code);
5352       gcc_assert (op_type == ternary_op);
5353       ops[0] = gimple_assign_rhs1 (stmt);
5354       ops[1] = gimple_assign_rhs2 (stmt);
5355       ops[2] = gimple_assign_rhs3 (stmt);
5356       break;
5357
5358     case GIMPLE_UNARY_RHS:
5359       return false;
5360
5361     default:
5362       gcc_unreachable ();
5363     }
5364   /* The default is that the reduction variable is the last in statement.  */
5365   int reduc_index = op_type - 1;
5366   if (code == MINUS_EXPR)
5367     reduc_index = 0;
5368
5369   if (code == COND_EXPR && slp_node)
5370     return false;
5371
5372   scalar_dest = gimple_assign_lhs (stmt);
5373   scalar_type = TREE_TYPE (scalar_dest);
5374   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5375       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5376     return false;
5377
5378   /* Do not try to vectorize bit-precision reductions.  */
5379   if ((TYPE_PRECISION (scalar_type)
5380        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5381     return false;
5382
5383   /* All uses but the last are expected to be defined in the loop.
5384      The last use is the reduction variable.  In case of nested cycle this
5385      assumption is not true: we use reduc_index to record the index of the
5386      reduction variable.  */
5387   for (i = 0; i < op_type; i++)
5388     {
5389       if (i == reduc_index)
5390         continue;
5391
5392       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5393       if (i == 0 && code == COND_EXPR)
5394         continue;
5395
5396       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5397                                           &def_stmt, &dt, &tem);
5398       if (!vectype_in)
5399         vectype_in = tem;
5400       gcc_assert (is_simple_use);
5401
5402       if (dt != vect_internal_def
5403           && dt != vect_external_def
5404           && dt != vect_constant_def
5405           && dt != vect_induction_def
5406           && !(dt == vect_nested_cycle && nested_cycle))
5407         return false;
5408
5409       if (dt == vect_nested_cycle)
5410         {
5411           found_nested_cycle_def = true;
5412           reduc_def_stmt = def_stmt;
5413           reduc_index = i;
5414         }
5415
5416       if (i == 1 && code == COND_EXPR && dt == vect_induction_def
5417           && is_nonwrapping_integer_induction (def_stmt, loop))
5418         {
5419           if (dump_enabled_p ())
5420             dump_printf_loc (MSG_NOTE, vect_location,
5421                              "condition expression based on integer "
5422                              "induction.\n");
5423           cond_expr_is_nonwrapping_integer_induction = true;
5424         }
5425     }
5426
5427   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5428                                       &def_stmt, &dt, &tem);
5429   if (!vectype_in)
5430     vectype_in = tem;
5431   gcc_assert (is_simple_use);
5432   if (!found_nested_cycle_def)
5433     reduc_def_stmt = def_stmt;
5434
5435   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5436     return false;
5437
5438   if (!(dt == vect_reduction_def
5439         || dt == vect_nested_cycle
5440         || ((dt == vect_internal_def || dt == vect_external_def
5441              || dt == vect_constant_def || dt == vect_induction_def)
5442             && nested_cycle && found_nested_cycle_def)))
5443     {
5444       /* For pattern recognized stmts, orig_stmt might be a reduction,
5445          but some helper statements for the pattern might not, or
5446          might be COND_EXPRs with reduction uses in the condition.  */
5447       gcc_assert (orig_stmt);
5448       return false;
5449     }
5450
5451   gimple *tmp = vect_is_simple_reduction
5452                   (loop_vinfo, reduc_def_stmt,
5453                   !nested_cycle, &dummy, false,
5454                   &STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info));
5455
5456   if (cond_expr_is_nonwrapping_integer_induction
5457       && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5458     STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5459
5460   if (orig_stmt)
5461     gcc_assert (tmp == orig_stmt
5462                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5463   else
5464     /* We changed STMT to be the first stmt in reduction chain, hence we
5465        check that in this case the first element in the chain is STMT.  */
5466     gcc_assert (stmt == tmp
5467                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5468
5469   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5470     return false;
5471
5472   if (slp_node || PURE_SLP_STMT (stmt_info))
5473     ncopies = 1;
5474   else
5475     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5476                / TYPE_VECTOR_SUBPARTS (vectype_in));
5477
5478   gcc_assert (ncopies >= 1);
5479
5480   vec_mode = TYPE_MODE (vectype_in);
5481
5482   if (code == COND_EXPR)
5483     {
5484       /* Only call during the analysis stage, otherwise we'll lose
5485          STMT_VINFO_TYPE.  */
5486       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5487                                                 ops[reduc_index], 0, NULL))
5488         {
5489           if (dump_enabled_p ())
5490             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5491                              "unsupported condition in reduction\n");
5492           return false;
5493         }
5494     }
5495   else
5496     {
5497       /* 4. Supportable by target?  */
5498
5499       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5500           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5501         {
5502           /* Shifts and rotates are only supported by vectorizable_shifts,
5503              not vectorizable_reduction.  */
5504           if (dump_enabled_p ())
5505             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5506                              "unsupported shift or rotation.\n");
5507           return false;
5508         }
5509
5510       /* 4.1. check support for the operation in the loop  */
5511       optab = optab_for_tree_code (code, vectype_in, optab_default);
5512       if (!optab)
5513         {
5514           if (dump_enabled_p ())
5515             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5516                              "no optab.\n");
5517
5518           return false;
5519         }
5520
5521       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5522         {
5523           if (dump_enabled_p ())
5524             dump_printf (MSG_NOTE, "op not supported by target.\n");
5525
5526           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5527               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5528                   < vect_min_worthwhile_factor (code))
5529             return false;
5530
5531           if (dump_enabled_p ())
5532             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5533         }
5534
5535       /* Worthwhile without SIMD support?  */
5536       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5537           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5538              < vect_min_worthwhile_factor (code))
5539         {
5540           if (dump_enabled_p ())
5541             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5542                              "not worthwhile without SIMD support.\n");
5543
5544           return false;
5545         }
5546     }
5547
5548   /* 4.2. Check support for the epilog operation.
5549
5550           If STMT represents a reduction pattern, then the type of the
5551           reduction variable may be different than the type of the rest
5552           of the arguments.  For example, consider the case of accumulation
5553           of shorts into an int accumulator; The original code:
5554                         S1: int_a = (int) short_a;
5555           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5556
5557           was replaced with:
5558                         STMT: int_acc = widen_sum <short_a, int_acc>
5559
5560           This means that:
5561           1. The tree-code that is used to create the vector operation in the
5562              epilog code (that reduces the partial results) is not the
5563              tree-code of STMT, but is rather the tree-code of the original
5564              stmt from the pattern that STMT is replacing.  I.e, in the example
5565              above we want to use 'widen_sum' in the loop, but 'plus' in the
5566              epilog.
5567           2. The type (mode) we use to check available target support
5568              for the vector operation to be created in the *epilog*, is
5569              determined by the type of the reduction variable (in the example
5570              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5571              However the type (mode) we use to check available target support
5572              for the vector operation to be created *inside the loop*, is
5573              determined by the type of the other arguments to STMT (in the
5574              example we'd check this: optab_handler (widen_sum_optab,
5575              vect_short_mode)).
5576
5577           This is contrary to "regular" reductions, in which the types of all
5578           the arguments are the same as the type of the reduction variable.
5579           For "regular" reductions we can therefore use the same vector type
5580           (and also the same tree-code) when generating the epilog code and
5581           when generating the code inside the loop.  */
5582
5583   if (orig_stmt)
5584     {
5585       /* This is a reduction pattern: get the vectype from the type of the
5586          reduction variable, and get the tree-code from orig_stmt.  */
5587       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5588                   == TREE_CODE_REDUCTION);
5589       orig_code = gimple_assign_rhs_code (orig_stmt);
5590       gcc_assert (vectype_out);
5591       vec_mode = TYPE_MODE (vectype_out);
5592     }
5593   else
5594     {
5595       /* Regular reduction: use the same vectype and tree-code as used for
5596          the vector code inside the loop can be used for the epilog code. */
5597       orig_code = code;
5598
5599       if (code == MINUS_EXPR)
5600         orig_code = PLUS_EXPR;
5601
5602       /* For simple condition reductions, replace with the actual expression
5603          we want to base our reduction around.  */
5604       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5605           == INTEGER_INDUC_COND_REDUCTION)
5606         orig_code = MAX_EXPR;
5607     }
5608
5609   if (nested_cycle)
5610     {
5611       def_bb = gimple_bb (reduc_def_stmt);
5612       def_stmt_loop = def_bb->loop_father;
5613       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5614                                        loop_preheader_edge (def_stmt_loop));
5615       if (TREE_CODE (def_arg) == SSA_NAME
5616           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5617           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5618           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5619           && vinfo_for_stmt (def_arg_stmt)
5620           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5621               == vect_double_reduction_def)
5622         double_reduc = true;
5623     }
5624
5625   epilog_reduc_code = ERROR_MARK;
5626
5627   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5628       || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5629                 == INTEGER_INDUC_COND_REDUCTION)
5630     {
5631       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5632         {
5633           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5634                                          optab_default);
5635           if (!reduc_optab)
5636             {
5637               if (dump_enabled_p ())
5638                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5639                                  "no optab for reduction.\n");
5640
5641               epilog_reduc_code = ERROR_MARK;
5642             }
5643           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5644             {
5645               optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5646               if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5647                 {
5648                   if (dump_enabled_p ())
5649                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5650                                      "reduc op not supported by target.\n");
5651
5652                   epilog_reduc_code = ERROR_MARK;
5653                 }
5654             }
5655
5656           /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5657              generated in the epilog using multiple expressions.  This does not
5658              work for condition reductions.  */
5659           if (epilog_reduc_code == ERROR_MARK
5660               && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5661                         == INTEGER_INDUC_COND_REDUCTION)
5662             {
5663               if (dump_enabled_p ())
5664                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665                                  "no reduc code for scalar code.\n");
5666               return false;
5667             }
5668         }
5669       else
5670         {
5671           if (!nested_cycle || double_reduc)
5672             {
5673               if (dump_enabled_p ())
5674                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5675                                  "no reduc code for scalar code.\n");
5676
5677               return false;
5678             }
5679         }
5680     }
5681   else
5682     {
5683       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5684       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5685       cr_index_vector_type = build_vector_type
5686         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5687
5688       epilog_reduc_code = REDUC_MAX_EXPR;
5689       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5690                                    optab_default);
5691       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5692           == CODE_FOR_nothing)
5693         {
5694           if (dump_enabled_p ())
5695             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696                              "reduc max op not supported by target.\n");
5697           return false;
5698         }
5699     }
5700
5701   if ((double_reduc
5702        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5703        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5704                 == INTEGER_INDUC_COND_REDUCTION)
5705       && ncopies > 1)
5706     {
5707       if (dump_enabled_p ())
5708         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5709                          "multiple types in double reduction or condition "
5710                          "reduction.\n");
5711       return false;
5712     }
5713
5714   /* In case of widening multiplication by a constant, we update the type
5715      of the constant to be the type of the other operand.  We check that the
5716      constant fits the type in the pattern recognition pass.  */
5717   if (code == DOT_PROD_EXPR
5718       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5719     {
5720       if (TREE_CODE (ops[0]) == INTEGER_CST)
5721         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5722       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5723         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5724       else
5725         {
5726           if (dump_enabled_p ())
5727             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728                              "invalid types in dot-prod\n");
5729
5730           return false;
5731         }
5732     }
5733
5734   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5735     {
5736       widest_int ni;
5737
5738       if (! max_loop_iterations (loop, &ni))
5739         {
5740           if (dump_enabled_p ())
5741             dump_printf_loc (MSG_NOTE, vect_location,
5742                              "loop count not known, cannot create cond "
5743                              "reduction.\n");
5744           return false;
5745         }
5746       /* Convert backedges to iterations.  */
5747       ni += 1;
5748
5749       /* The additional index will be the same type as the condition.  Check
5750          that the loop can fit into this less one (because we'll use up the
5751          zero slot for when there are no matches).  */
5752       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5753       if (wi::geu_p (ni, wi::to_widest (max_index)))
5754         {
5755           if (dump_enabled_p ())
5756             dump_printf_loc (MSG_NOTE, vect_location,
5757                              "loop size is greater than data size.\n");
5758           return false;
5759         }
5760     }
5761
5762   if (!vec_stmt) /* transformation not required.  */
5763     {
5764       if (first_p
5765           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5766                                          reduc_index))
5767         return false;
5768       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5769       return true;
5770     }
5771
5772   /** Transform.  **/
5773
5774   if (dump_enabled_p ())
5775     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5776
5777   /* FORNOW: Multiple types are not supported for condition.  */
5778   if (code == COND_EXPR)
5779     gcc_assert (ncopies == 1);
5780
5781   /* Create the destination vector  */
5782   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5783
5784   /* In case the vectorization factor (VF) is bigger than the number
5785      of elements that we can fit in a vectype (nunits), we have to generate
5786      more than one vector stmt - i.e - we need to "unroll" the
5787      vector stmt by a factor VF/nunits.  For more details see documentation
5788      in vectorizable_operation.  */
5789
5790   /* If the reduction is used in an outer loop we need to generate
5791      VF intermediate results, like so (e.g. for ncopies=2):
5792         r0 = phi (init, r0)
5793         r1 = phi (init, r1)
5794         r0 = x0 + r0;
5795         r1 = x1 + r1;
5796     (i.e. we generate VF results in 2 registers).
5797     In this case we have a separate def-use cycle for each copy, and therefore
5798     for each copy we get the vector def for the reduction variable from the
5799     respective phi node created for this copy.
5800
5801     Otherwise (the reduction is unused in the loop nest), we can combine
5802     together intermediate results, like so (e.g. for ncopies=2):
5803         r = phi (init, r)
5804         r = x0 + r;
5805         r = x1 + r;
5806    (i.e. we generate VF/2 results in a single register).
5807    In this case for each copy we get the vector def for the reduction variable
5808    from the vectorized reduction operation generated in the previous iteration.
5809   */
5810
5811   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5812     {
5813       single_defuse_cycle = true;
5814       epilog_copies = 1;
5815     }
5816   else
5817     epilog_copies = ncopies;
5818
5819   prev_stmt_info = NULL;
5820   prev_phi_info = NULL;
5821   if (slp_node)
5822     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5823   else
5824     {
5825       vec_num = 1;
5826       vec_oprnds0.create (1);
5827       if (op_type == ternary_op)
5828         vec_oprnds1.create (1);
5829     }
5830
5831   phis.create (vec_num);
5832   vect_defs.create (vec_num);
5833   if (!slp_node)
5834     vect_defs.quick_push (NULL_TREE);
5835
5836   for (j = 0; j < ncopies; j++)
5837     {
5838       if (j == 0 || !single_defuse_cycle)
5839         {
5840           for (i = 0; i < vec_num; i++)
5841             {
5842               /* Create the reduction-phi that defines the reduction
5843                  operand.  */
5844               new_phi = create_phi_node (vec_dest, loop->header);
5845               set_vinfo_for_stmt (new_phi,
5846                                   new_stmt_vec_info (new_phi, loop_vinfo));
5847                if (j == 0 || slp_node)
5848                  phis.quick_push (new_phi);
5849             }
5850         }
5851
5852       if (code == COND_EXPR)
5853         {
5854           gcc_assert (!slp_node);
5855           vectorizable_condition (stmt, gsi, vec_stmt,
5856                                   PHI_RESULT (phis[0]),
5857                                   reduc_index, NULL);
5858           /* Multiple types are not supported for condition.  */
5859           break;
5860         }
5861
5862       /* Handle uses.  */
5863       if (j == 0)
5864         {
5865           op0 = ops[!reduc_index];
5866           if (op_type == ternary_op)
5867             {
5868               if (reduc_index == 0)
5869                 op1 = ops[2];
5870               else
5871                 op1 = ops[1];
5872             }
5873
5874           if (slp_node)
5875             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5876                                slp_node, -1);
5877           else
5878             {
5879               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5880                                                             stmt);
5881               vec_oprnds0.quick_push (loop_vec_def0);
5882               if (op_type == ternary_op)
5883                {
5884                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5885                  vec_oprnds1.quick_push (loop_vec_def1);
5886                }
5887             }
5888         }
5889       else
5890         {
5891           if (!slp_node)
5892             {
5893               enum vect_def_type dt;
5894               gimple *dummy_stmt;
5895
5896               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5897                                   &dummy_stmt, &dt);
5898               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5899                                                               loop_vec_def0);
5900               vec_oprnds0[0] = loop_vec_def0;
5901               if (op_type == ternary_op)
5902                 {
5903                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5904                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5905                                                                 loop_vec_def1);
5906                   vec_oprnds1[0] = loop_vec_def1;
5907                 }
5908             }
5909
5910           if (single_defuse_cycle)
5911             reduc_def = gimple_assign_lhs (new_stmt);
5912
5913           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5914         }
5915
5916       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5917         {
5918           if (slp_node)
5919             reduc_def = PHI_RESULT (phis[i]);
5920           else
5921             {
5922               if (!single_defuse_cycle || j == 0)
5923                 reduc_def = PHI_RESULT (new_phi);
5924             }
5925
5926           def1 = ((op_type == ternary_op)
5927                   ? vec_oprnds1[i] : NULL);
5928           if (op_type == binary_op)
5929             {
5930               if (reduc_index == 0)
5931                 expr = build2 (code, vectype_out, reduc_def, def0);
5932               else
5933                 expr = build2 (code, vectype_out, def0, reduc_def);
5934             }
5935           else
5936             {
5937               if (reduc_index == 0)
5938                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5939               else
5940                 {
5941                   if (reduc_index == 1)
5942                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5943                   else
5944                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5945                 }
5946             }
5947
5948           new_stmt = gimple_build_assign (vec_dest, expr);
5949           new_temp = make_ssa_name (vec_dest, new_stmt);
5950           gimple_assign_set_lhs (new_stmt, new_temp);
5951           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5952
5953           if (slp_node)
5954             {
5955               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5956               vect_defs.quick_push (new_temp);
5957             }
5958           else
5959             vect_defs[0] = new_temp;
5960         }
5961
5962       if (slp_node)
5963         continue;
5964
5965       if (j == 0)
5966         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5967       else
5968         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5969
5970       prev_stmt_info = vinfo_for_stmt (new_stmt);
5971       prev_phi_info = vinfo_for_stmt (new_phi);
5972     }
5973
5974   tree indx_before_incr, indx_after_incr, cond_name = NULL;
5975
5976   /* Finalize the reduction-phi (set its arguments) and create the
5977      epilog reduction code.  */
5978   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5979     {
5980       new_temp = gimple_assign_lhs (*vec_stmt);
5981       vect_defs[0] = new_temp;
5982
5983       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5984          which is updated with the current index of the loop for every match of
5985          the original loop's cond_expr (VEC_STMT).  This results in a vector
5986          containing the last time the condition passed for that vector lane.
5987          The first match will be a 1 to allow 0 to be used for non-matching
5988          indexes.  If there are no matches at all then the vector will be all
5989          zeroes.  */
5990       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5991         {
5992           int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5993           int k;
5994
5995           gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
5996
5997           /* First we create a simple vector induction variable which starts
5998              with the values {1,2,3,...} (SERIES_VECT) and increments by the
5999              vector size (STEP).  */
6000
6001           /* Create a {1,2,3,...} vector.  */
6002           tree *vtemp = XALLOCAVEC (tree, nunits_out);
6003           for (k = 0; k < nunits_out; ++k)
6004             vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6005           tree series_vect = build_vector (cr_index_vector_type, vtemp);
6006
6007           /* Create a vector of the step value.  */
6008           tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6009           tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6010
6011           /* Create an induction variable.  */
6012           gimple_stmt_iterator incr_gsi;
6013           bool insert_after;
6014           standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6015           create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6016                      insert_after, &indx_before_incr, &indx_after_incr);
6017
6018           /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6019              filled with zeros (VEC_ZERO).  */
6020
6021           /* Create a vector of 0s.  */
6022           tree zero = build_zero_cst (cr_index_scalar_type);
6023           tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6024
6025           /* Create a vector phi node.  */
6026           tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6027           new_phi = create_phi_node (new_phi_tree, loop->header);
6028           set_vinfo_for_stmt (new_phi,
6029                               new_stmt_vec_info (new_phi, loop_vinfo));
6030           add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6031                        UNKNOWN_LOCATION);
6032
6033           /* Now take the condition from the loops original cond_expr
6034              (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6035              every match uses values from the induction variable
6036              (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6037              (NEW_PHI_TREE).
6038              Finally, we update the phi (NEW_PHI_TREE) to take the value of
6039              the new cond_expr (INDEX_COND_EXPR).  */
6040
6041           /* Turn the condition from vec_stmt into an ssa name.  */
6042           gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6043           tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6044           tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6045           gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6046                                                        ccompare);
6047           gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6048           gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6049           update_stmt (*vec_stmt);
6050
6051           /* Create a conditional, where the condition is taken from vec_stmt
6052              (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
6053              and else is the phi (NEW_PHI_TREE).  */
6054           tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6055                                          ccompare_name, indx_before_incr,
6056                                          new_phi_tree);
6057           cond_name = make_ssa_name (cr_index_vector_type);
6058           gimple *index_condition = gimple_build_assign (cond_name,
6059                                                          index_cond_expr);
6060           gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6061           stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6062                                                             loop_vinfo);
6063           STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6064           set_vinfo_for_stmt (index_condition, index_vec_info);
6065
6066           /* Update the phi with the vec cond.  */
6067           add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6068                        UNKNOWN_LOCATION);
6069         }
6070     }
6071
6072   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6073                                     epilog_reduc_code, phis, reduc_index,
6074                                     double_reduc, slp_node, cond_name);
6075
6076   return true;
6077 }
6078
6079 /* Function vect_min_worthwhile_factor.
6080
6081    For a loop where we could vectorize the operation indicated by CODE,
6082    return the minimum vectorization factor that makes it worthwhile
6083    to use generic vectors.  */
6084 int
6085 vect_min_worthwhile_factor (enum tree_code code)
6086 {
6087   switch (code)
6088     {
6089     case PLUS_EXPR:
6090     case MINUS_EXPR:
6091     case NEGATE_EXPR:
6092       return 4;
6093
6094     case BIT_AND_EXPR:
6095     case BIT_IOR_EXPR:
6096     case BIT_XOR_EXPR:
6097     case BIT_NOT_EXPR:
6098       return 2;
6099
6100     default:
6101       return INT_MAX;
6102     }
6103 }
6104
6105
6106 /* Function vectorizable_induction
6107
6108    Check if PHI performs an induction computation that can be vectorized.
6109    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6110    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6111    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6112
6113 bool
6114 vectorizable_induction (gimple *phi,
6115                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6116                         gimple **vec_stmt)
6117 {
6118   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6119   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6120   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6121   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6122   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6123   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6124   tree vec_def;
6125
6126   gcc_assert (ncopies >= 1);
6127   /* FORNOW. These restrictions should be relaxed.  */
6128   if (nested_in_vect_loop_p (loop, phi))
6129     {
6130       imm_use_iterator imm_iter;
6131       use_operand_p use_p;
6132       gimple *exit_phi;
6133       edge latch_e;
6134       tree loop_arg;
6135
6136       if (ncopies > 1)
6137         {
6138           if (dump_enabled_p ())
6139             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6140                              "multiple types in nested loop.\n");
6141           return false;
6142         }
6143
6144       exit_phi = NULL;
6145       latch_e = loop_latch_edge (loop->inner);
6146       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6147       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6148         {
6149           gimple *use_stmt = USE_STMT (use_p);
6150           if (is_gimple_debug (use_stmt))
6151             continue;
6152
6153           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6154             {
6155               exit_phi = use_stmt;
6156               break;
6157             }
6158         }
6159       if (exit_phi)
6160         {
6161           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6162           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6163                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6164             {
6165               if (dump_enabled_p ())
6166                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6167                                  "inner-loop induction only used outside "
6168                                  "of the outer vectorized loop.\n");
6169               return false;
6170             }
6171         }
6172     }
6173
6174   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6175     return false;
6176
6177   /* FORNOW: SLP not supported.  */
6178   if (STMT_SLP_TYPE (stmt_info))
6179     return false;
6180
6181   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6182
6183   if (gimple_code (phi) != GIMPLE_PHI)
6184     return false;
6185
6186   if (!vec_stmt) /* transformation not required.  */
6187     {
6188       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6189       if (dump_enabled_p ())
6190         dump_printf_loc (MSG_NOTE, vect_location,
6191                          "=== vectorizable_induction ===\n");
6192       vect_model_induction_cost (stmt_info, ncopies);
6193       return true;
6194     }
6195
6196   /** Transform.  **/
6197
6198   if (dump_enabled_p ())
6199     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6200
6201   vec_def = get_initial_def_for_induction (phi);
6202   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6203   return true;
6204 }
6205
6206 /* Function vectorizable_live_operation.
6207
6208    STMT computes a value that is used outside the loop.  Check if
6209    it can be supported.  */
6210
6211 bool
6212 vectorizable_live_operation (gimple *stmt,
6213                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6214                              gimple **vec_stmt)
6215 {
6216   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6217   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6218   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6219   tree op;
6220   gimple *def_stmt;
6221   ssa_op_iter iter;
6222
6223   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6224
6225   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6226     return false;
6227
6228   if (!is_gimple_assign (stmt))
6229     {
6230       if (gimple_call_internal_p (stmt)
6231           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6232           && gimple_call_lhs (stmt)
6233           && loop->simduid
6234           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6235           && loop->simduid
6236              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6237         {
6238           edge e = single_exit (loop);
6239           basic_block merge_bb = e->dest;
6240           imm_use_iterator imm_iter;
6241           use_operand_p use_p;
6242           tree lhs = gimple_call_lhs (stmt);
6243
6244           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6245             {
6246               gimple *use_stmt = USE_STMT (use_p);
6247               if (gimple_code (use_stmt) == GIMPLE_PHI
6248                   && gimple_bb (use_stmt) == merge_bb)
6249                 {
6250                   if (vec_stmt)
6251                     {
6252                       tree vfm1
6253                         = build_int_cst (unsigned_type_node,
6254                                          loop_vinfo->vectorization_factor - 1);
6255                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6256                     }
6257                   return true;
6258                 }
6259             }
6260         }
6261
6262       return false;
6263     }
6264
6265   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6266     return false;
6267
6268   /* FORNOW. CHECKME. */
6269   if (nested_in_vect_loop_p (loop, stmt))
6270     return false;
6271
6272   /* FORNOW: support only if all uses are invariant.  This means
6273      that the scalar operations can remain in place, unvectorized.
6274      The original last scalar value that they compute will be used.  */
6275   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6276     {
6277       enum vect_def_type dt = vect_uninitialized_def;
6278
6279       if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6280         {
6281           if (dump_enabled_p ())
6282             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283                              "use not simple.\n");
6284           return false;
6285         }
6286
6287       if (dt != vect_external_def && dt != vect_constant_def)
6288         return false;
6289     }
6290
6291   /* No transformation is required for the cases we currently support.  */
6292   return true;
6293 }
6294
6295 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6296
6297 static void
6298 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6299 {
6300   ssa_op_iter op_iter;
6301   imm_use_iterator imm_iter;
6302   def_operand_p def_p;
6303   gimple *ustmt;
6304
6305   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6306     {
6307       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6308         {
6309           basic_block bb;
6310
6311           if (!is_gimple_debug (ustmt))
6312             continue;
6313
6314           bb = gimple_bb (ustmt);
6315
6316           if (!flow_bb_inside_loop_p (loop, bb))
6317             {
6318               if (gimple_debug_bind_p (ustmt))
6319                 {
6320                   if (dump_enabled_p ())
6321                     dump_printf_loc (MSG_NOTE, vect_location,
6322                                      "killing debug use\n");
6323
6324                   gimple_debug_bind_reset_value (ustmt);
6325                   update_stmt (ustmt);
6326                 }
6327               else
6328                 gcc_unreachable ();
6329             }
6330         }
6331     }
6332 }
6333
6334
6335 /* This function builds ni_name = number of iterations.  Statements
6336    are emitted on the loop preheader edge.  */
6337
6338 static tree
6339 vect_build_loop_niters (loop_vec_info loop_vinfo)
6340 {
6341   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6342   if (TREE_CODE (ni) == INTEGER_CST)
6343     return ni;
6344   else
6345     {
6346       tree ni_name, var;
6347       gimple_seq stmts = NULL;
6348       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6349
6350       var = create_tmp_var (TREE_TYPE (ni), "niters");
6351       ni_name = force_gimple_operand (ni, &stmts, false, var);
6352       if (stmts)
6353         gsi_insert_seq_on_edge_immediate (pe, stmts);
6354
6355       return ni_name;
6356     }
6357 }
6358
6359
6360 /* This function generates the following statements:
6361
6362    ni_name = number of iterations loop executes
6363    ratio = ni_name / vf
6364    ratio_mult_vf_name = ratio * vf
6365
6366    and places them on the loop preheader edge.  */
6367
6368 static void
6369 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6370                                  tree ni_name,
6371                                  tree *ratio_mult_vf_name_ptr,
6372                                  tree *ratio_name_ptr)
6373 {
6374   tree ni_minus_gap_name;
6375   tree var;
6376   tree ratio_name;
6377   tree ratio_mult_vf_name;
6378   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6379   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6380   tree log_vf;
6381
6382   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6383
6384   /* If epilogue loop is required because of data accesses with gaps, we
6385      subtract one iteration from the total number of iterations here for
6386      correct calculation of RATIO.  */
6387   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6388     {
6389       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6390                                        ni_name,
6391                                        build_one_cst (TREE_TYPE (ni_name)));
6392       if (!is_gimple_val (ni_minus_gap_name))
6393         {
6394           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6395           gimple *stmts = NULL;
6396           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6397                                                     true, var);
6398           gsi_insert_seq_on_edge_immediate (pe, stmts);
6399         }
6400     }
6401   else
6402     ni_minus_gap_name = ni_name;
6403
6404   /* Create: ratio = ni >> log2(vf) */
6405   /* ???  As we have ni == number of latch executions + 1, ni could
6406      have overflown to zero.  So avoid computing ratio based on ni
6407      but compute it using the fact that we know ratio will be at least
6408      one, thus via (ni - vf) >> log2(vf) + 1.  */
6409   ratio_name
6410     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6411                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6412                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6413                                              ni_minus_gap_name,
6414                                              build_int_cst
6415                                                (TREE_TYPE (ni_name), vf)),
6416                                 log_vf),
6417                    build_int_cst (TREE_TYPE (ni_name), 1));
6418   if (!is_gimple_val (ratio_name))
6419     {
6420       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6421       gimple *stmts = NULL;
6422       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6423       gsi_insert_seq_on_edge_immediate (pe, stmts);
6424     }
6425   *ratio_name_ptr = ratio_name;
6426
6427   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6428
6429   if (ratio_mult_vf_name_ptr)
6430     {
6431       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6432                                         ratio_name, log_vf);
6433       if (!is_gimple_val (ratio_mult_vf_name))
6434         {
6435           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6436           gimple *stmts = NULL;
6437           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6438                                                      true, var);
6439           gsi_insert_seq_on_edge_immediate (pe, stmts);
6440         }
6441       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6442     }
6443
6444   return;
6445 }
6446
6447
6448 /* Function vect_transform_loop.
6449
6450    The analysis phase has determined that the loop is vectorizable.
6451    Vectorize the loop - create vectorized stmts to replace the scalar
6452    stmts in the loop, and update the loop exit condition.

        LOOP_VINFO describes the loop being transformed.  In order, this
        function: versions the loop if versioning for alignment or alias is
        required; peels for alignment if requested; peels for the loop
        bound and/or computes RATIO; transforms the PHIs and statements of
        every basic block of the loop; passes RATIO to
        slpeel_make_loop_iterate_ntimes; and finally scales the profile and
        the recorded iteration bounds by the vectorization factor.  */
6453
6454 void
6455 vect_transform_loop (loop_vec_info loop_vinfo)
6456 {
6457   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6458   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6459   int nbbs = loop->num_nodes;
6460   int i;
6461   tree ratio = NULL;
6462   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6463   bool grouped_store;
6464   bool slp_scheduled = false;
6465   gimple *stmt, *pattern_stmt;
6466   gimple_seq pattern_def_seq = NULL;
6467   gimple_stmt_iterator pattern_def_si = gsi_none ();
6468   bool transform_pattern_stmt = false;
6469   bool check_profitability = false;
6470   int th;
6471   /* Record number of iterations before we started tampering with the profile. */
6472   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6473
6474   if (dump_enabled_p ())
6475     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6476
6477   /* If profile is imprecise, we have chance to fix it up.  */
6478   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6479     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6480
6481   /* Use the more conservative vectorization threshold.  If the number
6482      of iterations is constant assume the cost check has been performed
6483      by our caller.  If the threshold makes all loops profitable that
6484      run at least the vectorization factor number of times checking
6485      is pointless, too.  */
6486   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6487   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6488       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6489     {
6490       if (dump_enabled_p ())
6491         dump_printf_loc (MSG_NOTE, vect_location,
6492                          "Profitability threshold is %d loop iterations.\n",
6493                          th);
6494       check_profitability = true;
6495     }
6496
6497   /* Version the loop first, if required, so the profitability check
6498      comes first.  */
6499
6500   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6501       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6502     {
6503       vect_loop_versioning (loop_vinfo, th, check_profitability);
           /* TH and the flag were handed to vect_loop_versioning; clear the
              flag so the peeling steps below are not asked to check
              profitability again.  */
6504       check_profitability = false;
6505     }
6506
6507   tree ni_name = vect_build_loop_niters (loop_vinfo);
6508   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6509
6510   /* Peel the loop if there are data refs with unknown alignment.
6511      Only one data ref with unknown store is allowed.  */
6512
6513   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6514     {
6515       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6516                                      th, check_profitability);
6517       check_profitability = false;
6518       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6519          be re-computed.  */
6520       ni_name = NULL_TREE;
6521     }
6522
6523   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6524      compile time constant), or it is a constant that doesn't divide by the
6525      vectorization factor, then an epilog loop needs to be created.
6526      We therefore duplicate the loop: the original loop will be vectorized,
6527      and will compute the first (n/VF) iterations.  The second copy of the loop
6528      will remain scalar and will compute the remaining (n%VF) iterations.
6529      (VF is the vectorization factor).  */
6530
6531   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6532       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6533     {
6534       tree ratio_mult_vf;
6535       if (!ni_name)
6536         ni_name = vect_build_loop_niters (loop_vinfo);
6537       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6538                                        &ratio);
6539       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6540                                       th, check_profitability);
6541     }
       /* No peeling for the loop bound and a constant iteration count:
          RATIO is that constant divided by the vectorization factor.  */
6542   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6543     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6544                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
       /* Otherwise only RATIO is generated on the preheader edge; passing
          NULL means RATIO * VF is not requested.  */
6545   else
6546     {
6547       if (!ni_name)
6548         ni_name = vect_build_loop_niters (loop_vinfo);
6549       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6550     }
6551
6552   /* 1) Make sure the loop header has exactly two entries
6553      2) Make sure we have a preheader basic block.  */
6554
6555   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6556
6557   split_edge (loop_preheader_edge (loop));
6558
6559   /* FORNOW: the vectorizer supports only loops which body consist
6560      of one basic block (header + empty latch). When the vectorizer will
6561      support more involved loop forms, the order by which the BBs are
6562      traversed need to be reconsidered.  */
6563
6564   for (i = 0; i < nbbs; i++)
6565     {
6566       basic_block bb = bbs[i];
6567       stmt_vec_info stmt_info;
6568
           /* First handle the PHIs of BB; only PHIs whose definition type
              is vect_induction_def are transformed here.  */
6569       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6570            gsi_next (&si))
6571         {
6572           gphi *phi = si.phi ();
6573           if (dump_enabled_p ())
6574             {
6575               dump_printf_loc (MSG_NOTE, vect_location,
6576                                "------>vectorizing phi: ");
6577               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6578               dump_printf (MSG_NOTE, "\n");
6579             }
6580           stmt_info = vinfo_for_stmt (phi);
6581           if (!stmt_info)
6582             continue;
6583
               /* If debug statements may exist and the PHI is not live,
                  kill debug uses of its result outside LOOP.  */
6584           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6585             vect_loop_kill_debug_uses (loop, phi);
6586
6587           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6588               && !STMT_VINFO_LIVE_P (stmt_info))
6589             continue;
6590
               /* Diagnostic only: note when the vector type does not hold
                  exactly VECTORIZATION_FACTOR elements.  */
6591           if (STMT_VINFO_VECTYPE (stmt_info)
6592               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6593                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6594               && dump_enabled_p ())
6595             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6596
6597           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6598             {
6599               if (dump_enabled_p ())
6600                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6601               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6602             }
6603         }
6604
6605       pattern_stmt = NULL;
           /* Walk the statements of BB.  SI is advanced explicitly in the
              body: it stays on the same statement while that statement's
              pattern statement, or the pattern's definition sequence, is
              still pending, which is why the loop also runs while
              TRANSFORM_PATTERN_STMT is set.  */
6606       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6607            !gsi_end_p (si) || transform_pattern_stmt;)
6608         {
6609           bool is_store;
6610
6611           if (transform_pattern_stmt)
6612             stmt = pattern_stmt;
6613           else
6614             {
6615               stmt = gsi_stmt (si);
6616               /* During vectorization remove existing clobber stmts.  */
6617               if (gimple_clobber_p (stmt))
6618                 {
6619                   unlink_stmt_vdef (stmt);
6620                   gsi_remove (&si, true);
6621                   release_defs (stmt);
6622                   continue;
6623                 }
6624             }
6625
6626           if (dump_enabled_p ())
6627             {
6628               dump_printf_loc (MSG_NOTE, vect_location,
6629                                "------>vectorizing statement: ");
6630               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6631               dump_printf (MSG_NOTE, "\n");
6632             }
6633
6634           stmt_info = vinfo_for_stmt (stmt);
6635
6636           /* vector stmts created in the outer-loop during vectorization of
6637              stmts in an inner-loop may not have a stmt_info, and do not
6638              need to be vectorized.  */
6639           if (!stmt_info)
6640             {
6641               gsi_next (&si);
6642               continue;
6643             }
6644
6645           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6646             vect_loop_kill_debug_uses (loop, stmt);
6647
               /* STMT itself is neither relevant nor live: vectorize its
                  related pattern statement instead if that one is relevant
                  or live, otherwise skip STMT.  */
6648           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6649               && !STMT_VINFO_LIVE_P (stmt_info))
6650             {
6651               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6652                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6653                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6654                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6655                 {
6656                   stmt = pattern_stmt;
6657                   stmt_info = vinfo_for_stmt (stmt);
6658                 }
6659               else
6660                 {
6661                   gsi_next (&si);
6662                   continue;
6663                 }
6664             }
               /* STMT is relevant or live and also has a relevant or live
                  pattern statement: handle STMT now and PATTERN_STMT on a
                  later pass of this loop.  */
6665           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6666                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6667                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6668                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6669             transform_pattern_stmt = true;
6670
6671           /* If pattern statement has def stmts, vectorize them too.  */
6672           if (is_pattern_stmt_p (stmt_info))
6673             {
                   /* Start the definition sequence on the first visit, or
                      step past the entry handled on the previous pass.  */
6674               if (pattern_def_seq == NULL)
6675                 {
6676                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6677                   pattern_def_si = gsi_start (pattern_def_seq);
6678                 }
6679               else if (!gsi_end_p (pattern_def_si))
6680                 gsi_next (&pattern_def_si);
6681               if (pattern_def_seq != NULL)
6682                 {
6683                   gimple *pattern_def_stmt = NULL;
6684                   stmt_vec_info pattern_def_stmt_info = NULL;
6685
                       /* Skip entries that are neither relevant nor live.  */
6686                   while (!gsi_end_p (pattern_def_si))
6687                     {
6688                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6689                       pattern_def_stmt_info
6690                         = vinfo_for_stmt (pattern_def_stmt);
6691                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6692                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6693                         break;
6694                       gsi_next (&pattern_def_si);
6695                     }
6696
6697                   if (!gsi_end_p (pattern_def_si))
6698                     {
6699                       if (dump_enabled_p ())
6700                         {
6701                           dump_printf_loc (MSG_NOTE, vect_location,
6702                                            "==> vectorizing pattern def "
6703                                            "stmt: ");
6704                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6705                                             pattern_def_stmt, 0);
6706                           dump_printf (MSG_NOTE, "\n");
6707                         }
6708
6709                       stmt = pattern_def_stmt;
6710                       stmt_info = pattern_def_stmt_info;
6711                     }
                       /* The definition sequence is exhausted: STMT remains
                          the pattern statement and is handled on this pass.  */
6712                   else
6713                     {
6714                       pattern_def_si = gsi_none ();
6715                       transform_pattern_stmt = false;
6716                     }
6717                 }
6718               else
6719                 transform_pattern_stmt = false;
6720             }
6721
6722           if (STMT_VINFO_VECTYPE (stmt_info))
6723             {
6724               unsigned int nunits
6725                 = (unsigned int)
6726                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6727               if (!STMT_SLP_TYPE (stmt_info)
6728                   && nunits != (unsigned int) vectorization_factor
6729                   && dump_enabled_p ())
6730                   /* For SLP VF is set according to unrolling factor, and not
6731                      to vector size, hence for SLP this print is not valid.  */
6732                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6733             }
6734
6735           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6736              reached.  */
6737           if (STMT_SLP_TYPE (stmt_info))
6738             {
6739               if (!slp_scheduled)
6740                 {
6741                   slp_scheduled = true;
6742
6743                   if (dump_enabled_p ())
6744                     dump_printf_loc (MSG_NOTE, vect_location,
6745                                      "=== scheduling SLP instances ===\n");
6746
6747                   vect_schedule_slp (loop_vinfo);
6748                 }
6749
6750               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6751               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6752                 {
                       /* Advance SI only when no pattern statement or
                          pattern definition is still pending.  */
6753                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6754                     {
6755                       pattern_def_seq = NULL;
6756                       gsi_next (&si);
6757                     }
6758                   continue;
6759                 }
6760             }
6761
6762           /* -------- vectorize statement ------------ */
6763           if (dump_enabled_p ())
6764             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6765
6766           grouped_store = false;
6767           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6768           if (is_store)
6769             {
6770               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6771                 {
6772                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6773                      interleaving chain was completed - free all the stores in
6774                      the chain.  */
6775                   gsi_next (&si);
6776                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6777                 }
6778               else
6779                 {
6780                   /* Free the attached stmt_vec_info and remove the stmt.  */
6781                   gimple *store = gsi_stmt (si);
6782                   free_stmt_vec_info (store);
6783                   unlink_stmt_vdef (store);
6784                   gsi_remove (&si, true);
6785                   release_defs (store);
6786                 }
6787
6788               /* Stores can only appear at the end of pattern statements.  */
6789               gcc_assert (!transform_pattern_stmt);
6790               pattern_def_seq = NULL;
6791             }
               /* Not a store: move to the next statement unless a pattern
                  statement or pattern definition is still pending.  */
6792           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6793             {
6794               pattern_def_seq = NULL;
6795               gsi_next (&si);
6796             }
6797         }                       /* stmts in BB */
6798     }                           /* BBs in loop */
6799
6800   slpeel_make_loop_iterate_ntimes (loop, ratio);
6801
6802   /* Reduce loop iterations by the vectorization factor.  */
6803   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6804                       expected_iterations / vectorization_factor);
       /* Divide the recorded upper bound by the vectorization factor; when
          peeling for gaps, a non-zero bound is reduced by one more.  */
6805   loop->nb_iterations_upper_bound
6806     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6807   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6808       && loop->nb_iterations_upper_bound != 0)
6809     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
       /* Apply the same adjustment to the iteration estimate, if any.  */
6810   if (loop->any_estimate)
6811     {
6812       loop->nb_iterations_estimate
6813         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6814        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6815            && loop->nb_iterations_estimate != 0)
6816          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6817     }
6818
6819   if (dump_enabled_p ())
6820     {
6821       dump_printf_loc (MSG_NOTE, vect_location,
6822                        "LOOP VECTORIZED\n");
6823       if (loop->inner)
6824         dump_printf_loc (MSG_NOTE, vect_location,
6825                          "OUTER LOOP VECTORIZED\n");
6826       dump_printf (MSG_NOTE, "\n");
6827     }
6828 }