gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2016 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "cfgloop.h"
  45 #include "params.h"
  46 #include "tree-scalar-evolution.h"
  47 #include "tree-vectorizer.h"
  48 #include "gimple-fold.h"
  49 #include "cgraph.h"
  50
  51 /* Loop Vectorization Pass.
  52
  53    This pass tries to vectorize loops.
  54
  55    For example, the vectorizer transforms the following simple loop:
  56
  57         short a[N]; short b[N]; short c[N]; int i;
  58
  59         for (i=0; i<N; i++){
  60           a[i] = b[i] + c[i];
  61         }
  62
  63    as if it was manually vectorized by rewriting the source code into:
  64
  65         typedef int __attribute__((mode(V8HI))) v8hi;
  66         short a[N];  short b[N]; short c[N];   int i;
  67         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  68         v8hi va, vb, vc;
  69
  70         for (i=0; i<N/8; i++){
  71           vb = pb[i];
  72           vc = pc[i];
  73           va = vb + vc;
  74           pa[i] = va;
  75         }
  76
  77         The main entry to this pass is vectorize_loops(), in which
  78    the vectorizer applies a set of analyses on a given set of loops,
  79    followed by the actual vectorization transformation for the loops that
  80    had successfully passed the analysis phase.
  81         Throughout this pass we make a distinction between two types of
  82    data: scalars (which are represented by SSA_NAMES), and memory references
  83    ("data-refs").  These two types of data require different handling both
  84    during analysis and transformation. The types of data-refs that the
  85    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  86    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  87    accesses are required to have a simple (consecutive) access pattern.
  88
  89    Analysis phase:
  90    ===============
  91         The driver for the analysis phase is vect_analyze_loop().
  92    It applies a set of analyses, some of which rely on the scalar evolution
  93    analyzer (scev) developed by Sebastian Pop.
  94
  95         During the analysis phase the vectorizer records some information
  96    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  97    loop, as well as general information about the loop as a whole, which is
  98    recorded in a "loop_vec_info" struct attached to each loop.
  99
 100    Transformation phase:
 101    =====================
 102         The loop transformation phase scans all the stmts in the loop, and
 103    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 104    the loop that needs to be vectorized.  It inserts the vector code sequence
 105    just before the scalar stmt S, and records a pointer to the vector code
 106    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 107    attached to S).  This pointer will be used for the vectorization of following
 108    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 109    otherwise, we rely on dead code elimination for removing it.
 110
 111         For example, say stmt S1 was vectorized into stmt VS1:
 112
 113    VS1: vb = px[i];
 114    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 115    S2:  a = b;
 116
 117    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 118    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 119    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 120    resulting sequence would be:
 121
 122    VS1: vb = px[i];
 123    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 124    VS2: va = vb;
 125    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 126
 127         Operands that are not SSA_NAMEs, are data-refs that appear in
 128    load/store operations (like 'x[i]' in S1), and are handled differently.
 129
 130    Target modeling:
 131    =================
 132         Currently the only target specific information that is used is the
 133    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 134    Targets that can support different sizes of vectors, for now will need
 135    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 136    flexibility will be added in the future.
 137
 138         Since we only vectorize operations which vector form can be
 139    expressed using existing tree codes, to verify that an operation is
 140    supported, the vectorizer checks the relevant optab at the relevant
 141    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 142    the value found is CODE_FOR_nothing, then there's no target support, and
 143    we can't vectorize the stmt.
 144
 145    For additional information on this project see:
 146    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 147 */
 148
 149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 150
 151 /* Function vect_determine_vectorization_factor
 152
 153    Determine the vectorization factor (VF).  VF is the number of data elements
 154    that are operated upon in parallel in a single iteration of the vectorized
 155    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 156    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 157    elements can fit in a single vector register.
 158
 159    We currently support vectorization of loops in which all types operated upon
 160    are of the same size.  Therefore this function currently sets VF according to
 161    the size of the types operated upon, and fails if there are multiple sizes
 162    in the loop.
 163
 164    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 165    original loop:
 166         for (i=0; i<N; i++){
 167           a[i] = b[i] + c[i];
 168         }
 169
 170    vectorized loop:
 171         for (i=0; i<N; i+=VF){
 172           a[i:VF] = b[i:VF] + c[i:VF];
 173         }
 174 */
 175
 176 static bool
 177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 178 {
 179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 180   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 181   unsigned nbbs = loop->num_nodes;
 182   unsigned int vectorization_factor = 0;
 183   tree scalar_type;
 184   gphi *phi;
 185   tree vectype;
 186   unsigned int nunits;
 187   stmt_vec_info stmt_info;
 188   unsigned i;
 189   HOST_WIDE_INT dummy;
 190   gimple *stmt, *pattern_stmt = NULL;
 191   gimple_seq pattern_def_seq = NULL;
 192   gimple_stmt_iterator pattern_def_si = gsi_none ();
 193   bool analyze_pattern_stmt = false;
 194   bool bool_result;
 195   auto_vec<stmt_vec_info> mask_producers;
 196
 197   if (dump_enabled_p ())
 198     dump_printf_loc (MSG_NOTE, vect_location,
 199                      "=== vect_determine_vectorization_factor ===\n");
 200
 201   for (i = 0; i < nbbs; i++)
 202     {
 203       basic_block bb = bbs[i];
 204
 205       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 206            gsi_next (&si))
 207         {
 208           phi = si.phi ();
 209           stmt_info = vinfo_for_stmt (phi);
 210           if (dump_enabled_p ())
 211             {
 212               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 213               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 214               dump_printf (MSG_NOTE, "\n");
 215             }
 216
 217           gcc_assert (stmt_info);
 218
 219           if (STMT_VINFO_RELEVANT_P (stmt_info))
 220             {
 221               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 222               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 223
 224               if (dump_enabled_p ())
 225                 {
 226                   dump_printf_loc (MSG_NOTE, vect_location,
 227                                    "get vectype for scalar type:  ");
 228                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 229                   dump_printf (MSG_NOTE, "\n");
 230                 }
 231
 232               vectype = get_vectype_for_scalar_type (scalar_type);
 233               if (!vectype)
 234                 {
 235                   if (dump_enabled_p ())
 236                     {
 237                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 238                                        "not vectorized: unsupported "
 239                                        "data-type ");
 240                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 241                                          scalar_type);
 242                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 243                     }
 244                   return false;
 245                 }
 246               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 247
 248               if (dump_enabled_p ())
 249                 {
 250                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 251                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 252                   dump_printf (MSG_NOTE, "\n");
 253                 }
 254
 255               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 256               if (dump_enabled_p ())
 257                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 258                                  nunits);
 259
 260               if (!vectorization_factor
 261                   || (nunits > vectorization_factor))
 262                 vectorization_factor = nunits;
 263             }
 264         }
 265
 266       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 267            !gsi_end_p (si) || analyze_pattern_stmt;)
 268         {
 269           tree vf_vectype;
 270
 271           if (analyze_pattern_stmt)
 272             stmt = pattern_stmt;
 273           else
 274             stmt = gsi_stmt (si);
 275
 276           stmt_info = vinfo_for_stmt (stmt);
 277
 278           if (dump_enabled_p ())
 279             {
 280               dump_printf_loc (MSG_NOTE, vect_location,
 281                                "==> examining statement: ");
 282               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 283               dump_printf (MSG_NOTE, "\n");
 284             }
 285
 286           gcc_assert (stmt_info);
 287
 288           /* Skip stmts which do not need to be vectorized.  */
 289           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 290                && !STMT_VINFO_LIVE_P (stmt_info))
 291               || gimple_clobber_p (stmt))
 292             {
 293               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 294                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 295                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 296                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 297                 {
 298                   stmt = pattern_stmt;
 299                   stmt_info = vinfo_for_stmt (pattern_stmt);
 300                   if (dump_enabled_p ())
 301                     {
 302                       dump_printf_loc (MSG_NOTE, vect_location,
 303                                        "==> examining pattern statement: ");
 304                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 305                       dump_printf (MSG_NOTE, "\n");
 306                     }
 307                 }
 308               else
 309                 {
 310                   if (dump_enabled_p ())
 311                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 312                   gsi_next (&si);
 313                   continue;
 314                 }
 315             }
 316           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 317                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 318                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 319                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 320             analyze_pattern_stmt = true;
 321
 322           /* If a pattern statement has def stmts, analyze them too.  */
 323           if (is_pattern_stmt_p (stmt_info))
 324             {
 325               if (pattern_def_seq == NULL)
 326                 {
 327                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 328                   pattern_def_si = gsi_start (pattern_def_seq);
 329                 }
 330               else if (!gsi_end_p (pattern_def_si))
 331                 gsi_next (&pattern_def_si);
 332               if (pattern_def_seq != NULL)
 333                 {
 334                   gimple *pattern_def_stmt = NULL;
 335                   stmt_vec_info pattern_def_stmt_info = NULL;
 336
 337                   while (!gsi_end_p (pattern_def_si))
 338                     {
 339                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 340                       pattern_def_stmt_info
 341                         = vinfo_for_stmt (pattern_def_stmt);
 342                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 343                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 344                         break;
 345                       gsi_next (&pattern_def_si);
 346                     }
 347
 348                   if (!gsi_end_p (pattern_def_si))
 349                     {
 350                       if (dump_enabled_p ())
 351                         {
 352                           dump_printf_loc (MSG_NOTE, vect_location,
 353                                            "==> examining pattern def stmt: ");
 354                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 355                                             pattern_def_stmt, 0);
 356                           dump_printf (MSG_NOTE, "\n");
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 398                 }
 399               return false;
 400             }
 401
 402           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 403             {
 404               if (dump_enabled_p ())
 405                 {
 406                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 407                                    "not vectorized: vector stmt in loop:");
 408                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 409                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (is_gimple_call (stmt)
 431                   && gimple_call_internal_p (stmt)
 432                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 433                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 434               else
 435                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 436
 437               /* Bool ops don't participate in vectorization factor
 438                  computation.  For comparison use compared types to
 439                  compute a factor.  */
 440               if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
 441                   && is_gimple_assign (stmt)
 442                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 443                 {
 444                   if (STMT_VINFO_RELEVANT_P (stmt_info))
 445                     mask_producers.safe_push (stmt_info);
 446                   bool_result = true;
 447
 448                   if (gimple_code (stmt) == GIMPLE_ASSIGN
 449                       && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 450                          == tcc_comparison
 451                       && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
 452                          != BOOLEAN_TYPE)
 453                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 454                   else
 455                     {
 456                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 457                         {
 458                           pattern_def_seq = NULL;
 459                           gsi_next (&si);
 460                         }
 461                       continue;
 462                     }
 463                 }
 464
 465               if (dump_enabled_p ())
 466                 {
 467                   dump_printf_loc (MSG_NOTE, vect_location,
 468                                    "get vectype for scalar type:  ");
 469                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 470                   dump_printf (MSG_NOTE, "\n");
 471                 }
 472               vectype = get_vectype_for_scalar_type (scalar_type);
 473               if (!vectype)
 474                 {
 475                   if (dump_enabled_p ())
 476                     {
 477                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 478                                        "not vectorized: unsupported "
 479                                        "data-type ");
 480                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 481                                          scalar_type);
 482                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 483                     }
 484                   return false;
 485                 }
 486
 487               if (!bool_result)
 488                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 489
 490               if (dump_enabled_p ())
 491                 {
 492                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 493                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 494                   dump_printf (MSG_NOTE, "\n");
 495                 }
 496             }
 497
 498           /* Don't try to compute VF out scalar types if we stmt
 499              produces boolean vector.  Use result vectype instead.  */
 500           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 501             vf_vectype = vectype;
 502           else
 503             {
 504               /* The vectorization factor is according to the smallest
 505                  scalar type (or the largest vector size, but we only
 506                  support one vector size per loop).  */
 507               if (!bool_result)
 508                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 509                                                              &dummy);
 510               if (dump_enabled_p ())
 511                 {
 512                   dump_printf_loc (MSG_NOTE, vect_location,
 513                                    "get vectype for scalar type:  ");
 514                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 515                   dump_printf (MSG_NOTE, "\n");
 516                 }
 517               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 518             }
 519           if (!vf_vectype)
 520             {
 521               if (dump_enabled_p ())
 522                 {
 523                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 524                                    "not vectorized: unsupported data-type ");
 525                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 526                                      scalar_type);
 527                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 528                 }
 529               return false;
 530             }
 531
 532           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 533                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 534             {
 535               if (dump_enabled_p ())
 536                 {
 537                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 538                                    "not vectorized: different sized vector "
 539                                    "types in statement, ");
 540                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 541                                      vectype);
 542                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 543                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 544                                      vf_vectype);
 545                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 546                 }
 547               return false;
 548             }
 549
 550           if (dump_enabled_p ())
 551             {
 552               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 553               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 554               dump_printf (MSG_NOTE, "\n");
 555             }
 556
 557           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 558           if (dump_enabled_p ())
 559             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 560           if (!vectorization_factor
 561               || (nunits > vectorization_factor))
 562             vectorization_factor = nunits;
 563
 564           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 565             {
 566               pattern_def_seq = NULL;
 567               gsi_next (&si);
 568             }
 569         }
 570     }
 571
 572   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 573   if (dump_enabled_p ())
 574     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 575                      vectorization_factor);
 576   if (vectorization_factor <= 1)
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "not vectorized: unsupported data-type\n");
 581       return false;
 582     }
 583   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 584
 585   for (i = 0; i < mask_producers.length (); i++)
 586     {
 587       tree mask_type = NULL;
 588
 589       stmt = STMT_VINFO_STMT (mask_producers[i]);
 590
 591       if (gimple_code (stmt) == GIMPLE_ASSIGN
 592           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 593           && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
 594         {
 595           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 596           mask_type = get_mask_type_for_scalar_type (scalar_type);
 597
 598           if (!mask_type)
 599             {
 600               if (dump_enabled_p ())
 601                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 602                                  "not vectorized: unsupported mask\n");
 603               return false;
 604             }
 605         }
 606       else
 607         {
 608           tree rhs;
 609           ssa_op_iter iter;
 610           gimple *def_stmt;
 611           enum vect_def_type dt;
 612
 613           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 614             {
 615               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 616                                        &def_stmt, &dt, &vectype))
 617                 {
 618                   if (dump_enabled_p ())
 619                     {
 620                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 621                                        "not vectorized: can't compute mask type "
 622                                        "for statement, ");
 623                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 624                                         0);
 625                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 626                     }
 627                   return false;
 628                 }
 629
 630               /* No vectype probably means external definition.
 631                  Allow it in case there is another operand which
 632                  allows to determine mask type.  */
 633               if (!vectype)
 634                 continue;
 635
 636               if (!mask_type)
 637                 mask_type = vectype;
 638               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 639                        != TYPE_VECTOR_SUBPARTS (vectype))
 640                 {
 641                   if (dump_enabled_p ())
 642                     {
 643                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 644                                        "not vectorized: different sized masks "
 645                                        "types in statement, ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          mask_type);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 649                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 650                                          vectype);
 651                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 652                     }
 653                   return false;
 654                 }
 655               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 656                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 657                 {
 658                   if (dump_enabled_p ())
 659                     {
 660                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 661                                        "not vectorized: mixed mask and "
 662                                        "nonmask vector types in statement, ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          mask_type);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 666                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 667                                          vectype);
 668                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 669                     }
 670                   return false;
 671                 }
 672             }
 673
 674           /* We may compare boolean value loaded as vector of integers.
 675              Fix mask_type in such case.  */
 676           if (mask_type
 677               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 678               && gimple_code (stmt) == GIMPLE_ASSIGN
 679               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 680             mask_type = build_same_sized_truth_vector_type (mask_type);
 681         }
 682
 683       /* No mask_type should mean loop invariant predicate.
 684          This is probably a subject for optimization in
 685          if-conversion.  */
 686       if (!mask_type)
 687         {
 688           if (dump_enabled_p ())
 689             {
 690               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 691                                "not vectorized: can't compute mask type "
 692                                "for statement, ");
 693               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 694                                 0);
 695               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 696             }
 697           return false;
 698         }
 699
 700       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 701     }
 702
 703   return true;
 704 }
 705
 706
 707 /* Function vect_is_simple_iv_evolution.
 708
 709    FORNOW: A simple evolution of an induction variables in the loop is
 710    considered a polynomial evolution.  */
 711
 712 static bool
 713 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 714                              tree * step)
 715 {
 716   tree init_expr;
 717   tree step_expr;
 718   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 719   basic_block bb;
 720
 721   /* When there is no evolution in this loop, the evolution function
 722      is not "simple".  */
 723   if (evolution_part == NULL_TREE)
 724     return false;
 725
 726   /* When the evolution is a polynomial of degree >= 2
 727      the evolution function is not "simple".  */
 728   if (tree_is_chrec (evolution_part))
 729     return false;
 730
 731   step_expr = evolution_part;
 732   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 733
 734   if (dump_enabled_p ())
 735     {
 736       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 737       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 738       dump_printf (MSG_NOTE, ",  init: ");
 739       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 740       dump_printf (MSG_NOTE, "\n");
 741     }
 742
 743   *init = init_expr;
 744   *step = step_expr;
 745
 746   if (TREE_CODE (step_expr) != INTEGER_CST
 747       && (TREE_CODE (step_expr) != SSA_NAME
 748           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 749               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 750           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 751               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 752                   || !flag_associative_math)))
 753       && (TREE_CODE (step_expr) != REAL_CST
 754           || !flag_associative_math))
 755     {
 756       if (dump_enabled_p ())
 757         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 758                          "step unknown.\n");
 759       return false;
 760     }
 761
 762   return true;
 763 }
 764
 765 /* Function vect_analyze_scalar_cycles_1.
 766
 767    Examine the cross iteration def-use cycles of scalar variables
 768    in LOOP.  LOOP_VINFO represents the loop that is now being
 769    considered for vectorization (can be LOOP, or an outer-loop
 770    enclosing LOOP).  */
 771
 772 static void
 773 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 774 {
 775   basic_block bb = loop->header;
 776   tree init, step;
 777   auto_vec<gimple *, 64> worklist;
 778   gphi_iterator gsi;
 779   bool double_reduc;
 780
 781   if (dump_enabled_p ())
 782     dump_printf_loc (MSG_NOTE, vect_location,
 783                      "=== vect_analyze_scalar_cycles ===\n");
 784
 785   /* First - identify all inductions.  Reduction detection assumes that all the
 786      inductions have been identified, therefore, this order must not be
 787      changed.  */
 788   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 789     {
 790       gphi *phi = gsi.phi ();
 791       tree access_fn = NULL;
 792       tree def = PHI_RESULT (phi);
 793       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 794
 795       if (dump_enabled_p ())
 796         {
 797           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 798           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 799           dump_printf (MSG_NOTE, "\n");
 800         }
 801
 802       /* Skip virtual phi's.  The data dependences that are associated with
 803          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 804       if (virtual_operand_p (def))
 805         continue;
 806
 807       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 808
 809       /* Analyze the evolution function.  */
 810       access_fn = analyze_scalar_evolution (loop, def);
 811       if (access_fn)
 812         {
 813           STRIP_NOPS (access_fn);
 814           if (dump_enabled_p ())
 815             {
 816               dump_printf_loc (MSG_NOTE, vect_location,
 817                                "Access function of PHI: ");
 818               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 819               dump_printf (MSG_NOTE, "\n");
 820             }
 821           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 822             = initial_condition_in_loop_num (access_fn, loop->num);
 823           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 824             = evolution_part_in_loop_num (access_fn, loop->num);
 825         }
 826
 827       if (!access_fn
 828           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 829           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 830               && TREE_CODE (step) != INTEGER_CST))
 831         {
 832           worklist.safe_push (phi);
 833           continue;
 834         }
 835
 836       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 837                   != NULL_TREE);
 838       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 839
 840       if (dump_enabled_p ())
 841         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 842       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 843     }
 844
 845
 846   /* Second - identify all reductions and nested cycles.  */
 847   while (worklist.length () > 0)
 848     {
 849       gimple *phi = worklist.pop ();
 850       tree def = PHI_RESULT (phi);
 851       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 852       gimple *reduc_stmt;
 853       bool nested_cycle;
 854
 855       if (dump_enabled_p ())
 856         {
 857           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 858           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 859           dump_printf (MSG_NOTE, "\n");
 860         }
 861
 862       gcc_assert (!virtual_operand_p (def)
 863                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 864
 865       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 867                                                 &double_reduc, false);
 868       if (reduc_stmt)
 869         {
 870           if (double_reduc)
 871             {
 872               if (dump_enabled_p ())
 873                 dump_printf_loc (MSG_NOTE, vect_location,
 874                                  "Detected double reduction.\n");
 875
 876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 878                                                     vect_double_reduction_def;
 879             }
 880           else
 881             {
 882               if (nested_cycle)
 883                 {
 884                   if (dump_enabled_p ())
 885                     dump_printf_loc (MSG_NOTE, vect_location,
 886                                      "Detected vectorizable nested cycle.\n");
 887
 888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 890                                                              vect_nested_cycle;
 891                 }
 892               else
 893                 {
 894                   if (dump_enabled_p ())
 895                     dump_printf_loc (MSG_NOTE, vect_location,
 896                                      "Detected reduction.\n");
 897
 898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 900                                                            vect_reduction_def;
 901                   /* Store the reduction cycles for possible vectorization in
 902                      loop-aware SLP.  */
 903                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 904                 }
 905             }
 906         }
 907       else
 908         if (dump_enabled_p ())
 909           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 910                            "Unknown def-use cycle pattern.\n");
 911     }
 912 }
 913
 914
 915 /* Function vect_analyze_scalar_cycles.
 916
 917    Examine the cross iteration def-use cycles of scalar variables, by
 918    analyzing the loop-header PHIs of scalar variables.  Classify each
 919    cycle as one of the following: invariant, induction, reduction, unknown.
 920    We do that for the loop represented by LOOP_VINFO, and also to its
 921    inner-loop, if exists.
 922    Examples for scalar cycles:
 923
 924    Example1: reduction:
 925
 926               loop1:
 927               for (i=0; i<N; i++)
 928                  sum += a[i];
 929
 930    Example2: induction:
 931
 932               loop2:
 933               for (i=0; i<N; i++)
 934                  a[i] = i;  */
 935
 936 static void
 937 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 938 {
 939   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 940
 941   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 942
 943   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 944      Reductions in such inner-loop therefore have different properties than
 945      the reductions in the nest that gets vectorized:
 946      1. When vectorized, they are executed in the same order as in the original
 947         scalar loop, so we can't change the order of computation when
 948         vectorizing them.
 949      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 950         current checks are too strict.  */
 951
 952   if (loop->inner)
 953     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 954 }
 955
 956 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 957
 958 static void
 959 vect_fixup_reduc_chain (gimple *stmt)
 960 {
 961   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 962   gimple *stmtp;
 963   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 964               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 965   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 966   do
 967     {
 968       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 969       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 970       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 971       if (stmt)
 972         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 973           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 974     }
 975   while (stmt);
 976   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 977 }
 978
 979 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 980
 981 static void
 982 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 983 {
 984   gimple *first;
 985   unsigned i;
 986
 987   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 988     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 989       {
 990         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 991         while (next)
 992           {
 993             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 994               break;
 995             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 996           }
 997         /* If not all stmt in the chain are patterns try to handle
 998            the chain without patterns.  */
 999         if (! next)
1000           {
1001             vect_fixup_reduc_chain (first);
1002             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1003               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1004           }
1005       }
1006 }
1007
1008 /* Function vect_get_loop_niters.
1009
1010    Determine how many iterations the loop is executed and place it
1011    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1012    in NUMBER_OF_ITERATIONSM1.
1013
1014    Return the loop exit condition.  */
1015
1016
1017 static gcond *
1018 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1019                       tree *number_of_iterationsm1)
1020 {
1021   tree niters;
1022
1023   if (dump_enabled_p ())
1024     dump_printf_loc (MSG_NOTE, vect_location,
1025                      "=== get_loop_niters ===\n");
1026
1027   niters = number_of_latch_executions (loop);
1028   *number_of_iterationsm1 = niters;
1029
1030   /* We want the number of loop header executions which is the number
1031      of latch executions plus one.
1032      ???  For UINT_MAX latch executions this number overflows to zero
1033      for loops like do { n++; } while (n != 0);  */
1034   if (niters && !chrec_contains_undetermined (niters))
1035     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1036                           build_int_cst (TREE_TYPE (niters), 1));
1037   *number_of_iterations = niters;
1038
1039   return get_loop_exit_condition (loop);
1040 }
1041
1042
1043 /* Function bb_in_loop_p
1044
1045    Used as predicate for dfs order traversal of the loop bbs.  */
1046
1047 static bool
1048 bb_in_loop_p (const_basic_block bb, const void *data)
1049 {
1050   const struct loop *const loop = (const struct loop *)data;
1051   if (flow_bb_inside_loop_p (loop, bb))
1052     return true;
1053   return false;
1054 }
1055
1056
1057 /* Function new_loop_vec_info.
1058
1059    Create and initialize a new loop_vec_info struct for LOOP, as well as
1060    stmt_vec_info structs for all the stmts in LOOP.  */
1061
1062 static loop_vec_info
1063 new_loop_vec_info (struct loop *loop)
1064 {
1065   loop_vec_info res;
1066   basic_block *bbs;
1067   gimple_stmt_iterator si;
1068   unsigned int i, nbbs;
1069
1070   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1071   res->kind = vec_info::loop;
1072   LOOP_VINFO_LOOP (res) = loop;
1073
1074   bbs = get_loop_body (loop);
1075
1076   /* Create/Update stmt_info for all stmts in the loop.  */
1077   for (i = 0; i < loop->num_nodes; i++)
1078     {
1079       basic_block bb = bbs[i];
1080
1081       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1082         {
1083           gimple *phi = gsi_stmt (si);
1084           gimple_set_uid (phi, 0);
1085           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1086         }
1087
1088       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1089         {
1090           gimple *stmt = gsi_stmt (si);
1091           gimple_set_uid (stmt, 0);
1092           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1093         }
1094     }
1095
1096   /* CHECKME: We want to visit all BBs before their successors (except for
1097      latch blocks, for which this assertion wouldn't hold).  In the simple
1098      case of the loop forms we allow, a dfs order of the BBs would the same
1099      as reversed postorder traversal, so we are safe.  */
1100
1101    free (bbs);
1102    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1103    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1104                               bbs, loop->num_nodes, loop);
1105    gcc_assert (nbbs == loop->num_nodes);
1106
1107   LOOP_VINFO_BBS (res) = bbs;
1108   LOOP_VINFO_NITERSM1 (res) = NULL;
1109   LOOP_VINFO_NITERS (res) = NULL;
1110   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1111   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1112   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1113   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1114   LOOP_VINFO_VECT_FACTOR (res) = 0;
1115   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1116   LOOP_VINFO_DATAREFS (res) = vNULL;
1117   LOOP_VINFO_DDRS (res) = vNULL;
1118   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1119   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1120   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1121   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1122   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1123   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1124   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1125   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1126   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1127   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1128   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1129   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1130
1131   return res;
1132 }
1133
1134
1135 /* Function destroy_loop_vec_info.
1136
1137    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1138    stmts in the loop.  */
1139
1140 void
1141 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1142 {
1143   struct loop *loop;
1144   basic_block *bbs;
1145   int nbbs;
1146   gimple_stmt_iterator si;
1147   int j;
1148   vec<slp_instance> slp_instances;
1149   slp_instance instance;
1150   bool swapped;
1151
1152   if (!loop_vinfo)
1153     return;
1154
1155   loop = LOOP_VINFO_LOOP (loop_vinfo);
1156
1157   bbs = LOOP_VINFO_BBS (loop_vinfo);
1158   nbbs = clean_stmts ? loop->num_nodes : 0;
1159   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1160
1161   for (j = 0; j < nbbs; j++)
1162     {
1163       basic_block bb = bbs[j];
1164       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1165         free_stmt_vec_info (gsi_stmt (si));
1166
1167       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1168         {
1169           gimple *stmt = gsi_stmt (si);
1170
1171           /* We may have broken canonical form by moving a constant
1172              into RHS1 of a commutative op.  Fix such occurrences.  */
1173           if (swapped && is_gimple_assign (stmt))
1174             {
1175               enum tree_code code = gimple_assign_rhs_code (stmt);
1176
1177               if ((code == PLUS_EXPR
1178                    || code == POINTER_PLUS_EXPR
1179                    || code == MULT_EXPR)
1180                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1181                 swap_ssa_operands (stmt,
1182                                    gimple_assign_rhs1_ptr (stmt),
1183                                    gimple_assign_rhs2_ptr (stmt));
1184             }
1185
1186           /* Free stmt_vec_info.  */
1187           free_stmt_vec_info (stmt);
1188           gsi_next (&si);
1189         }
1190     }
1191
1192   free (LOOP_VINFO_BBS (loop_vinfo));
1193   vect_destroy_datarefs (loop_vinfo);
1194   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1195   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1196   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1197   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1198   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1199   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1200   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1201     vect_free_slp_instance (instance);
1202
1203   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1204   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1205   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1206   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1207
1208   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1209   loop_vinfo->scalar_cost_vec.release ();
1210
1211   free (loop_vinfo);
1212   loop->aux = NULL;
1213 }
1214
1215
1216 /* Calculate the cost of one scalar iteration of the loop.  */
1217 static void
1218 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1219 {
1220   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1221   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1222   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1223   int innerloop_iters, i;
1224
1225   /* Count statements in scalar loop.  Using this as scalar cost for a single
1226      iteration for now.
1227
1228      TODO: Add outer loop support.
1229
1230      TODO: Consider assigning different costs to different scalar
1231      statements.  */
1232
1233   /* FORNOW.  */
1234   innerloop_iters = 1;
1235   if (loop->inner)
1236     innerloop_iters = 50; /* FIXME */
1237
1238   for (i = 0; i < nbbs; i++)
1239     {
1240       gimple_stmt_iterator si;
1241       basic_block bb = bbs[i];
1242
1243       if (bb->loop_father == loop->inner)
1244         factor = innerloop_iters;
1245       else
1246         factor = 1;
1247
1248       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1249         {
1250           gimple *stmt = gsi_stmt (si);
1251           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1252
1253           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254             continue;
1255
1256           /* Skip stmts that are not vectorized inside the loop.  */
1257           if (stmt_info
1258               && !STMT_VINFO_RELEVANT_P (stmt_info)
1259               && (!STMT_VINFO_LIVE_P (stmt_info)
1260                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1261               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1262             continue;
1263
1264           vect_cost_for_stmt kind;
1265           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1266             {
1267               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1268                kind = scalar_load;
1269              else
1270                kind = scalar_store;
1271             }
1272           else
1273             kind = scalar_stmt;
1274
1275           scalar_single_iter_cost
1276             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1277                                  factor, kind, NULL, 0, vect_prologue);
1278         }
1279     }
1280   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1281     = scalar_single_iter_cost;
1282 }
1283
1284
1285 /* Function vect_analyze_loop_form_1.
1286
1287    Verify that certain CFG restrictions hold, including:
1288    - the loop has a pre-header
1289    - the loop has a single entry and exit
1290    - the loop exit condition is simple enough, and the number of iterations
1291      can be analyzed (a countable loop).  */
1292
1293 bool
1294 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1295                           tree *number_of_iterationsm1,
1296                           tree *number_of_iterations, gcond **inner_loop_cond)
1297 {
1298   if (dump_enabled_p ())
1299     dump_printf_loc (MSG_NOTE, vect_location,
1300                      "=== vect_analyze_loop_form ===\n");
1301
1302   /* Different restrictions apply when we are considering an inner-most loop,
1303      vs. an outer (nested) loop.
1304      (FORNOW. May want to relax some of these restrictions in the future).  */
1305
1306   if (!loop->inner)
1307     {
1308       /* Inner-most loop.  We currently require that the number of BBs is
1309          exactly 2 (the header and latch).  Vectorizable inner-most loops
1310          look like this:
1311
1312                         (pre-header)
1313                            |
1314                           header <--------+
1315                            | |            |
1316                            | +--> latch --+
1317                            |
1318                         (exit-bb)  */
1319
1320       if (loop->num_nodes != 2)
1321         {
1322           if (dump_enabled_p ())
1323             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324                              "not vectorized: control flow in loop.\n");
1325           return false;
1326         }
1327
1328       if (empty_block_p (loop->header))
1329         {
1330           if (dump_enabled_p ())
1331             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1332                              "not vectorized: empty loop.\n");
1333           return false;
1334         }
1335     }
1336   else
1337     {
1338       struct loop *innerloop = loop->inner;
1339       edge entryedge;
1340
1341       /* Nested loop. We currently require that the loop is doubly-nested,
1342          contains a single inner loop, and the number of BBs is exactly 5.
1343          Vectorizable outer-loops look like this:
1344
1345                         (pre-header)
1346                            |
1347                           header <---+
1348                            |         |
1349                           inner-loop |
1350                            |         |
1351                           tail ------+
1352                            |
1353                         (exit-bb)
1354
1355          The inner-loop has the properties expected of inner-most loops
1356          as described above.  */
1357
1358       if ((loop->inner)->inner || (loop->inner)->next)
1359         {
1360           if (dump_enabled_p ())
1361             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1362                              "not vectorized: multiple nested loops.\n");
1363           return false;
1364         }
1365
1366       if (loop->num_nodes != 5)
1367         {
1368           if (dump_enabled_p ())
1369             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1370                              "not vectorized: control flow in loop.\n");
1371           return false;
1372         }
1373
1374       entryedge = loop_preheader_edge (innerloop);
1375       if (entryedge->src != loop->header
1376           || !single_exit (innerloop)
1377           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1378         {
1379           if (dump_enabled_p ())
1380             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1381                              "not vectorized: unsupported outerloop form.\n");
1382           return false;
1383         }
1384
1385       /* Analyze the inner-loop.  */
1386       tree inner_niterm1, inner_niter;
1387       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1388                                       &inner_niterm1, &inner_niter, NULL))
1389         {
1390           if (dump_enabled_p ())
1391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392                              "not vectorized: Bad inner loop.\n");
1393           return false;
1394         }
1395
1396       if (!expr_invariant_in_loop_p (loop, inner_niter))
1397         {
1398           if (dump_enabled_p ())
1399             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400                              "not vectorized: inner-loop count not"
1401                              " invariant.\n");
1402           return false;
1403         }
1404
1405       if (dump_enabled_p ())
1406         dump_printf_loc (MSG_NOTE, vect_location,
1407                          "Considering outer-loop vectorization.\n");
1408     }
1409
1410   if (!single_exit (loop)
1411       || EDGE_COUNT (loop->header->preds) != 2)
1412     {
1413       if (dump_enabled_p ())
1414         {
1415           if (!single_exit (loop))
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: multiple exits.\n");
1418           else if (EDGE_COUNT (loop->header->preds) != 2)
1419             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420                              "not vectorized: too many incoming edges.\n");
1421         }
1422       return false;
1423     }
1424
1425   /* We assume that the loop exit condition is at the end of the loop. i.e,
1426      that the loop is represented as a do-while (with a proper if-guard
1427      before the loop if needed), where the loop header contains all the
1428      executable statements, and the latch is empty.  */
1429   if (!empty_block_p (loop->latch)
1430       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1431     {
1432       if (dump_enabled_p ())
1433         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1434                          "not vectorized: latch block not empty.\n");
1435       return false;
1436     }
1437
1438   /* Make sure there exists a single-predecessor exit bb:  */
1439   if (!single_pred_p (single_exit (loop)->dest))
1440     {
1441       edge e = single_exit (loop);
1442       if (!(e->flags & EDGE_ABNORMAL))
1443         {
1444           split_loop_exit_edge (e);
1445           if (dump_enabled_p ())
1446             dump_printf (MSG_NOTE, "split exit edge.\n");
1447         }
1448       else
1449         {
1450           if (dump_enabled_p ())
1451             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452                              "not vectorized: abnormal loop exit edge.\n");
1453           return false;
1454         }
1455     }
1456
1457   *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1458                                      number_of_iterationsm1);
1459   if (!*loop_cond)
1460     {
1461       if (dump_enabled_p ())
1462         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463                          "not vectorized: complicated exit condition.\n");
1464       return false;
1465     }
1466
1467   if (!*number_of_iterations
1468       || chrec_contains_undetermined (*number_of_iterations))
1469     {
1470       if (dump_enabled_p ())
1471         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472                          "not vectorized: number of iterations cannot be "
1473                          "computed.\n");
1474       return false;
1475     }
1476
1477   if (integer_zerop (*number_of_iterations))
1478     {
1479       if (dump_enabled_p ())
1480         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481                          "not vectorized: number of iterations = 0.\n");
1482       return false;
1483     }
1484
1485   return true;
1486 }
1487
1488 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1489
1490 loop_vec_info
1491 vect_analyze_loop_form (struct loop *loop)
1492 {
1493   tree number_of_iterations, number_of_iterationsm1;
1494   gcond *loop_cond, *inner_loop_cond = NULL;
1495
1496   if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1497                                   &number_of_iterations, &inner_loop_cond))
1498     return NULL;
1499
1500   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1501   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1502   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1503   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1504
1505   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1506     {
1507       if (dump_enabled_p ())
1508         {
1509           dump_printf_loc (MSG_NOTE, vect_location,
1510                            "Symbolic number of iterations is ");
1511           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1512           dump_printf (MSG_NOTE, "\n");
1513         }
1514     }
1515
1516   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1517   if (inner_loop_cond)
1518     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1519       = loop_exit_ctrl_vec_info_type;
1520
1521   gcc_assert (!loop->aux);
1522   loop->aux = loop_vinfo;
1523   return loop_vinfo;
1524 }
1525
1526
1527
1528 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1529    statements update the vectorization factor.  */
1530
1531 static void
1532 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1533 {
1534   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1535   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1536   int nbbs = loop->num_nodes;
1537   unsigned int vectorization_factor;
1538   int i;
1539
1540   if (dump_enabled_p ())
1541     dump_printf_loc (MSG_NOTE, vect_location,
1542                      "=== vect_update_vf_for_slp ===\n");
1543
1544   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1545   gcc_assert (vectorization_factor != 0);
1546
1547   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1548      vectorization factor of the loop is the unrolling factor required by
1549      the SLP instances.  If that unrolling factor is 1, we say, that we
1550      perform pure SLP on loop - cross iteration parallelism is not
1551      exploited.  */
1552   bool only_slp_in_loop = true;
1553   for (i = 0; i < nbbs; i++)
1554     {
1555       basic_block bb = bbs[i];
1556       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1557            gsi_next (&si))
1558         {
1559           gimple *stmt = gsi_stmt (si);
1560           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1561           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1562               && STMT_VINFO_RELATED_STMT (stmt_info))
1563             {
1564               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1565               stmt_info = vinfo_for_stmt (stmt);
1566             }
1567           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1568                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1569               && !PURE_SLP_STMT (stmt_info))
1570             /* STMT needs both SLP and loop-based vectorization.  */
1571             only_slp_in_loop = false;
1572         }
1573     }
1574
1575   if (only_slp_in_loop)
1576     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1577   else
1578     vectorization_factor
1579       = least_common_multiple (vectorization_factor,
1580                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1581
1582   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1583   if (dump_enabled_p ())
1584     dump_printf_loc (MSG_NOTE, vect_location,
1585                      "Updating vectorization factor to %d\n",
1586                      vectorization_factor);
1587 }
1588
1589 /* Function vect_analyze_loop_operations.
1590
1591    Scan the loop stmts and make sure they are all vectorizable.  */
1592
1593 static bool
1594 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1595 {
1596   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1597   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1598   int nbbs = loop->num_nodes;
1599   int i;
1600   stmt_vec_info stmt_info;
1601   bool need_to_vectorize = false;
1602   bool ok;
1603
1604   if (dump_enabled_p ())
1605     dump_printf_loc (MSG_NOTE, vect_location,
1606                      "=== vect_analyze_loop_operations ===\n");
1607
1608   for (i = 0; i < nbbs; i++)
1609     {
1610       basic_block bb = bbs[i];
1611
1612       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1613            gsi_next (&si))
1614         {
1615           gphi *phi = si.phi ();
1616           ok = true;
1617
1618           stmt_info = vinfo_for_stmt (phi);
1619           if (dump_enabled_p ())
1620             {
1621               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1622               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1623               dump_printf (MSG_NOTE, "\n");
1624             }
1625           if (virtual_operand_p (gimple_phi_result (phi)))
1626             continue;
1627
1628           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1629              (i.e., a phi in the tail of the outer-loop).  */
1630           if (! is_loop_header_bb_p (bb))
1631             {
1632               /* FORNOW: we currently don't support the case that these phis
1633                  are not used in the outerloop (unless it is double reduction,
1634                  i.e., this phi is vect_reduction_def), cause this case
1635                  requires to actually do something here.  */
1636               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1637                    || STMT_VINFO_LIVE_P (stmt_info))
1638                   && STMT_VINFO_DEF_TYPE (stmt_info)
1639                      != vect_double_reduction_def)
1640                 {
1641                   if (dump_enabled_p ())
1642                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643                                      "Unsupported loop-closed phi in "
1644                                      "outer-loop.\n");
1645                   return false;
1646                 }
1647
1648               /* If PHI is used in the outer loop, we check that its operand
1649                  is defined in the inner loop.  */
1650               if (STMT_VINFO_RELEVANT_P (stmt_info))
1651                 {
1652                   tree phi_op;
1653                   gimple *op_def_stmt;
1654
1655                   if (gimple_phi_num_args (phi) != 1)
1656                     return false;
1657
1658                   phi_op = PHI_ARG_DEF (phi, 0);
1659                   if (TREE_CODE (phi_op) != SSA_NAME)
1660                     return false;
1661
1662                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1663                   if (gimple_nop_p (op_def_stmt)
1664                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1665                       || !vinfo_for_stmt (op_def_stmt))
1666                     return false;
1667
1668                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1669                         != vect_used_in_outer
1670                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1671                            != vect_used_in_outer_by_reduction)
1672                     return false;
1673                 }
1674
1675               continue;
1676             }
1677
1678           gcc_assert (stmt_info);
1679
1680           if (STMT_VINFO_LIVE_P (stmt_info))
1681             {
1682               /* FORNOW: not yet supported.  */
1683               if (dump_enabled_p ())
1684                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685                                  "not vectorized: value used after loop.\n");
1686               return false;
1687             }
1688
1689           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1690               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1691             {
1692               /* A scalar-dependence cycle that we don't support.  */
1693               if (dump_enabled_p ())
1694                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1695                                  "not vectorized: scalar dependence cycle.\n");
1696               return false;
1697             }
1698
1699           if (STMT_VINFO_RELEVANT_P (stmt_info))
1700             {
1701               need_to_vectorize = true;
1702               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1703                 ok = vectorizable_induction (phi, NULL, NULL);
1704             }
1705
1706           if (!ok)
1707             {
1708               if (dump_enabled_p ())
1709                 {
1710                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711                                    "not vectorized: relevant phi not "
1712                                    "supported: ");
1713                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1714                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1715                 }
1716               return false;
1717             }
1718         }
1719
1720       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1721            gsi_next (&si))
1722         {
1723           gimple *stmt = gsi_stmt (si);
1724           if (!gimple_clobber_p (stmt)
1725               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1726             return false;
1727         }
1728     } /* bbs */
1729
1730   /* All operations in the loop are either irrelevant (deal with loop
1731      control, or dead), or only used outside the loop and can be moved
1732      out of the loop (e.g. invariants, inductions).  The loop can be
1733      optimized away by scalar optimizations.  We're better off not
1734      touching this loop.  */
1735   if (!need_to_vectorize)
1736     {
1737       if (dump_enabled_p ())
1738         dump_printf_loc (MSG_NOTE, vect_location,
1739                          "All the computation can be taken out of the loop.\n");
1740       if (dump_enabled_p ())
1741         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742                          "not vectorized: redundant loop. no profit to "
1743                          "vectorize.\n");
1744       return false;
1745     }
1746
1747   return true;
1748 }
1749
1750
1751 /* Function vect_analyze_loop_2.
1752
1753    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1754    for it.  The different analyses will record information in the
1755    loop_vec_info struct.  */
1756 static bool
1757 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1758 {
1759   bool ok;
1760   int max_vf = MAX_VECTORIZATION_FACTOR;
1761   int min_vf = 2;
1762   unsigned int n_stmts = 0;
1763
1764   /* The first group of checks is independent of the vector size.  */
1765   fatal = true;
1766
1767   /* Find all data references in the loop (which correspond to vdefs/vuses)
1768      and analyze their evolution in the loop.  */
1769
1770   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1771
1772   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1773   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1774     {
1775       if (dump_enabled_p ())
1776         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1777                          "not vectorized: loop nest containing two "
1778                          "or more consecutive inner loops cannot be "
1779                          "vectorized\n");
1780       return false;
1781     }
1782
1783   for (unsigned i = 0; i < loop->num_nodes; i++)
1784     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1785          !gsi_end_p (gsi); gsi_next (&gsi))
1786       {
1787         gimple *stmt = gsi_stmt (gsi);
1788         if (is_gimple_debug (stmt))
1789           continue;
1790         ++n_stmts;
1791         if (!find_data_references_in_stmt (loop, stmt,
1792                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1793           {
1794             if (is_gimple_call (stmt) && loop->safelen)
1795               {
1796                 tree fndecl = gimple_call_fndecl (stmt), op;
1797                 if (fndecl != NULL_TREE)
1798                   {
1799                     cgraph_node *node = cgraph_node::get (fndecl);
1800                     if (node != NULL && node->simd_clones != NULL)
1801                       {
1802                         unsigned int j, n = gimple_call_num_args (stmt);
1803                         for (j = 0; j < n; j++)
1804                           {
1805                             op = gimple_call_arg (stmt, j);
1806                             if (DECL_P (op)
1807                                 || (REFERENCE_CLASS_P (op)
1808                                     && get_base_address (op)))
1809                               break;
1810                           }
1811                         op = gimple_call_lhs (stmt);
1812                         /* Ignore #pragma omp declare simd functions
1813                            if they don't have data references in the
1814                            call stmt itself.  */
1815                         if (j == n
1816                             && !(op
1817                                  && (DECL_P (op)
1818                                      || (REFERENCE_CLASS_P (op)
1819                                          && get_base_address (op)))))
1820                           continue;
1821                       }
1822                   }
1823               }
1824             if (dump_enabled_p ())
1825               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826                                "not vectorized: loop contains function "
1827                                "calls or data references that cannot "
1828                                "be analyzed\n");
1829             return false;
1830           }
1831       }
1832
1833   /* Analyze the data references and also adjust the minimal
1834      vectorization factor according to the loads and stores.  */
1835
1836   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1837   if (!ok)
1838     {
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841                          "bad data references.\n");
1842       return false;
1843     }
1844
1845   /* Classify all cross-iteration scalar data-flow cycles.
1846      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1847   vect_analyze_scalar_cycles (loop_vinfo);
1848
1849   vect_pattern_recog (loop_vinfo);
1850
1851   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1852
1853   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1854      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1855
1856   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1857   if (!ok)
1858     {
1859       if (dump_enabled_p ())
1860         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861                          "bad data access.\n");
1862       return false;
1863     }
1864
1865   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1866
1867   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1868   if (!ok)
1869     {
1870       if (dump_enabled_p ())
1871         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872                          "unexpected pattern.\n");
1873       return false;
1874     }
1875
1876   /* While the rest of the analysis below depends on it in some way.  */
1877   fatal = false;
1878
1879   /* Analyze data dependences between the data-refs in the loop
1880      and adjust the maximum vectorization factor according to
1881      the dependences.
1882      FORNOW: fail at the first data dependence that we encounter.  */
1883
1884   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1885   if (!ok
1886       || max_vf < min_vf)
1887     {
1888       if (dump_enabled_p ())
1889             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1890                              "bad data dependence.\n");
1891       return false;
1892     }
1893
1894   ok = vect_determine_vectorization_factor (loop_vinfo);
1895   if (!ok)
1896     {
1897       if (dump_enabled_p ())
1898         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899                          "can't determine vectorization factor.\n");
1900       return false;
1901     }
1902   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1903     {
1904       if (dump_enabled_p ())
1905         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906                          "bad data dependence.\n");
1907       return false;
1908     }
1909
1910   /* Compute the scalar iteration cost.  */
1911   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1912
1913   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1914   HOST_WIDE_INT estimated_niter;
1915   unsigned th;
1916   int min_scalar_loop_bound;
1917
1918   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1919   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1920   if (!ok)
1921     return false;
1922
1923   /* If there are any SLP instances mark them as pure_slp.  */
1924   bool slp = vect_make_slp_decision (loop_vinfo);
1925   if (slp)
1926     {
1927       /* Find stmts that need to be both vectorized and SLPed.  */
1928       vect_detect_hybrid_slp (loop_vinfo);
1929
1930       /* Update the vectorization factor based on the SLP decision.  */
1931       vect_update_vf_for_slp (loop_vinfo);
1932     }
1933
1934   /* This is the point where we can re-start analysis with SLP forced off.  */
1935 start_over:
1936
1937   /* Now the vectorization factor is final.  */
1938   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1939   gcc_assert (vectorization_factor != 0);
1940
1941   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1942     dump_printf_loc (MSG_NOTE, vect_location,
1943                      "vectorization_factor = %d, niters = "
1944                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1945                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1946
1947   HOST_WIDE_INT max_niter
1948     = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1949   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1950        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1951       || (max_niter != -1
1952           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1953     {
1954       if (dump_enabled_p ())
1955         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956                          "not vectorized: iteration count smaller than "
1957                          "vectorization factor.\n");
1958       return false;
1959     }
1960
1961   /* Analyze the alignment of the data-refs in the loop.
1962      Fail if a data reference is found that cannot be vectorized.  */
1963
1964   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1965   if (!ok)
1966     {
1967       if (dump_enabled_p ())
1968         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1969                          "bad data alignment.\n");
1970       return false;
1971     }
1972
1973   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1974      It is important to call pruning after vect_analyze_data_ref_accesses,
1975      since we use grouping information gathered by interleaving analysis.  */
1976   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1977   if (!ok)
1978     {
1979       if (dump_enabled_p ())
1980         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981                          "number of versioning for alias "
1982                          "run-time tests exceeds %d "
1983                          "(--param vect-max-version-for-alias-checks)\n",
1984                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1985       return false;
1986     }
1987
1988   /* This pass will decide on using loop versioning and/or loop peeling in
1989      order to enhance the alignment of data references in the loop.  */
1990   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1991   if (!ok)
1992     {
1993       if (dump_enabled_p ())
1994         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995                          "bad data alignment.\n");
1996       return false;
1997     }
1998
1999   if (slp)
2000     {
2001       /* Analyze operations in the SLP instances.  Note this may
2002          remove unsupported SLP instances which makes the above
2003          SLP kind detection invalid.  */
2004       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2005       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2006                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2007       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2008         goto again;
2009     }
2010
2011   /* Scan all the remaining operations in the loop that are not subject
2012      to SLP and make sure they are vectorizable.  */
2013   ok = vect_analyze_loop_operations (loop_vinfo);
2014   if (!ok)
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "bad operation or unsupported loop bound.\n");
2019       return false;
2020     }
2021
2022   /* Analyze cost.  Decide if worth while to vectorize.  */
2023   int min_profitable_estimate, min_profitable_iters;
2024   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2025                                       &min_profitable_estimate);
2026
2027   if (min_profitable_iters < 0)
2028     {
2029       if (dump_enabled_p ())
2030         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2031                          "not vectorized: vectorization not profitable.\n");
2032       if (dump_enabled_p ())
2033         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2034                          "not vectorized: vector version will never be "
2035                          "profitable.\n");
2036       goto again;
2037     }
2038
2039   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2040                             * vectorization_factor) - 1);
2041
2042   /* Use the cost model only if it is more conservative than user specified
2043      threshold.  */
2044   th = (unsigned) min_scalar_loop_bound;
2045   if (min_profitable_iters
2046       && (!min_scalar_loop_bound
2047           || min_profitable_iters > min_scalar_loop_bound))
2048     th = (unsigned) min_profitable_iters;
2049
2050   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2051
2052   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2053       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2054     {
2055       if (dump_enabled_p ())
2056         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2057                          "not vectorized: vectorization not profitable.\n");
2058       if (dump_enabled_p ())
2059         dump_printf_loc (MSG_NOTE, vect_location,
2060                          "not vectorized: iteration count smaller than user "
2061                          "specified loop bound parameter or minimum profitable "
2062                          "iterations (whichever is more conservative).\n");
2063       goto again;
2064     }
2065
2066   estimated_niter
2067     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2068   if (estimated_niter != -1
2069       && ((unsigned HOST_WIDE_INT) estimated_niter
2070           <= MAX (th, (unsigned)min_profitable_estimate)))
2071     {
2072       if (dump_enabled_p ())
2073         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2074                          "not vectorized: estimated iteration count too "
2075                          "small.\n");
2076       if (dump_enabled_p ())
2077         dump_printf_loc (MSG_NOTE, vect_location,
2078                          "not vectorized: estimated iteration count smaller "
2079                          "than specified loop bound parameter or minimum "
2080                          "profitable iterations (whichever is more "
2081                          "conservative).\n");
2082       goto again;
2083     }
2084
2085   /* Decide whether we need to create an epilogue loop to handle
2086      remaining scalar iterations.  */
2087   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2088         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2089        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2090
2091   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2092       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2093     {
2094       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2095                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2096           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2097         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2098     }
2099   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2100            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2101                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2102                /* In case of versioning, check if the maximum number of
2103                   iterations is greater than th.  If they are identical,
2104                   the epilogue is unnecessary.  */
2105                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2106                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2107                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2108     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2109
2110   /* If an epilogue loop is required make sure we can create one.  */
2111   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2112       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2113     {
2114       if (dump_enabled_p ())
2115         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2116       if (!vect_can_advance_ivs_p (loop_vinfo)
2117           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2118                                            single_exit (LOOP_VINFO_LOOP
2119                                                          (loop_vinfo))))
2120         {
2121           if (dump_enabled_p ())
2122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123                              "not vectorized: can't create required "
2124                              "epilog loop\n");
2125           goto again;
2126         }
2127     }
2128
2129   gcc_assert (vectorization_factor
2130               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2131
2132   /* Ok to vectorize!  */
2133   return true;
2134
2135 again:
2136   /* Try again with SLP forced off but if we didn't do any SLP there is
2137      no point in re-trying.  */
2138   if (!slp)
2139     return false;
2140
2141   /* If there are reduction chains re-trying will fail anyway.  */
2142   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2143     return false;
2144
2145   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2146      via interleaving or lane instructions.  */
2147   slp_instance instance;
2148   slp_tree node;
2149   unsigned i, j;
2150   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2151     {
2152       stmt_vec_info vinfo;
2153       vinfo = vinfo_for_stmt
2154           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2155       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2156         continue;
2157       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2158       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2159       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2160       if (! vect_store_lanes_supported (vectype, size)
2161           && ! vect_grouped_store_supported (vectype, size))
2162         return false;
2163       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2164         {
2165           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2166           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2167           size = STMT_VINFO_GROUP_SIZE (vinfo);
2168           vectype = STMT_VINFO_VECTYPE (vinfo);
2169           if (! vect_load_lanes_supported (vectype, size)
2170               && ! vect_grouped_load_supported (vectype, size))
2171             return false;
2172         }
2173     }
2174
2175   if (dump_enabled_p ())
2176     dump_printf_loc (MSG_NOTE, vect_location,
2177                      "re-trying with SLP disabled\n");
2178
2179   /* Roll back state appropriately.  No SLP this time.  */
2180   slp = false;
2181   /* Restore vectorization factor as it were without SLP.  */
2182   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2183   /* Free the SLP instances.  */
2184   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2185     vect_free_slp_instance (instance);
2186   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2187   /* Reset SLP type to loop_vect on all stmts.  */
2188   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2189     {
2190       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2191       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2192            !gsi_end_p (si); gsi_next (&si))
2193         {
2194           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2195           STMT_SLP_TYPE (stmt_info) = loop_vect;
2196           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2197             {
2198               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2199               STMT_SLP_TYPE (stmt_info) = loop_vect;
2200               for (gimple_stmt_iterator pi
2201                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2202                    !gsi_end_p (pi); gsi_next (&pi))
2203                 {
2204                   gimple *pstmt = gsi_stmt (pi);
2205                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2206                 }
2207             }
2208         }
2209     }
2210   /* Free optimized alias test DDRS.  */
2211   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2212   /* Reset target cost data.  */
2213   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2214   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2215     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2216   /* Reset assorted flags.  */
2217   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2218   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2219   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2220
2221   goto start_over;
2222 }
2223
2224 /* Function vect_analyze_loop.
2225
2226    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227    for it.  The different analyses will record information in the
2228    loop_vec_info struct.  */
2229 loop_vec_info
2230 vect_analyze_loop (struct loop *loop)
2231 {
2232   loop_vec_info loop_vinfo;
2233   unsigned int vector_sizes;
2234
2235   /* Autodetect first vector size we try.  */
2236   current_vector_size = 0;
2237   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2238
2239   if (dump_enabled_p ())
2240     dump_printf_loc (MSG_NOTE, vect_location,
2241                      "===== analyze_loop_nest =====\n");
2242
2243   if (loop_outer (loop)
2244       && loop_vec_info_for_loop (loop_outer (loop))
2245       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2246     {
2247       if (dump_enabled_p ())
2248         dump_printf_loc (MSG_NOTE, vect_location,
2249                          "outer-loop already vectorized.\n");
2250       return NULL;
2251     }
2252
2253   while (1)
2254     {
2255       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2256       loop_vinfo = vect_analyze_loop_form (loop);
2257       if (!loop_vinfo)
2258         {
2259           if (dump_enabled_p ())
2260             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261                              "bad loop form.\n");
2262           return NULL;
2263         }
2264
2265       bool fatal = false;
2266       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2267         {
2268           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2269
2270           return loop_vinfo;
2271         }
2272
2273       destroy_loop_vec_info (loop_vinfo, true);
2274
2275       vector_sizes &= ~current_vector_size;
2276       if (fatal
2277           || vector_sizes == 0
2278           || current_vector_size == 0)
2279         return NULL;
2280
2281       /* Try the next biggest vector size.  */
2282       current_vector_size = 1 << floor_log2 (vector_sizes);
2283       if (dump_enabled_p ())
2284         dump_printf_loc (MSG_NOTE, vect_location,
2285                          "***** Re-trying analysis with "
2286                          "vector size %d\n", current_vector_size);
2287     }
2288 }
2289
2290
2291 /* Function reduction_code_for_scalar_code
2292
2293    Input:
2294    CODE - tree_code of a reduction operations.
2295
2296    Output:
2297    REDUC_CODE - the corresponding tree-code to be used to reduce the
2298       vector of partial results into a single scalar result, or ERROR_MARK
2299       if the operation is a supported reduction operation, but does not have
2300       such a tree-code.
2301
2302    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2303
2304 static bool
2305 reduction_code_for_scalar_code (enum tree_code code,
2306                                 enum tree_code *reduc_code)
2307 {
2308   switch (code)
2309     {
2310       case MAX_EXPR:
2311         *reduc_code = REDUC_MAX_EXPR;
2312         return true;
2313
2314       case MIN_EXPR:
2315         *reduc_code = REDUC_MIN_EXPR;
2316         return true;
2317
2318       case PLUS_EXPR:
2319         *reduc_code = REDUC_PLUS_EXPR;
2320         return true;
2321
2322       case MULT_EXPR:
2323       case MINUS_EXPR:
2324       case BIT_IOR_EXPR:
2325       case BIT_XOR_EXPR:
2326       case BIT_AND_EXPR:
2327         *reduc_code = ERROR_MARK;
2328         return true;
2329
2330       default:
2331        return false;
2332     }
2333 }
2334
2335
2336 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2337    STMT is printed with a message MSG. */
2338
2339 static void
2340 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2341 {
2342   dump_printf_loc (msg_type, vect_location, "%s", msg);
2343   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2344   dump_printf (msg_type, "\n");
2345 }
2346
2347
2348 /* Detect SLP reduction of the form:
2349
2350    #a1 = phi <a5, a0>
2351    a2 = operation (a1)
2352    a3 = operation (a2)
2353    a4 = operation (a3)
2354    a5 = operation (a4)
2355
2356    #a = phi <a5>
2357
2358    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2359    FIRST_STMT is the first reduction stmt in the chain
2360    (a2 = operation (a1)).
2361
2362    Return TRUE if a reduction chain was detected.  */
2363
2364 static bool
2365 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2366                        gimple *first_stmt)
2367 {
2368   struct loop *loop = (gimple_bb (phi))->loop_father;
2369   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2370   enum tree_code code;
2371   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2372   stmt_vec_info use_stmt_info, current_stmt_info;
2373   tree lhs;
2374   imm_use_iterator imm_iter;
2375   use_operand_p use_p;
2376   int nloop_uses, size = 0, n_out_of_loop_uses;
2377   bool found = false;
2378
2379   if (loop != vect_loop)
2380     return false;
2381
2382   lhs = PHI_RESULT (phi);
2383   code = gimple_assign_rhs_code (first_stmt);
2384   while (1)
2385     {
2386       nloop_uses = 0;
2387       n_out_of_loop_uses = 0;
2388       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2389         {
2390           gimple *use_stmt = USE_STMT (use_p);
2391           if (is_gimple_debug (use_stmt))
2392             continue;
2393
2394           /* Check if we got back to the reduction phi.  */
2395           if (use_stmt == phi)
2396             {
2397               loop_use_stmt = use_stmt;
2398               found = true;
2399               break;
2400             }
2401
2402           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2403             {
2404               loop_use_stmt = use_stmt;
2405               nloop_uses++;
2406             }
2407            else
2408              n_out_of_loop_uses++;
2409
2410            /* There are can be either a single use in the loop or two uses in
2411               phi nodes.  */
2412            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2413              return false;
2414         }
2415
2416       if (found)
2417         break;
2418
2419       /* We reached a statement with no loop uses.  */
2420       if (nloop_uses == 0)
2421         return false;
2422
2423       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2424       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2425         return false;
2426
2427       if (!is_gimple_assign (loop_use_stmt)
2428           || code != gimple_assign_rhs_code (loop_use_stmt)
2429           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2430         return false;
2431
2432       /* Insert USE_STMT into reduction chain.  */
2433       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2434       if (current_stmt)
2435         {
2436           current_stmt_info = vinfo_for_stmt (current_stmt);
2437           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2438           GROUP_FIRST_ELEMENT (use_stmt_info)
2439             = GROUP_FIRST_ELEMENT (current_stmt_info);
2440         }
2441       else
2442         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2443
2444       lhs = gimple_assign_lhs (loop_use_stmt);
2445       current_stmt = loop_use_stmt;
2446       size++;
2447    }
2448
2449   if (!found || loop_use_stmt != phi || size < 2)
2450     return false;
2451
2452   /* Swap the operands, if needed, to make the reduction operand be the second
2453      operand.  */
2454   lhs = PHI_RESULT (phi);
2455   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2456   while (next_stmt)
2457     {
2458       if (gimple_assign_rhs2 (next_stmt) == lhs)
2459         {
2460           tree op = gimple_assign_rhs1 (next_stmt);
2461           gimple *def_stmt = NULL;
2462
2463           if (TREE_CODE (op) == SSA_NAME)
2464             def_stmt = SSA_NAME_DEF_STMT (op);
2465
2466           /* Check that the other def is either defined in the loop
2467              ("vect_internal_def"), or it's an induction (defined by a
2468              loop-header phi-node).  */
2469           if (def_stmt
2470               && gimple_bb (def_stmt)
2471               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2472               && (is_gimple_assign (def_stmt)
2473                   || is_gimple_call (def_stmt)
2474                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2475                            == vect_induction_def
2476                   || (gimple_code (def_stmt) == GIMPLE_PHI
2477                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2478                                   == vect_internal_def
2479                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2480             {
2481               lhs = gimple_assign_lhs (next_stmt);
2482               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2483               continue;
2484             }
2485
2486           return false;
2487         }
2488       else
2489         {
2490           tree op = gimple_assign_rhs2 (next_stmt);
2491           gimple *def_stmt = NULL;
2492
2493           if (TREE_CODE (op) == SSA_NAME)
2494             def_stmt = SSA_NAME_DEF_STMT (op);
2495
2496           /* Check that the other def is either defined in the loop
2497             ("vect_internal_def"), or it's an induction (defined by a
2498             loop-header phi-node).  */
2499           if (def_stmt
2500               && gimple_bb (def_stmt)
2501               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2502               && (is_gimple_assign (def_stmt)
2503                   || is_gimple_call (def_stmt)
2504                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2505                               == vect_induction_def
2506                   || (gimple_code (def_stmt) == GIMPLE_PHI
2507                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2508                                   == vect_internal_def
2509                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2510             {
2511               if (dump_enabled_p ())
2512                 {
2513                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2514                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2515                   dump_printf (MSG_NOTE, "\n");
2516                 }
2517
2518               swap_ssa_operands (next_stmt,
2519                                  gimple_assign_rhs1_ptr (next_stmt),
2520                                  gimple_assign_rhs2_ptr (next_stmt));
2521               update_stmt (next_stmt);
2522
2523               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2524                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2525             }
2526           else
2527             return false;
2528         }
2529
2530       lhs = gimple_assign_lhs (next_stmt);
2531       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2532     }
2533
2534   /* Save the chain for further analysis in SLP detection.  */
2535   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2536   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2537   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2538
2539   return true;
2540 }
2541
2542
2543 /* Function vect_is_simple_reduction_1
2544
2545    (1) Detect a cross-iteration def-use cycle that represents a simple
2546    reduction computation.  We look for the following pattern:
2547
2548    loop_header:
2549      a1 = phi < a0, a2 >
2550      a3 = ...
2551      a2 = operation (a3, a1)
2552
2553    or
2554
2555    a3 = ...
2556    loop_header:
2557      a1 = phi < a0, a2 >
2558      a2 = operation (a3, a1)
2559
2560    such that:
2561    1. operation is commutative and associative and it is safe to
2562       change the order of the computation (if CHECK_REDUCTION is true)
2563    2. no uses for a2 in the loop (a2 is used out of the loop)
2564    3. no uses of a1 in the loop besides the reduction operation
2565    4. no uses of a1 outside the loop.
2566
2567    Conditions 1,4 are tested here.
2568    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2569
2570    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2571    nested cycles, if CHECK_REDUCTION is false.
2572
2573    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2574    reductions:
2575
2576      a1 = phi < a0, a2 >
2577      inner loop (def of a3)
2578      a2 = phi < a3 >
2579
2580    (4) Detect condition expressions, ie:
2581      for (int i = 0; i < N; i++)
2582        if (a[i] < val)
2583         ret_val = a[i];
2584
2585 */
2586
2587 static gimple *
2588 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2589                           bool check_reduction, bool *double_reduc,
2590                           bool need_wrapping_integral_overflow,
2591                           enum vect_reduction_type *v_reduc_type)
2592 {
2593   struct loop *loop = (gimple_bb (phi))->loop_father;
2594   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2595   edge latch_e = loop_latch_edge (loop);
2596   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2597   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2598   enum tree_code orig_code, code;
2599   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2600   tree type;
2601   int nloop_uses;
2602   tree name;
2603   imm_use_iterator imm_iter;
2604   use_operand_p use_p;
2605   bool phi_def;
2606
2607   *double_reduc = false;
2608   *v_reduc_type = TREE_CODE_REDUCTION;
2609
2610   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2611      otherwise, we assume outer loop vectorization.  */
2612   gcc_assert ((check_reduction && loop == vect_loop)
2613               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2614
2615   name = PHI_RESULT (phi);
2616   /* ???  If there are no uses of the PHI result the inner loop reduction
2617      won't be detected as possibly double-reduction by vectorizable_reduction
2618      because that tries to walk the PHI arg from the preheader edge which
2619      can be constant.  See PR60382.  */
2620   if (has_zero_uses (name))
2621     return NULL;
2622   nloop_uses = 0;
2623   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2624     {
2625       gimple *use_stmt = USE_STMT (use_p);
2626       if (is_gimple_debug (use_stmt))
2627         continue;
2628
2629       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2630         {
2631           if (dump_enabled_p ())
2632             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2633                              "intermediate value used outside loop.\n");
2634
2635           return NULL;
2636         }
2637
2638       nloop_uses++;
2639       if (nloop_uses > 1)
2640         {
2641           if (dump_enabled_p ())
2642             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2643                              "reduction used in loop.\n");
2644           return NULL;
2645         }
2646
2647       phi_use_stmt = use_stmt;
2648     }
2649
2650   if (TREE_CODE (loop_arg) != SSA_NAME)
2651     {
2652       if (dump_enabled_p ())
2653         {
2654           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2655                            "reduction: not ssa_name: ");
2656           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2657           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2658         }
2659       return NULL;
2660     }
2661
2662   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2663   if (!def_stmt)
2664     {
2665       if (dump_enabled_p ())
2666         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2667                          "reduction: no def_stmt.\n");
2668       return NULL;
2669     }
2670
2671   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2672     {
2673       if (dump_enabled_p ())
2674         {
2675           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2676           dump_printf (MSG_NOTE, "\n");
2677         }
2678       return NULL;
2679     }
2680
2681   if (is_gimple_assign (def_stmt))
2682     {
2683       name = gimple_assign_lhs (def_stmt);
2684       phi_def = false;
2685     }
2686   else
2687     {
2688       name = PHI_RESULT (def_stmt);
2689       phi_def = true;
2690     }
2691
2692   nloop_uses = 0;
2693   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2694     {
2695       gimple *use_stmt = USE_STMT (use_p);
2696       if (is_gimple_debug (use_stmt))
2697         continue;
2698       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2699         nloop_uses++;
2700       if (nloop_uses > 1)
2701         {
2702           if (dump_enabled_p ())
2703             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2704                              "reduction used in loop.\n");
2705           return NULL;
2706         }
2707     }
2708
2709   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2710      defined in the inner loop.  */
2711   if (phi_def)
2712     {
2713       op1 = PHI_ARG_DEF (def_stmt, 0);
2714
2715       if (gimple_phi_num_args (def_stmt) != 1
2716           || TREE_CODE (op1) != SSA_NAME)
2717         {
2718           if (dump_enabled_p ())
2719             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2720                              "unsupported phi node definition.\n");
2721
2722           return NULL;
2723         }
2724
2725       def1 = SSA_NAME_DEF_STMT (op1);
2726       if (gimple_bb (def1)
2727           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2728           && loop->inner
2729           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2730           && is_gimple_assign (def1)
2731           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2732         {
2733           if (dump_enabled_p ())
2734             report_vect_op (MSG_NOTE, def_stmt,
2735                             "detected double reduction: ");
2736
2737           *double_reduc = true;
2738           return def_stmt;
2739         }
2740
2741       return NULL;
2742     }
2743
2744   code = orig_code = gimple_assign_rhs_code (def_stmt);
2745
2746   /* We can handle "res -= x[i]", which is non-associative by
2747      simply rewriting this into "res += -x[i]".  Avoid changing
2748      gimple instruction for the first simple tests and only do this
2749      if we're allowed to change code at all.  */
2750   if (code == MINUS_EXPR
2751       && (op1 = gimple_assign_rhs1 (def_stmt))
2752       && TREE_CODE (op1) == SSA_NAME
2753       && SSA_NAME_DEF_STMT (op1) == phi)
2754     code = PLUS_EXPR;
2755
2756   if (code == COND_EXPR)
2757     {
2758       if (check_reduction)
2759         *v_reduc_type = COND_REDUCTION;
2760     }
2761   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2762     {
2763       if (dump_enabled_p ())
2764         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2765                         "reduction: not commutative/associative: ");
2766       return NULL;
2767     }
2768
2769   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2770     {
2771       if (code != COND_EXPR)
2772         {
2773           if (dump_enabled_p ())
2774             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2775                             "reduction: not binary operation: ");
2776
2777           return NULL;
2778         }
2779
2780       op3 = gimple_assign_rhs1 (def_stmt);
2781       if (COMPARISON_CLASS_P (op3))
2782         {
2783           op4 = TREE_OPERAND (op3, 1);
2784           op3 = TREE_OPERAND (op3, 0);
2785         }
2786
2787       op1 = gimple_assign_rhs2 (def_stmt);
2788       op2 = gimple_assign_rhs3 (def_stmt);
2789
2790       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2791         {
2792           if (dump_enabled_p ())
2793             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2794                             "reduction: uses not ssa_names: ");
2795
2796           return NULL;
2797         }
2798     }
2799   else
2800     {
2801       op1 = gimple_assign_rhs1 (def_stmt);
2802       op2 = gimple_assign_rhs2 (def_stmt);
2803
2804       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2805         {
2806           if (dump_enabled_p ())
2807             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2808                             "reduction: uses not ssa_names: ");
2809
2810           return NULL;
2811         }
2812    }
2813
2814   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2815   if ((TREE_CODE (op1) == SSA_NAME
2816        && !types_compatible_p (type,TREE_TYPE (op1)))
2817       || (TREE_CODE (op2) == SSA_NAME
2818           && !types_compatible_p (type, TREE_TYPE (op2)))
2819       || (op3 && TREE_CODE (op3) == SSA_NAME
2820           && !types_compatible_p (type, TREE_TYPE (op3)))
2821       || (op4 && TREE_CODE (op4) == SSA_NAME
2822           && !types_compatible_p (type, TREE_TYPE (op4))))
2823     {
2824       if (dump_enabled_p ())
2825         {
2826           dump_printf_loc (MSG_NOTE, vect_location,
2827                            "reduction: multiple types: operation type: ");
2828           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2829           dump_printf (MSG_NOTE, ", operands types: ");
2830           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2831                              TREE_TYPE (op1));
2832           dump_printf (MSG_NOTE, ",");
2833           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2834                              TREE_TYPE (op2));
2835           if (op3)
2836             {
2837               dump_printf (MSG_NOTE, ",");
2838               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2839                                  TREE_TYPE (op3));
2840             }
2841
2842           if (op4)
2843             {
2844               dump_printf (MSG_NOTE, ",");
2845               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2846                                  TREE_TYPE (op4));
2847             }
2848           dump_printf (MSG_NOTE, "\n");
2849         }
2850
2851       return NULL;
2852     }
2853
2854   /* Check that it's ok to change the order of the computation.
2855      Generally, when vectorizing a reduction we change the order of the
2856      computation.  This may change the behavior of the program in some
2857      cases, so we need to check that this is ok.  One exception is when
2858      vectorizing an outer-loop: the inner-loop is executed sequentially,
2859      and therefore vectorizing reductions in the inner-loop during
2860      outer-loop vectorization is safe.  */
2861
2862   if (*v_reduc_type != COND_REDUCTION
2863       && check_reduction)
2864     {
2865       /* CHECKME: check for !flag_finite_math_only too?  */
2866       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2867         {
2868           /* Changing the order of operations changes the semantics.  */
2869           if (dump_enabled_p ())
2870             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2871                         "reduction: unsafe fp math optimization: ");
2872           return NULL;
2873         }
2874       else if (INTEGRAL_TYPE_P (type))
2875         {
2876           if (!operation_no_trapping_overflow (type, code))
2877             {
2878               /* Changing the order of operations changes the semantics.  */
2879               if (dump_enabled_p ())
2880                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881                                 "reduction: unsafe int math optimization"
2882                                 " (overflow traps): ");
2883               return NULL;
2884             }
2885           if (need_wrapping_integral_overflow
2886               && !TYPE_OVERFLOW_WRAPS (type)
2887               && operation_can_overflow (code))
2888             {
2889               /* Changing the order of operations changes the semantics.  */
2890               if (dump_enabled_p ())
2891                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2892                                 "reduction: unsafe int math optimization"
2893                                 " (overflow doesn't wrap): ");
2894               return NULL;
2895             }
2896         }
2897       else if (SAT_FIXED_POINT_TYPE_P (type))
2898         {
2899           /* Changing the order of operations changes the semantics.  */
2900           if (dump_enabled_p ())
2901           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2902                           "reduction: unsafe fixed-point math optimization: ");
2903           return NULL;
2904         }
2905     }
2906
2907   /* Reduction is safe. We're dealing with one of the following:
2908      1) integer arithmetic and no trapv
2909      2) floating point arithmetic, and special flags permit this optimization
2910      3) nested cycle (i.e., outer loop vectorization).  */
2911   if (TREE_CODE (op1) == SSA_NAME)
2912     def1 = SSA_NAME_DEF_STMT (op1);
2913
2914   if (TREE_CODE (op2) == SSA_NAME)
2915     def2 = SSA_NAME_DEF_STMT (op2);
2916
2917   if (code != COND_EXPR
2918       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2919     {
2920       if (dump_enabled_p ())
2921         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2922       return NULL;
2923     }
2924
2925   /* Check that one def is the reduction def, defined by PHI,
2926      the other def is either defined in the loop ("vect_internal_def"),
2927      or it's an induction (defined by a loop-header phi-node).  */
2928
2929   if (def2 && def2 == phi
2930       && (code == COND_EXPR
2931           || !def1 || gimple_nop_p (def1)
2932           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2933           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2934               && (is_gimple_assign (def1)
2935                   || is_gimple_call (def1)
2936                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2937                       == vect_induction_def
2938                   || (gimple_code (def1) == GIMPLE_PHI
2939                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2940                           == vect_internal_def
2941                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2942     {
2943       if (dump_enabled_p ())
2944         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2945       return def_stmt;
2946     }
2947
2948   if (def1 && def1 == phi
2949       && (code == COND_EXPR
2950           || !def2 || gimple_nop_p (def2)
2951           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2952           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2953               && (is_gimple_assign (def2)
2954                   || is_gimple_call (def2)
2955                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2956                       == vect_induction_def
2957                   || (gimple_code (def2) == GIMPLE_PHI
2958                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2959                           == vect_internal_def
2960                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2961     {
2962       if (check_reduction
2963           && orig_code != MINUS_EXPR)
2964         {
2965           if (code == COND_EXPR)
2966             {
2967               /* No current known use where this case would be useful.  */
2968               if (dump_enabled_p ())
2969                 report_vect_op (MSG_NOTE, def_stmt,
2970                                 "detected reduction: cannot currently swap "
2971                                 "operands for cond_expr");
2972               return NULL;
2973             }
2974
2975           /* Swap operands (just for simplicity - so that the rest of the code
2976              can assume that the reduction variable is always the last (second)
2977              argument).  */
2978           if (dump_enabled_p ())
2979             report_vect_op (MSG_NOTE, def_stmt,
2980                             "detected reduction: need to swap operands: ");
2981
2982           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2983                              gimple_assign_rhs2_ptr (def_stmt));
2984
2985           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2986             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2987         }
2988       else
2989         {
2990           if (dump_enabled_p ())
2991             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2992         }
2993
2994       return def_stmt;
2995     }
2996
2997   /* Try to find SLP reduction chain.  */
2998   if (check_reduction && code != COND_EXPR
2999       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3000     {
3001       if (dump_enabled_p ())
3002         report_vect_op (MSG_NOTE, def_stmt,
3003                         "reduction: detected reduction chain: ");
3004
3005       return def_stmt;
3006     }
3007
3008   if (dump_enabled_p ())
3009     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3010                     "reduction: unknown pattern: ");
3011
3012   return NULL;
3013 }
3014
3015 /* Wrapper around vect_is_simple_reduction_1, which will modify code
3016    in-place if it enables detection of more reductions.  Arguments
3017    as there.  */
3018
3019 gimple *
3020 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3021                              bool check_reduction, bool *double_reduc,
3022                              bool need_wrapping_integral_overflow)
3023 {
3024   enum vect_reduction_type v_reduc_type;
3025   return vect_is_simple_reduction (loop_info, phi, check_reduction,
3026                                    double_reduc,
3027                                    need_wrapping_integral_overflow,
3028                                    &v_reduc_type);
3029 }
3030
3031 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3032 int
3033 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3034                              int *peel_iters_epilogue,
3035                              stmt_vector_for_cost *scalar_cost_vec,
3036                              stmt_vector_for_cost *prologue_cost_vec,
3037                              stmt_vector_for_cost *epilogue_cost_vec)
3038 {
3039   int retval = 0;
3040   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3041
3042   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3043     {
3044       *peel_iters_epilogue = vf/2;
3045       if (dump_enabled_p ())
3046         dump_printf_loc (MSG_NOTE, vect_location,
3047                          "cost model: epilogue peel iters set to vf/2 "
3048                          "because loop iterations are unknown .\n");
3049
3050       /* If peeled iterations are known but number of scalar loop
3051          iterations are unknown, count a taken branch per peeled loop.  */
3052       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3053                                  NULL, 0, vect_prologue);
3054       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3055                                  NULL, 0, vect_epilogue);
3056     }
3057   else
3058     {
3059       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3060       peel_iters_prologue = niters < peel_iters_prologue ?
3061                             niters : peel_iters_prologue;
3062       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3063       /* If we need to peel for gaps, but no peeling is required, we have to
3064          peel VF iterations.  */
3065       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3066         *peel_iters_epilogue = vf;
3067     }
3068
3069   stmt_info_for_cost *si;
3070   int j;
3071   if (peel_iters_prologue)
3072     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3073       retval += record_stmt_cost (prologue_cost_vec,
3074                                   si->count * peel_iters_prologue,
3075                                   si->kind, NULL, si->misalign,
3076                                   vect_prologue);
3077   if (*peel_iters_epilogue)
3078     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3079       retval += record_stmt_cost (epilogue_cost_vec,
3080                                   si->count * *peel_iters_epilogue,
3081                                   si->kind, NULL, si->misalign,
3082                                   vect_epilogue);
3083
3084   return retval;
3085 }
3086
3087 /* Function vect_estimate_min_profitable_iters
3088
3089    Return the number of iterations required for the vector version of the
3090    loop to be profitable relative to the cost of the scalar version of the
3091    loop.  */
3092
3093 static void
3094 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3095                                     int *ret_min_profitable_niters,
3096                                     int *ret_min_profitable_estimate)
3097 {
3098   int min_profitable_iters;
3099   int min_profitable_estimate;
3100   int peel_iters_prologue;
3101   int peel_iters_epilogue;
3102   unsigned vec_inside_cost = 0;
3103   int vec_outside_cost = 0;
3104   unsigned vec_prologue_cost = 0;
3105   unsigned vec_epilogue_cost = 0;
3106   int scalar_single_iter_cost = 0;
3107   int scalar_outside_cost = 0;
3108   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3109   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3110   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3111
3112   /* Cost model disabled.  */
3113   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3114     {
3115       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3116       *ret_min_profitable_niters = 0;
3117       *ret_min_profitable_estimate = 0;
3118       return;
3119     }
3120
3121   /* Requires loop versioning tests to handle misalignment.  */
3122   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3123     {
3124       /*  FIXME: Make cost depend on complexity of individual check.  */
3125       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3126       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3127                             vect_prologue);
3128       dump_printf (MSG_NOTE,
3129                    "cost model: Adding cost of checks for loop "
3130                    "versioning to treat misalignment.\n");
3131     }
3132
3133   /* Requires loop versioning with alias checks.  */
3134   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3135     {
3136       /*  FIXME: Make cost depend on complexity of individual check.  */
3137       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3138       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3139                             vect_prologue);
3140       dump_printf (MSG_NOTE,
3141                    "cost model: Adding cost of checks for loop "
3142                    "versioning aliasing.\n");
3143     }
3144
3145   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3146       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3147     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3148                           vect_prologue);
3149
3150   /* Count statements in scalar loop.  Using this as scalar cost for a single
3151      iteration for now.
3152
3153      TODO: Add outer loop support.
3154
3155      TODO: Consider assigning different costs to different scalar
3156      statements.  */
3157
3158   scalar_single_iter_cost
3159     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3160
3161   /* Add additional cost for the peeled instructions in prologue and epilogue
3162      loop.
3163
3164      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3165      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3166
3167      TODO: Build an expression that represents peel_iters for prologue and
3168      epilogue to be used in a run-time test.  */
3169
3170   if (npeel  < 0)
3171     {
3172       peel_iters_prologue = vf/2;
3173       dump_printf (MSG_NOTE, "cost model: "
3174                    "prologue peel iters set to vf/2.\n");
3175
3176       /* If peeling for alignment is unknown, loop bound of main loop becomes
3177          unknown.  */
3178       peel_iters_epilogue = vf/2;
3179       dump_printf (MSG_NOTE, "cost model: "
3180                    "epilogue peel iters set to vf/2 because "
3181                    "peeling for alignment is unknown.\n");
3182
3183       /* If peeled iterations are unknown, count a taken branch and a not taken
3184          branch per peeled loop. Even if scalar loop iterations are known,
3185          vector iterations are not known since peeled prologue iterations are
3186          not known. Hence guards remain the same.  */
3187       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3188                             NULL, 0, vect_prologue);
3189       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3190                             NULL, 0, vect_prologue);
3191       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3192                             NULL, 0, vect_epilogue);
3193       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3194                             NULL, 0, vect_epilogue);
3195       stmt_info_for_cost *si;
3196       int j;
3197       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3198         {
3199           struct _stmt_vec_info *stmt_info
3200             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3201           (void) add_stmt_cost (target_cost_data,
3202                                 si->count * peel_iters_prologue,
3203                                 si->kind, stmt_info, si->misalign,
3204                                 vect_prologue);
3205           (void) add_stmt_cost (target_cost_data,
3206                                 si->count * peel_iters_epilogue,
3207                                 si->kind, stmt_info, si->misalign,
3208                                 vect_epilogue);
3209         }
3210     }
3211   else
3212     {
3213       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3214       stmt_info_for_cost *si;
3215       int j;
3216       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3217
3218       prologue_cost_vec.create (2);
3219       epilogue_cost_vec.create (2);
3220       peel_iters_prologue = npeel;
3221
3222       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3223                                           &peel_iters_epilogue,
3224                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3225                                             (loop_vinfo),
3226                                           &prologue_cost_vec,
3227                                           &epilogue_cost_vec);
3228
3229       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3230         {
3231           struct _stmt_vec_info *stmt_info
3232             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3233           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3234                                 si->misalign, vect_prologue);
3235         }
3236
3237       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3238         {
3239           struct _stmt_vec_info *stmt_info
3240             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3241           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3242                                 si->misalign, vect_epilogue);
3243         }
3244
3245       prologue_cost_vec.release ();
3246       epilogue_cost_vec.release ();
3247     }
3248
3249   /* FORNOW: The scalar outside cost is incremented in one of the
3250      following ways:
3251
3252      1. The vectorizer checks for alignment and aliasing and generates
3253      a condition that allows dynamic vectorization.  A cost model
3254      check is ANDED with the versioning condition.  Hence scalar code
3255      path now has the added cost of the versioning check.
3256
3257        if (cost > th & versioning_check)
3258          jmp to vector code
3259
3260      Hence run-time scalar is incremented by not-taken branch cost.
3261
3262      2. The vectorizer then checks if a prologue is required.  If the
3263      cost model check was not done before during versioning, it has to
3264      be done before the prologue check.
3265
3266        if (cost <= th)
3267          prologue = scalar_iters
3268        if (prologue == 0)
3269          jmp to vector code
3270        else
3271          execute prologue
3272        if (prologue == num_iters)
3273          go to exit
3274
3275      Hence the run-time scalar cost is incremented by a taken branch,
3276      plus a not-taken branch, plus a taken branch cost.
3277
3278      3. The vectorizer then checks if an epilogue is required.  If the
3279      cost model check was not done before during prologue check, it
3280      has to be done with the epilogue check.
3281
3282        if (prologue == 0)
3283          jmp to vector code
3284        else
3285          execute prologue
3286        if (prologue == num_iters)
3287          go to exit
3288        vector code:
3289          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3290            jmp to epilogue
3291
3292      Hence the run-time scalar cost should be incremented by 2 taken
3293      branches.
3294
3295      TODO: The back end may reorder the BBS's differently and reverse
3296      conditions/branch directions.  Change the estimates below to
3297      something more reasonable.  */
3298
3299   /* If the number of iterations is known and we do not do versioning, we can
3300      decide whether to vectorize at compile time.  Hence the scalar version
3301      do not carry cost model guard costs.  */
3302   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3303       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3304       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3305     {
3306       /* Cost model check occurs at versioning.  */
3307       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3308           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3309         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3310       else
3311         {
3312           /* Cost model check occurs at prologue generation.  */
3313           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3314             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3315               + vect_get_stmt_cost (cond_branch_not_taken);
3316           /* Cost model check occurs at epilogue generation.  */
3317           else
3318             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3319         }
3320     }
3321
3322   /* Complete the target-specific cost calculations.  */
3323   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3324                &vec_inside_cost, &vec_epilogue_cost);
3325
3326   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3327
3328   if (dump_enabled_p ())
3329     {
3330       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3331       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3332                    vec_inside_cost);
3333       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3334                    vec_prologue_cost);
3335       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3336                    vec_epilogue_cost);
3337       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3338                    scalar_single_iter_cost);
3339       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3340                    scalar_outside_cost);
3341       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3342                    vec_outside_cost);
3343       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3344                    peel_iters_prologue);
3345       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3346                    peel_iters_epilogue);
3347     }
3348
3349   /* Calculate number of iterations required to make the vector version
3350      profitable, relative to the loop bodies only.  The following condition
3351      must hold true:
3352      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3353      where
3354      SIC = scalar iteration cost, VIC = vector iteration cost,
3355      VOC = vector outside cost, VF = vectorization factor,
3356      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3357      SOC = scalar outside cost for run time cost model check.  */
3358
3359   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3360     {
3361       if (vec_outside_cost <= 0)
3362         min_profitable_iters = 1;
3363       else
3364         {
3365           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3366                                   - vec_inside_cost * peel_iters_prologue
3367                                   - vec_inside_cost * peel_iters_epilogue)
3368                                  / ((scalar_single_iter_cost * vf)
3369                                     - vec_inside_cost);
3370
3371           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3372               <= (((int) vec_inside_cost * min_profitable_iters)
3373                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3374             min_profitable_iters++;
3375         }
3376     }
3377   /* vector version will never be profitable.  */
3378   else
3379     {
3380       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3381         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3382                     "did not happen for a simd loop");
3383
3384       if (dump_enabled_p ())
3385         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3386                          "cost model: the vector iteration cost = %d "
3387                          "divided by the scalar iteration cost = %d "
3388                          "is greater or equal to the vectorization factor = %d"
3389                          ".\n",
3390                          vec_inside_cost, scalar_single_iter_cost, vf);
3391       *ret_min_profitable_niters = -1;
3392       *ret_min_profitable_estimate = -1;
3393       return;
3394     }
3395
3396   dump_printf (MSG_NOTE,
3397                "  Calculated minimum iters for profitability: %d\n",
3398                min_profitable_iters);
3399
3400   min_profitable_iters =
3401         min_profitable_iters < vf ? vf : min_profitable_iters;
3402
3403   /* Because the condition we create is:
3404      if (niters <= min_profitable_iters)
3405        then skip the vectorized loop.  */
3406   min_profitable_iters--;
3407
3408   if (dump_enabled_p ())
3409     dump_printf_loc (MSG_NOTE, vect_location,
3410                      "  Runtime profitability threshold = %d\n",
3411                      min_profitable_iters);
3412
3413   *ret_min_profitable_niters = min_profitable_iters;
3414
3415   /* Calculate number of iterations required to make the vector version
3416      profitable, relative to the loop bodies only.
3417
3418      Non-vectorized variant is SIC * niters and it must win over vector
3419      variant on the expected loop trip count.  The following condition must hold true:
3420      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3421
3422   if (vec_outside_cost <= 0)
3423     min_profitable_estimate = 1;
3424   else
3425     {
3426       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3427                                  - vec_inside_cost * peel_iters_prologue
3428                                  - vec_inside_cost * peel_iters_epilogue)
3429                                  / ((scalar_single_iter_cost * vf)
3430                                    - vec_inside_cost);
3431     }
3432   min_profitable_estimate --;
3433   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3434   if (dump_enabled_p ())
3435     dump_printf_loc (MSG_NOTE, vect_location,
3436                      "  Static estimate profitability threshold = %d\n",
3437                      min_profitable_estimate);
3438
3439   *ret_min_profitable_estimate = min_profitable_estimate;
3440 }
3441
3442 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3443    vector elements (not bits) for a vector of mode MODE.  */
3444 static void
3445 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3446                               unsigned char *sel)
3447 {
3448   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3449
3450   for (i = 0; i < nelt; i++)
3451     sel[i] = (i + offset) & (2*nelt - 1);
3452 }
3453
3454 /* Checks whether the target supports whole-vector shifts for vectors of mode
3455    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3456    it supports vec_perm_const with masks for all necessary shift amounts.  */
3457 static bool
3458 have_whole_vector_shift (enum machine_mode mode)
3459 {
3460   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3461     return true;
3462
3463   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3464     return false;
3465
3466   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3467   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3468
3469   for (i = nelt/2; i >= 1; i/=2)
3470     {
3471       calc_vec_perm_mask_for_shift (mode, i, sel);
3472       if (!can_vec_perm_p (mode, false, sel))
3473         return false;
3474     }
3475   return true;
3476 }
3477
3478 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3479
3480 static tree
3481 get_reduction_op (gimple *stmt, int reduc_index)
3482 {
3483   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3484     {
3485     case GIMPLE_SINGLE_RHS:
3486       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3487                   == ternary_op);
3488       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3489     case GIMPLE_UNARY_RHS:
3490       return gimple_assign_rhs1 (stmt);
3491     case GIMPLE_BINARY_RHS:
3492       return (reduc_index
3493               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3494     case GIMPLE_TERNARY_RHS:
3495       return gimple_op (stmt, reduc_index + 1);
3496     default:
3497       gcc_unreachable ();
3498     }
3499 }
3500
3501 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3502    functions. Design better to avoid maintenance issues.  */
3503
3504 /* Function vect_model_reduction_cost.
3505
3506    Models cost for a reduction operation, including the vector ops
3507    generated within the strip-mine loop, the initial definition before
3508    the loop, and the epilogue code that must be generated.  */
3509
3510 static bool
3511 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3512                            int ncopies, int reduc_index)
3513 {
3514   int prologue_cost = 0, epilogue_cost = 0;
3515   enum tree_code code;
3516   optab optab;
3517   tree vectype;
3518   gimple *stmt, *orig_stmt;
3519   tree reduction_op;
3520   machine_mode mode;
3521   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3522   struct loop *loop = NULL;
3523   void *target_cost_data;
3524
3525   if (loop_vinfo)
3526     {
3527       loop = LOOP_VINFO_LOOP (loop_vinfo);
3528       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3529     }
3530   else
3531     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3532
3533   /* Condition reductions generate two reductions in the loop.  */
3534   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3535     ncopies *= 2;
3536
3537   /* Cost of reduction op inside loop.  */
3538   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3539                                         stmt_info, 0, vect_body);
3540   stmt = STMT_VINFO_STMT (stmt_info);
3541
3542   reduction_op = get_reduction_op (stmt, reduc_index);
3543
3544   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3545   if (!vectype)
3546     {
3547       if (dump_enabled_p ())
3548         {
3549           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3550                            "unsupported data-type ");
3551           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3552                              TREE_TYPE (reduction_op));
3553           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3554         }
3555       return false;
3556    }
3557
3558   mode = TYPE_MODE (vectype);
3559   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3560
3561   if (!orig_stmt)
3562     orig_stmt = STMT_VINFO_STMT (stmt_info);
3563
3564   code = gimple_assign_rhs_code (orig_stmt);
3565
3566   /* Add in cost for initial definition.
3567      For cond reduction we have four vectors: initial index, step, initial
3568      result of the data reduction, initial value of the index reduction.  */
3569   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3570                        == COND_REDUCTION ? 4 : 1;
3571   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3572                                   scalar_to_vec, stmt_info, 0,
3573                                   vect_prologue);
3574
3575   /* Determine cost of epilogue code.
3576
3577      We have a reduction operator that will reduce the vector in one statement.
3578      Also requires scalar extract.  */
3579
3580   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3581     {
3582       if (reduc_code != ERROR_MARK)
3583         {
3584           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3585             {
3586               /* An EQ stmt and an COND_EXPR stmt.  */
3587               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3588                                               vector_stmt, stmt_info, 0,
3589                                               vect_epilogue);
3590               /* Reduction of the max index and a reduction of the found
3591                  values.  */
3592               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3593                                               vec_to_scalar, stmt_info, 0,
3594                                               vect_epilogue);
3595               /* A broadcast of the max value.  */
3596               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3597                                               scalar_to_vec, stmt_info, 0,
3598                                               vect_epilogue);
3599             }
3600           else
3601             {
3602               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3603                                               stmt_info, 0, vect_epilogue);
3604               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3605                                               vec_to_scalar, stmt_info, 0,
3606                                               vect_epilogue);
3607             }
3608         }
3609       else
3610         {
3611           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3612           tree bitsize =
3613             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3614           int element_bitsize = tree_to_uhwi (bitsize);
3615           int nelements = vec_size_in_bits / element_bitsize;
3616
3617           optab = optab_for_tree_code (code, vectype, optab_default);
3618
3619           /* We have a whole vector shift available.  */
3620           if (VECTOR_MODE_P (mode)
3621               && optab_handler (optab, mode) != CODE_FOR_nothing
3622               && have_whole_vector_shift (mode))
3623             {
3624               /* Final reduction via vector shifts and the reduction operator.
3625                  Also requires scalar extract.  */
3626               epilogue_cost += add_stmt_cost (target_cost_data,
3627                                               exact_log2 (nelements) * 2,
3628                                               vector_stmt, stmt_info, 0,
3629                                               vect_epilogue);
3630               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3631                                               vec_to_scalar, stmt_info, 0,
3632                                               vect_epilogue);
3633             }
3634           else
3635             /* Use extracts and reduction op for final reduction.  For N
3636                elements, we have N extracts and N-1 reduction ops.  */
3637             epilogue_cost += add_stmt_cost (target_cost_data,
3638                                             nelements + nelements - 1,
3639                                             vector_stmt, stmt_info, 0,
3640                                             vect_epilogue);
3641         }
3642     }
3643
3644   if (dump_enabled_p ())
3645     dump_printf (MSG_NOTE,
3646                  "vect_model_reduction_cost: inside_cost = %d, "
3647                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3648                  prologue_cost, epilogue_cost);
3649
3650   return true;
3651 }
3652
3653
3654 /* Function vect_model_induction_cost.
3655
3656    Models cost for induction operations.  */
3657
3658 static void
3659 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3660 {
3661   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3662   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3663   unsigned inside_cost, prologue_cost;
3664
3665   /* loop cost for vec_loop.  */
3666   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3667                                stmt_info, 0, vect_body);
3668
3669   /* prologue cost for vec_init and vec_step.  */
3670   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3671                                  stmt_info, 0, vect_prologue);
3672
3673   if (dump_enabled_p ())
3674     dump_printf_loc (MSG_NOTE, vect_location,
3675                      "vect_model_induction_cost: inside_cost = %d, "
3676                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3677 }
3678
3679
3680 /* Function get_initial_def_for_induction
3681
3682    Input:
3683    STMT - a stmt that performs an induction operation in the loop.
3684    IV_PHI - the initial value of the induction variable
3685
3686    Output:
3687    Return a vector variable, initialized with the first VF values of
3688    the induction variable.  E.g., for an iv with IV_PHI='X' and
3689    evolution S, for a vector of 4 units, we want to return:
3690    [X, X + S, X + 2*S, X + 3*S].  */
3691
3692 static tree
3693 get_initial_def_for_induction (gimple *iv_phi)
3694 {
3695   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3696   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3697   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3698   tree vectype;
3699   int nunits;
3700   edge pe = loop_preheader_edge (loop);
3701   struct loop *iv_loop;
3702   basic_block new_bb;
3703   tree new_vec, vec_init, vec_step, t;
3704   tree new_name;
3705   gimple *new_stmt;
3706   gphi *induction_phi;
3707   tree induc_def, vec_def, vec_dest;
3708   tree init_expr, step_expr;
3709   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3710   int i;
3711   int ncopies;
3712   tree expr;
3713   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3714   bool nested_in_vect_loop = false;
3715   gimple_seq stmts;
3716   imm_use_iterator imm_iter;
3717   use_operand_p use_p;
3718   gimple *exit_phi;
3719   edge latch_e;
3720   tree loop_arg;
3721   gimple_stmt_iterator si;
3722   basic_block bb = gimple_bb (iv_phi);
3723   tree stepvectype;
3724   tree resvectype;
3725
3726   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3727   if (nested_in_vect_loop_p (loop, iv_phi))
3728     {
3729       nested_in_vect_loop = true;
3730       iv_loop = loop->inner;
3731     }
3732   else
3733     iv_loop = loop;
3734   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3735
3736   latch_e = loop_latch_edge (iv_loop);
3737   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3738
3739   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3740   gcc_assert (step_expr != NULL_TREE);
3741
3742   pe = loop_preheader_edge (iv_loop);
3743   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3744                                      loop_preheader_edge (iv_loop));
3745
3746   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3747   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3748   gcc_assert (vectype);
3749   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3750   ncopies = vf / nunits;
3751
3752   gcc_assert (phi_info);
3753   gcc_assert (ncopies >= 1);
3754
3755   /* Convert the step to the desired type.  */
3756   stmts = NULL;
3757   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3758   if (stmts)
3759     {
3760       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3761       gcc_assert (!new_bb);
3762     }
3763
3764   /* Find the first insertion point in the BB.  */
3765   si = gsi_after_labels (bb);
3766
3767   /* Create the vector that holds the initial_value of the induction.  */
3768   if (nested_in_vect_loop)
3769     {
3770       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3771          been created during vectorization of previous stmts.  We obtain it
3772          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3773       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3774       /* If the initial value is not of proper type, convert it.  */
3775       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3776         {
3777           new_stmt
3778             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3779                                                           vect_simple_var,
3780                                                           "vec_iv_"),
3781                                    VIEW_CONVERT_EXPR,
3782                                    build1 (VIEW_CONVERT_EXPR, vectype,
3783                                            vec_init));
3784           vec_init = gimple_assign_lhs (new_stmt);
3785           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3786                                                  new_stmt);
3787           gcc_assert (!new_bb);
3788           set_vinfo_for_stmt (new_stmt,
3789                               new_stmt_vec_info (new_stmt, loop_vinfo));
3790         }
3791     }
3792   else
3793     {
3794       vec<constructor_elt, va_gc> *v;
3795
3796       /* iv_loop is the loop to be vectorized. Create:
3797          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3798       stmts = NULL;
3799       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3800
3801       vec_alloc (v, nunits);
3802       bool constant_p = is_gimple_min_invariant (new_name);
3803       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3804       for (i = 1; i < nunits; i++)
3805         {
3806           /* Create: new_name_i = new_name + step_expr  */
3807           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3808                                    new_name, step_expr);
3809           if (!is_gimple_min_invariant (new_name))
3810             constant_p = false;
3811           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3812         }
3813       if (stmts)
3814         {
3815           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3816           gcc_assert (!new_bb);
3817         }
3818
3819       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3820       if (constant_p)
3821         new_vec = build_vector_from_ctor (vectype, v);
3822       else
3823         new_vec = build_constructor (vectype, v);
3824       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3825     }
3826
3827
3828   /* Create the vector that holds the step of the induction.  */
3829   if (nested_in_vect_loop)
3830     /* iv_loop is nested in the loop to be vectorized. Generate:
3831        vec_step = [S, S, S, S]  */
3832     new_name = step_expr;
3833   else
3834     {
3835       /* iv_loop is the loop to be vectorized. Generate:
3836           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3837       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3838         {
3839           expr = build_int_cst (integer_type_node, vf);
3840           expr = fold_convert (TREE_TYPE (step_expr), expr);
3841         }
3842       else
3843         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3844       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3845                               expr, step_expr);
3846       if (TREE_CODE (step_expr) == SSA_NAME)
3847         new_name = vect_init_vector (iv_phi, new_name,
3848                                      TREE_TYPE (step_expr), NULL);
3849     }
3850
3851   t = unshare_expr (new_name);
3852   gcc_assert (CONSTANT_CLASS_P (new_name)
3853               || TREE_CODE (new_name) == SSA_NAME);
3854   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3855   gcc_assert (stepvectype);
3856   new_vec = build_vector_from_val (stepvectype, t);
3857   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3858
3859
3860   /* Create the following def-use cycle:
3861      loop prolog:
3862          vec_init = ...
3863          vec_step = ...
3864      loop:
3865          vec_iv = PHI <vec_init, vec_loop>
3866          ...
3867          STMT
3868          ...
3869          vec_loop = vec_iv + vec_step;  */
3870
3871   /* Create the induction-phi that defines the induction-operand.  */
3872   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3873   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3874   set_vinfo_for_stmt (induction_phi,
3875                       new_stmt_vec_info (induction_phi, loop_vinfo));
3876   induc_def = PHI_RESULT (induction_phi);
3877
3878   /* Create the iv update inside the loop  */
3879   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3880   vec_def = make_ssa_name (vec_dest, new_stmt);
3881   gimple_assign_set_lhs (new_stmt, vec_def);
3882   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3883   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3884
3885   /* Set the arguments of the phi node:  */
3886   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3887   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3888                UNKNOWN_LOCATION);
3889
3890
3891   /* In case that vectorization factor (VF) is bigger than the number
3892      of elements that we can fit in a vectype (nunits), we have to generate
3893      more than one vector stmt - i.e - we need to "unroll" the
3894      vector stmt by a factor VF/nunits.  For more details see documentation
3895      in vectorizable_operation.  */
3896
3897   if (ncopies > 1)
3898     {
3899       stmt_vec_info prev_stmt_vinfo;
3900       /* FORNOW. This restriction should be relaxed.  */
3901       gcc_assert (!nested_in_vect_loop);
3902
3903       /* Create the vector that holds the step of the induction.  */
3904       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3905         {
3906           expr = build_int_cst (integer_type_node, nunits);
3907           expr = fold_convert (TREE_TYPE (step_expr), expr);
3908         }
3909       else
3910         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3911       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3912                               expr, step_expr);
3913       if (TREE_CODE (step_expr) == SSA_NAME)
3914         new_name = vect_init_vector (iv_phi, new_name,
3915                                      TREE_TYPE (step_expr), NULL);
3916       t = unshare_expr (new_name);
3917       gcc_assert (CONSTANT_CLASS_P (new_name)
3918                   || TREE_CODE (new_name) == SSA_NAME);
3919       new_vec = build_vector_from_val (stepvectype, t);
3920       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3921
3922       vec_def = induc_def;
3923       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3924       for (i = 1; i < ncopies; i++)
3925         {
3926           /* vec_i = vec_prev + vec_step  */
3927           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3928                                           vec_def, vec_step);
3929           vec_def = make_ssa_name (vec_dest, new_stmt);
3930           gimple_assign_set_lhs (new_stmt, vec_def);
3931
3932           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3933           if (!useless_type_conversion_p (resvectype, vectype))
3934             {
3935               new_stmt
3936                 = gimple_build_assign
3937                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3938                                                 "vec_iv_"),
3939                          VIEW_CONVERT_EXPR,
3940                          build1 (VIEW_CONVERT_EXPR, resvectype,
3941                                  gimple_assign_lhs (new_stmt)));
3942               gimple_assign_set_lhs (new_stmt,
3943                                      make_ssa_name
3944                                        (gimple_assign_lhs (new_stmt), new_stmt));
3945               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3946             }
3947           set_vinfo_for_stmt (new_stmt,
3948                               new_stmt_vec_info (new_stmt, loop_vinfo));
3949           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3950           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3951         }
3952     }
3953
3954   if (nested_in_vect_loop)
3955     {
3956       /* Find the loop-closed exit-phi of the induction, and record
3957          the final vector of induction results:  */
3958       exit_phi = NULL;
3959       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3960         {
3961           gimple *use_stmt = USE_STMT (use_p);
3962           if (is_gimple_debug (use_stmt))
3963             continue;
3964
3965           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3966             {
3967               exit_phi = use_stmt;
3968               break;
3969             }
3970         }
3971       if (exit_phi)
3972         {
3973           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3974           /* FORNOW. Currently not supporting the case that an inner-loop induction
3975              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3976           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3977                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3978
3979           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3980           if (dump_enabled_p ())
3981             {
3982               dump_printf_loc (MSG_NOTE, vect_location,
3983                                "vector of inductions after inner-loop:");
3984               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3985               dump_printf (MSG_NOTE, "\n");
3986             }
3987         }
3988     }
3989
3990
3991   if (dump_enabled_p ())
3992     {
3993       dump_printf_loc (MSG_NOTE, vect_location,
3994                        "transform induction: created def-use cycle: ");
3995       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3996       dump_printf (MSG_NOTE, "\n");
3997       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3998                         SSA_NAME_DEF_STMT (vec_def), 0);
3999       dump_printf (MSG_NOTE, "\n");
4000     }
4001
4002   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4003   if (!useless_type_conversion_p (resvectype, vectype))
4004     {
4005       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4006                                                              vect_simple_var,
4007                                                              "vec_iv_"),
4008                                       VIEW_CONVERT_EXPR,
4009                                       build1 (VIEW_CONVERT_EXPR, resvectype,
4010                                               induc_def));
4011       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4012       gimple_assign_set_lhs (new_stmt, induc_def);
4013       si = gsi_after_labels (bb);
4014       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4015       set_vinfo_for_stmt (new_stmt,
4016                           new_stmt_vec_info (new_stmt, loop_vinfo));
4017       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4018         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4019     }
4020
4021   return induc_def;
4022 }
4023
4024
4025 /* Function get_initial_def_for_reduction
4026
4027    Input:
4028    STMT - a stmt that performs a reduction operation in the loop.
4029    INIT_VAL - the initial value of the reduction variable
4030
4031    Output:
4032    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4033         of the reduction (used for adjusting the epilog - see below).
4034    Return a vector variable, initialized according to the operation that STMT
4035         performs. This vector will be used as the initial value of the
4036         vector of partial results.
4037
4038    Option1 (adjust in epilog): Initialize the vector as follows:
4039      add/bit or/xor:    [0,0,...,0,0]
4040      mult/bit and:      [1,1,...,1,1]
4041      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4042    and when necessary (e.g. add/mult case) let the caller know
4043    that it needs to adjust the result by init_val.
4044
4045    Option2: Initialize the vector as follows:
4046      add/bit or/xor:    [init_val,0,0,...,0]
4047      mult/bit and:      [init_val,1,1,...,1]
4048      min/max/cond_expr: [init_val,init_val,...,init_val]
4049    and no adjustments are needed.
4050
4051    For example, for the following code:
4052
4053    s = init_val;
4054    for (i=0;i<n;i++)
4055      s = s + a[i];
4056
4057    STMT is 's = s + a[i]', and the reduction variable is 's'.
4058    For a vector of 4 units, we want to return either [0,0,0,init_val],
4059    or [0,0,0,0] and let the caller know that it needs to adjust
4060    the result at the end by 'init_val'.
4061
4062    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4063    initialization vector is simpler (same element in all entries), if
4064    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4065
4066    A cost model should help decide between these two schemes.  */
4067
4068 tree
4069 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4070                                tree *adjustment_def)
4071 {
4072   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4073   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4074   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4075   tree scalar_type = TREE_TYPE (init_val);
4076   tree vectype = get_vectype_for_scalar_type (scalar_type);
4077   int nunits;
4078   enum tree_code code = gimple_assign_rhs_code (stmt);
4079   tree def_for_init;
4080   tree init_def;
4081   tree *elts;
4082   int i;
4083   bool nested_in_vect_loop = false;
4084   REAL_VALUE_TYPE real_init_val = dconst0;
4085   int int_init_val = 0;
4086   gimple *def_stmt = NULL;
4087   gimple_seq stmts = NULL;
4088
4089   gcc_assert (vectype);
4090   nunits = TYPE_VECTOR_SUBPARTS (vectype);
4091
4092   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4093               || SCALAR_FLOAT_TYPE_P (scalar_type));
4094
4095   if (nested_in_vect_loop_p (loop, stmt))
4096     nested_in_vect_loop = true;
4097   else
4098     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4099
4100   /* In case of double reduction we only create a vector variable to be put
4101      in the reduction phi node.  The actual statement creation is done in
4102      vect_create_epilog_for_reduction.  */
4103   if (adjustment_def && nested_in_vect_loop
4104       && TREE_CODE (init_val) == SSA_NAME
4105       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4106       && gimple_code (def_stmt) == GIMPLE_PHI
4107       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4108       && vinfo_for_stmt (def_stmt)
4109       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4110           == vect_double_reduction_def)
4111     {
4112       *adjustment_def = NULL;
4113       return vect_create_destination_var (init_val, vectype);
4114     }
4115
4116   /* In case of a nested reduction do not use an adjustment def as
4117      that case is not supported by the epilogue generation correctly
4118      if ncopies is not one.  */
4119   if (adjustment_def && nested_in_vect_loop)
4120     {
4121       *adjustment_def = NULL;
4122       return vect_get_vec_def_for_operand (init_val, stmt);
4123     }
4124
4125   switch (code)
4126     {
4127       case WIDEN_SUM_EXPR:
4128       case DOT_PROD_EXPR:
4129       case SAD_EXPR:
4130       case PLUS_EXPR:
4131       case MINUS_EXPR:
4132       case BIT_IOR_EXPR:
4133       case BIT_XOR_EXPR:
4134       case MULT_EXPR:
4135       case BIT_AND_EXPR:
4136         /* ADJUSMENT_DEF is NULL when called from
4137            vect_create_epilog_for_reduction to vectorize double reduction.  */
4138         if (adjustment_def)
4139           *adjustment_def = init_val;
4140
4141         if (code == MULT_EXPR)
4142           {
4143             real_init_val = dconst1;
4144             int_init_val = 1;
4145           }
4146
4147         if (code == BIT_AND_EXPR)
4148           int_init_val = -1;
4149
4150         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4151           def_for_init = build_real (scalar_type, real_init_val);
4152         else
4153           def_for_init = build_int_cst (scalar_type, int_init_val);
4154
4155         /* Create a vector of '0' or '1' except the first element.  */
4156         elts = XALLOCAVEC (tree, nunits);
4157         for (i = nunits - 2; i >= 0; --i)
4158           elts[i + 1] = def_for_init;
4159
4160         /* Option1: the first element is '0' or '1' as well.  */
4161         if (adjustment_def)
4162           {
4163             elts[0] = def_for_init;
4164             init_def = build_vector (vectype, elts);
4165             break;
4166           }
4167
4168         /* Option2: the first element is INIT_VAL.  */
4169         elts[0] = init_val;
4170         if (TREE_CONSTANT (init_val))
4171           init_def = build_vector (vectype, elts);
4172         else
4173           {
4174             vec<constructor_elt, va_gc> *v;
4175             vec_alloc (v, nunits);
4176             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4177             for (i = 1; i < nunits; ++i)
4178               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4179             init_def = build_constructor (vectype, v);
4180           }
4181
4182         break;
4183
4184       case MIN_EXPR:
4185       case MAX_EXPR:
4186       case COND_EXPR:
4187         if (adjustment_def)
4188           {
4189             *adjustment_def = NULL_TREE;
4190             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4191               {
4192                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4193                 break;
4194               }
4195           }
4196         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4197         if (! gimple_seq_empty_p (stmts))
4198           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4199         init_def = build_vector_from_val (vectype, init_val);
4200         break;
4201
4202       default:
4203         gcc_unreachable ();
4204     }
4205
4206   return init_def;
4207 }
4208
4209 /* Function vect_create_epilog_for_reduction
4210
4211    Create code at the loop-epilog to finalize the result of a reduction
4212    computation.
4213
4214    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4215      reduction statements.
4216    STMT is the scalar reduction stmt that is being vectorized.
4217    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4218      number of elements that we can fit in a vectype (nunits).  In this case
4219      we have to generate more than one vector stmt - i.e - we need to "unroll"
4220      the vector stmt by a factor VF/nunits.  For more details see documentation
4221      in vectorizable_operation.
4222    REDUC_CODE is the tree-code for the epilog reduction.
4223    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4224      computation.
4225    REDUC_INDEX is the index of the operand in the right hand side of the
4226      statement that is defined by REDUCTION_PHI.
4227    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4228    SLP_NODE is an SLP node containing a group of reduction statements. The
4229      first one in this group is STMT.
4230    INDUCTION_INDEX is the index of the loop for condition reductions.
4231      Otherwise it is undefined.
4232
4233    This function:
4234    1. Creates the reduction def-use cycles: sets the arguments for
4235       REDUCTION_PHIS:
4236       The loop-entry argument is the vectorized initial-value of the reduction.
4237       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4238       sums.
4239    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4240       by applying the operation specified by REDUC_CODE if available, or by
4241       other means (whole-vector shifts or a scalar loop).
4242       The function also creates a new phi node at the loop exit to preserve
4243       loop-closed form, as illustrated below.
4244
4245      The flow at the entry to this function:
4246
4247         loop:
4248           vec_def = phi <null, null>            # REDUCTION_PHI
4249           VECT_DEF = vector_stmt                # vectorized form of STMT
4250           s_loop = scalar_stmt                  # (scalar) STMT
4251         loop_exit:
4252           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4253           use <s_out0>
4254           use <s_out0>
4255
4256      The above is transformed by this function into:
4257
4258         loop:
4259           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4260           VECT_DEF = vector_stmt                # vectorized form of STMT
4261           s_loop = scalar_stmt                  # (scalar) STMT
4262         loop_exit:
4263           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4264           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4265           v_out2 = reduce <v_out1>
4266           s_out3 = extract_field <v_out2, 0>
4267           s_out4 = adjust_result <s_out3>
4268           use <s_out4>
4269           use <s_out4>
4270 */
4271
4272 static void
4273 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4274                                   int ncopies, enum tree_code reduc_code,
4275                                   vec<gimple *> reduction_phis,
4276                                   int reduc_index, bool double_reduc,
4277                                   slp_tree slp_node, tree induction_index)
4278 {
4279   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4280   stmt_vec_info prev_phi_info;
4281   tree vectype;
4282   machine_mode mode;
4283   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4284   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4285   basic_block exit_bb;
4286   tree scalar_dest;
4287   tree scalar_type;
4288   gimple *new_phi = NULL, *phi;
4289   gimple_stmt_iterator exit_gsi;
4290   tree vec_dest;
4291   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4292   gimple *epilog_stmt = NULL;
4293   enum tree_code code = gimple_assign_rhs_code (stmt);
4294   gimple *exit_phi;
4295   tree bitsize;
4296   tree adjustment_def = NULL;
4297   tree vec_initial_def = NULL;
4298   tree reduction_op, expr, def, initial_def = NULL;
4299   tree orig_name, scalar_result;
4300   imm_use_iterator imm_iter, phi_imm_iter;
4301   use_operand_p use_p, phi_use_p;
4302   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4303   bool nested_in_vect_loop = false;
4304   auto_vec<gimple *> new_phis;
4305   auto_vec<gimple *> inner_phis;
4306   enum vect_def_type dt = vect_unknown_def_type;
4307   int j, i;
4308   auto_vec<tree> scalar_results;
4309   unsigned int group_size = 1, k, ratio;
4310   auto_vec<tree> vec_initial_defs;
4311   auto_vec<gimple *> phis;
4312   bool slp_reduc = false;
4313   tree new_phi_result;
4314   gimple *inner_phi = NULL;
4315
4316   if (slp_node)
4317     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4318
4319   if (nested_in_vect_loop_p (loop, stmt))
4320     {
4321       outer_loop = loop;
4322       loop = loop->inner;
4323       nested_in_vect_loop = true;
4324       gcc_assert (!slp_node);
4325     }
4326
4327   reduction_op = get_reduction_op (stmt, reduc_index);
4328
4329   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4330   gcc_assert (vectype);
4331   mode = TYPE_MODE (vectype);
4332
4333   /* 1. Create the reduction def-use cycle:
4334      Set the arguments of REDUCTION_PHIS, i.e., transform
4335
4336         loop:
4337           vec_def = phi <null, null>            # REDUCTION_PHI
4338           VECT_DEF = vector_stmt                # vectorized form of STMT
4339           ...
4340
4341      into:
4342
4343         loop:
4344           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4345           VECT_DEF = vector_stmt                # vectorized form of STMT
4346           ...
4347
4348      (in case of SLP, do it for all the phis). */
4349
4350   /* Get the loop-entry arguments.  */
4351   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4352   if (slp_node)
4353     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4354                        NULL, slp_node, reduc_index);
4355   else
4356     {
4357       /* Get at the scalar def before the loop, that defines the initial value
4358          of the reduction variable.  */
4359       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4360       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4361                                            loop_preheader_edge (loop));
4362       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4363       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4364                                                        &adjustment_def);
4365       vec_initial_defs.create (1);
4366       vec_initial_defs.quick_push (vec_initial_def);
4367     }
4368
4369   /* Set phi nodes arguments.  */
4370   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4371     {
4372       tree vec_init_def, def;
4373       gimple_seq stmts;
4374       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4375                                            true, NULL_TREE);
4376       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4377       def = vect_defs[i];
4378       for (j = 0; j < ncopies; j++)
4379         {
4380           if (j != 0)
4381             {
4382               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4383               if (nested_in_vect_loop)
4384                 vec_init_def
4385                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4386                                                     vec_init_def);
4387             }
4388
4389           /* Set the loop-entry arg of the reduction-phi.  */
4390
4391           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4392               == INTEGER_INDUC_COND_REDUCTION)
4393             {
4394               /* Initialise the reduction phi to zero.  This prevents initial
4395                  values of non-zero interferring with the reduction op.  */
4396               gcc_assert (ncopies == 1);
4397               gcc_assert (i == 0);
4398
4399               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4400               tree zero_vec = build_zero_cst (vec_init_def_type);
4401
4402               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4403                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4404             }
4405           else
4406             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4407                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4408
4409           /* Set the loop-latch arg for the reduction-phi.  */
4410           if (j > 0)
4411             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4412
4413           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4414                        UNKNOWN_LOCATION);
4415
4416           if (dump_enabled_p ())
4417             {
4418               dump_printf_loc (MSG_NOTE, vect_location,
4419                                "transform reduction: created def-use cycle: ");
4420               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4421               dump_printf (MSG_NOTE, "\n");
4422               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4423               dump_printf (MSG_NOTE, "\n");
4424             }
4425         }
4426     }
4427
4428   /* 2. Create epilog code.
4429         The reduction epilog code operates across the elements of the vector
4430         of partial results computed by the vectorized loop.
4431         The reduction epilog code consists of:
4432
4433         step 1: compute the scalar result in a vector (v_out2)
4434         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4435         step 3: adjust the scalar result (s_out3) if needed.
4436
4437         Step 1 can be accomplished using one the following three schemes:
4438           (scheme 1) using reduc_code, if available.
4439           (scheme 2) using whole-vector shifts, if available.
4440           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4441                      combined.
4442
4443           The overall epilog code looks like this:
4444
4445           s_out0 = phi <s_loop>         # original EXIT_PHI
4446           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4447           v_out2 = reduce <v_out1>              # step 1
4448           s_out3 = extract_field <v_out2, 0>    # step 2
4449           s_out4 = adjust_result <s_out3>       # step 3
4450
4451           (step 3 is optional, and steps 1 and 2 may be combined).
4452           Lastly, the uses of s_out0 are replaced by s_out4.  */
4453
4454
4455   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4456          v_out1 = phi <VECT_DEF>
4457          Store them in NEW_PHIS.  */
4458
4459   exit_bb = single_exit (loop)->dest;
4460   prev_phi_info = NULL;
4461   new_phis.create (vect_defs.length ());
4462   FOR_EACH_VEC_ELT (vect_defs, i, def)
4463     {
4464       for (j = 0; j < ncopies; j++)
4465         {
4466           tree new_def = copy_ssa_name (def);
4467           phi = create_phi_node (new_def, exit_bb);
4468           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4469           if (j == 0)
4470             new_phis.quick_push (phi);
4471           else
4472             {
4473               def = vect_get_vec_def_for_stmt_copy (dt, def);
4474               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4475             }
4476
4477           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4478           prev_phi_info = vinfo_for_stmt (phi);
4479         }
4480     }
4481
4482   /* The epilogue is created for the outer-loop, i.e., for the loop being
4483      vectorized.  Create exit phis for the outer loop.  */
4484   if (double_reduc)
4485     {
4486       loop = outer_loop;
4487       exit_bb = single_exit (loop)->dest;
4488       inner_phis.create (vect_defs.length ());
4489       FOR_EACH_VEC_ELT (new_phis, i, phi)
4490         {
4491           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4492           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4493           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4494                            PHI_RESULT (phi));
4495           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4496                                                             loop_vinfo));
4497           inner_phis.quick_push (phi);
4498           new_phis[i] = outer_phi;
4499           prev_phi_info = vinfo_for_stmt (outer_phi);
4500           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4501             {
4502               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4503               new_result = copy_ssa_name (PHI_RESULT (phi));
4504               outer_phi = create_phi_node (new_result, exit_bb);
4505               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4506                                PHI_RESULT (phi));
4507               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4508                                                                 loop_vinfo));
4509               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4510               prev_phi_info = vinfo_for_stmt (outer_phi);
4511             }
4512         }
4513     }
4514
4515   exit_gsi = gsi_after_labels (exit_bb);
4516
4517   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4518          (i.e. when reduc_code is not available) and in the final adjustment
4519          code (if needed).  Also get the original scalar reduction variable as
4520          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4521          represents a reduction pattern), the tree-code and scalar-def are
4522          taken from the original stmt that the pattern-stmt (STMT) replaces.
4523          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4524          are taken from STMT.  */
4525
4526   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4527   if (!orig_stmt)
4528     {
4529       /* Regular reduction  */
4530       orig_stmt = stmt;
4531     }
4532   else
4533     {
4534       /* Reduction pattern  */
4535       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4536       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4537       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4538     }
4539
4540   code = gimple_assign_rhs_code (orig_stmt);
4541   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4542      partial results are added and not subtracted.  */
4543   if (code == MINUS_EXPR)
4544     code = PLUS_EXPR;
4545
4546   scalar_dest = gimple_assign_lhs (orig_stmt);
4547   scalar_type = TREE_TYPE (scalar_dest);
4548   scalar_results.create (group_size);
4549   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4550   bitsize = TYPE_SIZE (scalar_type);
4551
4552   /* In case this is a reduction in an inner-loop while vectorizing an outer
4553      loop - we don't need to extract a single scalar result at the end of the
4554      inner-loop (unless it is double reduction, i.e., the use of reduction is
4555      outside the outer-loop).  The final vector of partial results will be used
4556      in the vectorized outer-loop, or reduced to a scalar result at the end of
4557      the outer-loop.  */
4558   if (nested_in_vect_loop && !double_reduc)
4559     goto vect_finalize_reduction;
4560
4561   /* SLP reduction without reduction chain, e.g.,
4562      # a1 = phi <a2, a0>
4563      # b1 = phi <b2, b0>
4564      a2 = operation (a1)
4565      b2 = operation (b1)  */
4566   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4567
4568   /* In case of reduction chain, e.g.,
4569      # a1 = phi <a3, a0>
4570      a2 = operation (a1)
4571      a3 = operation (a2),
4572
4573      we may end up with more than one vector result.  Here we reduce them to
4574      one vector.  */
4575   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4576     {
4577       tree first_vect = PHI_RESULT (new_phis[0]);
4578       tree tmp;
4579       gassign *new_vec_stmt = NULL;
4580
4581       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4582       for (k = 1; k < new_phis.length (); k++)
4583         {
4584           gimple *next_phi = new_phis[k];
4585           tree second_vect = PHI_RESULT (next_phi);
4586
4587           tmp = build2 (code, vectype,  first_vect, second_vect);
4588           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4589           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4590           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4591           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4592         }
4593
4594       new_phi_result = first_vect;
4595       if (new_vec_stmt)
4596         {
4597           new_phis.truncate (0);
4598           new_phis.safe_push (new_vec_stmt);
4599         }
4600     }
4601   else
4602     new_phi_result = PHI_RESULT (new_phis[0]);
4603
4604   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4605     {
4606       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4607          various data values where the condition matched and another vector
4608          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4609          need to extract the last matching index (which will be the index with
4610          highest value) and use this to index into the data vector.
4611          For the case where there were no matches, the data vector will contain
4612          all default values and the index vector will be all zeros.  */
4613
4614       /* Get various versions of the type of the vector of indexes.  */
4615       tree index_vec_type = TREE_TYPE (induction_index);
4616       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4617       tree index_scalar_type = TREE_TYPE (index_vec_type);
4618       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4619         (index_vec_type);
4620
4621       /* Get an unsigned integer version of the type of the data vector.  */
4622       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4623       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4624       tree vectype_unsigned = build_vector_type
4625         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4626
4627       /* First we need to create a vector (ZERO_VEC) of zeros and another
4628          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4629          can create using a MAX reduction and then expanding.
4630          In the case where the loop never made any matches, the max index will
4631          be zero.  */
4632
4633       /* Vector of {0, 0, 0,...}.  */
4634       tree zero_vec = make_ssa_name (vectype);
4635       tree zero_vec_rhs = build_zero_cst (vectype);
4636       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4637       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4638
4639       /* Find maximum value from the vector of found indexes.  */
4640       tree max_index = make_ssa_name (index_scalar_type);
4641       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4642                                                     induction_index);
4643       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4644
4645       /* Vector of {max_index, max_index, max_index,...}.  */
4646       tree max_index_vec = make_ssa_name (index_vec_type);
4647       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4648                                                       max_index);
4649       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4650                                                         max_index_vec_rhs);
4651       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4652
4653       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4654          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4655          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4656          otherwise.  Only one value should match, resulting in a vector
4657          (VEC_COND) with one data value and the rest zeros.
4658          In the case where the loop never made any matches, every index will
4659          match, resulting in a vector with all data values (which will all be
4660          the default value).  */
4661
4662       /* Compare the max index vector to the vector of found indexes to find
4663          the position of the max value.  */
4664       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4665       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4666                                                       induction_index,
4667                                                       max_index_vec);
4668       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4669
4670       /* Use the compare to choose either values from the data vector or
4671          zero.  */
4672       tree vec_cond = make_ssa_name (vectype);
4673       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4674                                                    vec_compare, new_phi_result,
4675                                                    zero_vec);
4676       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4677
4678       /* Finally we need to extract the data value from the vector (VEC_COND)
4679          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4680          reduction, but because this doesn't exist, we can use a MAX reduction
4681          instead.  The data value might be signed or a float so we need to cast
4682          it first.
4683          In the case where the loop never made any matches, the data values are
4684          all identical, and so will reduce down correctly.  */
4685
4686       /* Make the matched data values unsigned.  */
4687       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4688       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4689                                        vec_cond);
4690       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4691                                                         VIEW_CONVERT_EXPR,
4692                                                         vec_cond_cast_rhs);
4693       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4694
4695       /* Reduce down to a scalar value.  */
4696       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4697       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4698                                       optab_default);
4699       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4700                   != CODE_FOR_nothing);
4701       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4702                                                      REDUC_MAX_EXPR,
4703                                                      vec_cond_cast);
4704       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4705
4706       /* Convert the reduced value back to the result type and set as the
4707          result.  */
4708       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4709                                      data_reduc);
4710       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4711       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4712       gimple_assign_set_lhs (epilog_stmt, new_temp);
4713       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4714       scalar_results.safe_push (new_temp);
4715     }
4716
4717   /* 2.3 Create the reduction code, using one of the three schemes described
4718          above. In SLP we simply need to extract all the elements from the
4719          vector (without reducing them), so we use scalar shifts.  */
4720   else if (reduc_code != ERROR_MARK && !slp_reduc)
4721     {
4722       tree tmp;
4723       tree vec_elem_type;
4724
4725       /*** Case 1:  Create:
4726            v_out2 = reduc_expr <v_out1>  */
4727
4728       if (dump_enabled_p ())
4729         dump_printf_loc (MSG_NOTE, vect_location,
4730                          "Reduce using direct vector reduction.\n");
4731
4732       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4733       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4734         {
4735           tree tmp_dest =
4736               vect_create_destination_var (scalar_dest, vec_elem_type);
4737           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4738           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4739           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4740           gimple_assign_set_lhs (epilog_stmt, new_temp);
4741           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4742
4743           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4744         }
4745       else
4746         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4747
4748       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4749       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4750       gimple_assign_set_lhs (epilog_stmt, new_temp);
4751       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4752
4753       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4754           == INTEGER_INDUC_COND_REDUCTION)
4755         {
4756           /* Earlier we set the initial value to be zero.  Check the result
4757              and if it is zero then replace with the original initial
4758              value.  */
4759           tree zero = build_zero_cst (scalar_type);
4760           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4761
4762           tmp = make_ssa_name (new_scalar_dest);
4763           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4764                                              initial_def, new_temp);
4765           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4766           new_temp = tmp;
4767         }
4768
4769       scalar_results.safe_push (new_temp);
4770     }
4771   else
4772     {
4773       bool reduce_with_shift = have_whole_vector_shift (mode);
4774       int element_bitsize = tree_to_uhwi (bitsize);
4775       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4776       tree vec_temp;
4777
4778       /* Regardless of whether we have a whole vector shift, if we're
4779          emulating the operation via tree-vect-generic, we don't want
4780          to use it.  Only the first round of the reduction is likely
4781          to still be profitable via emulation.  */
4782       /* ??? It might be better to emit a reduction tree code here, so that
4783          tree-vect-generic can expand the first round via bit tricks.  */
4784       if (!VECTOR_MODE_P (mode))
4785         reduce_with_shift = false;
4786       else
4787         {
4788           optab optab = optab_for_tree_code (code, vectype, optab_default);
4789           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4790             reduce_with_shift = false;
4791         }
4792
4793       if (reduce_with_shift && !slp_reduc)
4794         {
4795           int nelements = vec_size_in_bits / element_bitsize;
4796           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4797
4798           int elt_offset;
4799
4800           tree zero_vec = build_zero_cst (vectype);
4801           /*** Case 2: Create:
4802              for (offset = nelements/2; offset >= 1; offset/=2)
4803                 {
4804                   Create:  va' = vec_shift <va, offset>
4805                   Create:  va = vop <va, va'>
4806                 }  */
4807
4808           tree rhs;
4809
4810           if (dump_enabled_p ())
4811             dump_printf_loc (MSG_NOTE, vect_location,
4812                              "Reduce using vector shifts\n");
4813
4814           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4815           new_temp = new_phi_result;
4816           for (elt_offset = nelements / 2;
4817                elt_offset >= 1;
4818                elt_offset /= 2)
4819             {
4820               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4821               tree mask = vect_gen_perm_mask_any (vectype, sel);
4822               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4823                                                  new_temp, zero_vec, mask);
4824               new_name = make_ssa_name (vec_dest, epilog_stmt);
4825               gimple_assign_set_lhs (epilog_stmt, new_name);
4826               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4827
4828               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4829                                                  new_temp);
4830               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4831               gimple_assign_set_lhs (epilog_stmt, new_temp);
4832               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4833             }
4834
4835           /* 2.4  Extract the final scalar result.  Create:
4836              s_out3 = extract_field <v_out2, bitpos>  */
4837
4838           if (dump_enabled_p ())
4839             dump_printf_loc (MSG_NOTE, vect_location,
4840                              "extract scalar result\n");
4841
4842           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4843                         bitsize, bitsize_zero_node);
4844           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4845           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4846           gimple_assign_set_lhs (epilog_stmt, new_temp);
4847           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4848           scalar_results.safe_push (new_temp);
4849         }
4850       else
4851         {
4852           /*** Case 3: Create:
4853              s = extract_field <v_out2, 0>
4854              for (offset = element_size;
4855                   offset < vector_size;
4856                   offset += element_size;)
4857                {
4858                  Create:  s' = extract_field <v_out2, offset>
4859                  Create:  s = op <s, s'>  // For non SLP cases
4860                }  */
4861
4862           if (dump_enabled_p ())
4863             dump_printf_loc (MSG_NOTE, vect_location,
4864                              "Reduce using scalar code.\n");
4865
4866           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4867           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4868             {
4869               int bit_offset;
4870               if (gimple_code (new_phi) == GIMPLE_PHI)
4871                 vec_temp = PHI_RESULT (new_phi);
4872               else
4873                 vec_temp = gimple_assign_lhs (new_phi);
4874               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4875                             bitsize_zero_node);
4876               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4877               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4878               gimple_assign_set_lhs (epilog_stmt, new_temp);
4879               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4880
4881               /* In SLP we don't need to apply reduction operation, so we just
4882                  collect s' values in SCALAR_RESULTS.  */
4883               if (slp_reduc)
4884                 scalar_results.safe_push (new_temp);
4885
4886               for (bit_offset = element_bitsize;
4887                    bit_offset < vec_size_in_bits;
4888                    bit_offset += element_bitsize)
4889                 {
4890                   tree bitpos = bitsize_int (bit_offset);
4891                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4892                                      bitsize, bitpos);
4893
4894                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4895                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4896                   gimple_assign_set_lhs (epilog_stmt, new_name);
4897                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4898
4899                   if (slp_reduc)
4900                     {
4901                       /* In SLP we don't need to apply reduction operation, so
4902                          we just collect s' values in SCALAR_RESULTS.  */
4903                       new_temp = new_name;
4904                       scalar_results.safe_push (new_name);
4905                     }
4906                   else
4907                     {
4908                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4909                                                          new_name, new_temp);
4910                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4911                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4912                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4913                     }
4914                 }
4915             }
4916
4917           /* The only case where we need to reduce scalar results in SLP, is
4918              unrolling.  If the size of SCALAR_RESULTS is greater than
4919              GROUP_SIZE, we reduce them combining elements modulo
4920              GROUP_SIZE.  */
4921           if (slp_reduc)
4922             {
4923               tree res, first_res, new_res;
4924               gimple *new_stmt;
4925
4926               /* Reduce multiple scalar results in case of SLP unrolling.  */
4927               for (j = group_size; scalar_results.iterate (j, &res);
4928                    j++)
4929                 {
4930                   first_res = scalar_results[j % group_size];
4931                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4932                                                   first_res, res);
4933                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4934                   gimple_assign_set_lhs (new_stmt, new_res);
4935                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4936                   scalar_results[j % group_size] = new_res;
4937                 }
4938             }
4939           else
4940             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4941             scalar_results.safe_push (new_temp);
4942         }
4943     }
4944
4945 vect_finalize_reduction:
4946
4947   if (double_reduc)
4948     loop = loop->inner;
4949
4950   /* 2.5 Adjust the final result by the initial value of the reduction
4951          variable. (When such adjustment is not needed, then
4952          'adjustment_def' is zero).  For example, if code is PLUS we create:
4953          new_temp = loop_exit_def + adjustment_def  */
4954
4955   if (adjustment_def)
4956     {
4957       gcc_assert (!slp_reduc);
4958       if (nested_in_vect_loop)
4959         {
4960           new_phi = new_phis[0];
4961           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4962           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4963           new_dest = vect_create_destination_var (scalar_dest, vectype);
4964         }
4965       else
4966         {
4967           new_temp = scalar_results[0];
4968           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4969           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4970           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4971         }
4972
4973       epilog_stmt = gimple_build_assign (new_dest, expr);
4974       new_temp = make_ssa_name (new_dest, epilog_stmt);
4975       gimple_assign_set_lhs (epilog_stmt, new_temp);
4976       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4977       if (nested_in_vect_loop)
4978         {
4979           set_vinfo_for_stmt (epilog_stmt,
4980                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
4981           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4982                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4983
4984           if (!double_reduc)
4985             scalar_results.quick_push (new_temp);
4986           else
4987             scalar_results[0] = new_temp;
4988         }
4989       else
4990         scalar_results[0] = new_temp;
4991
4992       new_phis[0] = epilog_stmt;
4993     }
4994
4995   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4996           phis with new adjusted scalar results, i.e., replace use <s_out0>
4997           with use <s_out4>.
4998
4999      Transform:
5000         loop_exit:
5001           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5002           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5003           v_out2 = reduce <v_out1>
5004           s_out3 = extract_field <v_out2, 0>
5005           s_out4 = adjust_result <s_out3>
5006           use <s_out0>
5007           use <s_out0>
5008
5009      into:
5010
5011         loop_exit:
5012           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5013           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5014           v_out2 = reduce <v_out1>
5015           s_out3 = extract_field <v_out2, 0>
5016           s_out4 = adjust_result <s_out3>
5017           use <s_out4>
5018           use <s_out4> */
5019
5020
5021   /* In SLP reduction chain we reduce vector results into one vector if
5022      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5023      the last stmt in the reduction chain, since we are looking for the loop
5024      exit phi node.  */
5025   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5026     {
5027       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5028       /* Handle reduction patterns.  */
5029       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5030         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5031
5032       scalar_dest = gimple_assign_lhs (dest_stmt);
5033       group_size = 1;
5034     }
5035
5036   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5037      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5038      need to match SCALAR_RESULTS with corresponding statements.  The first
5039      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5040      the first vector stmt, etc.
5041      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5042   if (group_size > new_phis.length ())
5043     {
5044       ratio = group_size / new_phis.length ();
5045       gcc_assert (!(group_size % new_phis.length ()));
5046     }
5047   else
5048     ratio = 1;
5049
5050   for (k = 0; k < group_size; k++)
5051     {
5052       if (k % ratio == 0)
5053         {
5054           epilog_stmt = new_phis[k / ratio];
5055           reduction_phi = reduction_phis[k / ratio];
5056           if (double_reduc)
5057             inner_phi = inner_phis[k / ratio];
5058         }
5059
5060       if (slp_reduc)
5061         {
5062           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5063
5064           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5065           /* SLP statements can't participate in patterns.  */
5066           gcc_assert (!orig_stmt);
5067           scalar_dest = gimple_assign_lhs (current_stmt);
5068         }
5069
5070       phis.create (3);
5071       /* Find the loop-closed-use at the loop exit of the original scalar
5072          result.  (The reduction result is expected to have two immediate uses -
5073          one at the latch block, and one at the loop exit).  */
5074       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5075         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5076             && !is_gimple_debug (USE_STMT (use_p)))
5077           phis.safe_push (USE_STMT (use_p));
5078
5079       /* While we expect to have found an exit_phi because of loop-closed-ssa
5080          form we can end up without one if the scalar cycle is dead.  */
5081
5082       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5083         {
5084           if (outer_loop)
5085             {
5086               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5087               gphi *vect_phi;
5088
5089               /* FORNOW. Currently not supporting the case that an inner-loop
5090                  reduction is not used in the outer-loop (but only outside the
5091                  outer-loop), unless it is double reduction.  */
5092               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5093                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5094                           || double_reduc);
5095
5096               if (double_reduc)
5097                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5098               else
5099                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5100               if (!double_reduc
5101                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5102                       != vect_double_reduction_def)
5103                 continue;
5104
5105               /* Handle double reduction:
5106
5107                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5108                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5109                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5110                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5111
5112                  At that point the regular reduction (stmt2 and stmt3) is
5113                  already vectorized, as well as the exit phi node, stmt4.
5114                  Here we vectorize the phi node of double reduction, stmt1, and
5115                  update all relevant statements.  */
5116
5117               /* Go through all the uses of s2 to find double reduction phi
5118                  node, i.e., stmt1 above.  */
5119               orig_name = PHI_RESULT (exit_phi);
5120               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5121                 {
5122                   stmt_vec_info use_stmt_vinfo;
5123                   stmt_vec_info new_phi_vinfo;
5124                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5125                   basic_block bb = gimple_bb (use_stmt);
5126                   gimple *use;
5127
5128                   /* Check that USE_STMT is really double reduction phi
5129                      node.  */
5130                   if (gimple_code (use_stmt) != GIMPLE_PHI
5131                       || gimple_phi_num_args (use_stmt) != 2
5132                       || bb->loop_father != outer_loop)
5133                     continue;
5134                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5135                   if (!use_stmt_vinfo
5136                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5137                           != vect_double_reduction_def)
5138                     continue;
5139
5140                   /* Create vector phi node for double reduction:
5141                      vs1 = phi <vs0, vs2>
5142                      vs1 was created previously in this function by a call to
5143                        vect_get_vec_def_for_operand and is stored in
5144                        vec_initial_def;
5145                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5146                      vs0 is created here.  */
5147
5148                   /* Create vector phi node.  */
5149                   vect_phi = create_phi_node (vec_initial_def, bb);
5150                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5151                                     loop_vec_info_for_loop (outer_loop));
5152                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5153
5154                   /* Create vs0 - initial def of the double reduction phi.  */
5155                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5156                                              loop_preheader_edge (outer_loop));
5157                   init_def = get_initial_def_for_reduction (stmt,
5158                                                           preheader_arg, NULL);
5159                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5160                                                     vectype, NULL);
5161
5162                   /* Update phi node arguments with vs0 and vs2.  */
5163                   add_phi_arg (vect_phi, vect_phi_init,
5164                                loop_preheader_edge (outer_loop),
5165                                UNKNOWN_LOCATION);
5166                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5167                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5168                   if (dump_enabled_p ())
5169                     {
5170                       dump_printf_loc (MSG_NOTE, vect_location,
5171                                        "created double reduction phi node: ");
5172                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5173                       dump_printf (MSG_NOTE, "\n");
5174                     }
5175
5176                   vect_phi_res = PHI_RESULT (vect_phi);
5177
5178                   /* Replace the use, i.e., set the correct vs1 in the regular
5179                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5180                      loop is redundant.  */
5181                   use = reduction_phi;
5182                   for (j = 0; j < ncopies; j++)
5183                     {
5184                       edge pr_edge = loop_preheader_edge (loop);
5185                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5186                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5187                     }
5188                 }
5189             }
5190         }
5191
5192       phis.release ();
5193       if (nested_in_vect_loop)
5194         {
5195           if (double_reduc)
5196             loop = outer_loop;
5197           else
5198             continue;
5199         }
5200
5201       phis.create (3);
5202       /* Find the loop-closed-use at the loop exit of the original scalar
5203          result.  (The reduction result is expected to have two immediate uses,
5204          one at the latch block, and one at the loop exit).  For double
5205          reductions we are looking for exit phis of the outer loop.  */
5206       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5207         {
5208           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5209             {
5210               if (!is_gimple_debug (USE_STMT (use_p)))
5211                 phis.safe_push (USE_STMT (use_p));
5212             }
5213           else
5214             {
5215               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5216                 {
5217                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5218
5219                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5220                     {
5221                       if (!flow_bb_inside_loop_p (loop,
5222                                              gimple_bb (USE_STMT (phi_use_p)))
5223                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5224                         phis.safe_push (USE_STMT (phi_use_p));
5225                     }
5226                 }
5227             }
5228         }
5229
5230       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5231         {
5232           /* Replace the uses:  */
5233           orig_name = PHI_RESULT (exit_phi);
5234           scalar_result = scalar_results[k];
5235           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5236             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5237               SET_USE (use_p, scalar_result);
5238         }
5239
5240       phis.release ();
5241     }
5242 }
5243
5244
5245 /* Function is_nonwrapping_integer_induction.
5246
5247    Check if STMT (which is part of loop LOOP) both increments and
5248    does not cause overflow.  */
5249
5250 static bool
5251 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5252 {
5253   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5254   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5255   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5256   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5257   widest_int ni, max_loop_value, lhs_max;
5258   bool overflow = false;
5259
5260   /* Make sure the loop is integer based.  */
5261   if (TREE_CODE (base) != INTEGER_CST
5262       || TREE_CODE (step) != INTEGER_CST)
5263     return false;
5264
5265   /* Check that the induction increments.  */
5266   if (tree_int_cst_sgn (step) == -1)
5267     return false;
5268
5269   /* Check that the max size of the loop will not wrap.  */
5270
5271   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5272     return true;
5273
5274   if (! max_stmt_executions (loop, &ni))
5275     return false;
5276
5277   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5278                             &overflow);
5279   if (overflow)
5280     return false;
5281
5282   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5283                             TYPE_SIGN (lhs_type), &overflow);
5284   if (overflow)
5285     return false;
5286
5287   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5288           <= TYPE_PRECISION (lhs_type));
5289 }
5290
5291 /* Function vectorizable_reduction.
5292
5293    Check if STMT performs a reduction operation that can be vectorized.
5294    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5295    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5296    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5297
5298    This function also handles reduction idioms (patterns) that have been
5299    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5300    of this form:
5301      X = pattern_expr (arg0, arg1, ..., X)
5302    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5303    sequence that had been detected and replaced by the pattern-stmt (STMT).
5304
5305    This function also handles reduction of condition expressions, for example:
5306      for (int i = 0; i < N; i++)
5307        if (a[i] < value)
5308          last = a[i];
5309    This is handled by vectorising the loop and creating an additional vector
5310    containing the loop indexes for which "a[i] < value" was true.  In the
5311    function epilogue this is reduced to a single max value and then used to
5312    index into the vector of results.
5313
5314    In some cases of reduction patterns, the type of the reduction variable X is
5315    different than the type of the other arguments of STMT.
5316    In such cases, the vectype that is used when transforming STMT into a vector
5317    stmt is different than the vectype that is used to determine the
5318    vectorization factor, because it consists of a different number of elements
5319    than the actual number of elements that are being operated upon in parallel.
5320
5321    For example, consider an accumulation of shorts into an int accumulator.
5322    On some targets it's possible to vectorize this pattern operating on 8
5323    shorts at a time (hence, the vectype for purposes of determining the
5324    vectorization factor should be V8HI); on the other hand, the vectype that
5325    is used to create the vector form is actually V4SI (the type of the result).
5326
5327    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5328    indicates what is the actual level of parallelism (V8HI in the example), so
5329    that the right vectorization factor would be derived.  This vectype
5330    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5331    be used to create the vectorized stmt.  The right vectype for the vectorized
5332    stmt is obtained from the type of the result X:
5333         get_vectype_for_scalar_type (TREE_TYPE (X))
5334
5335    This means that, contrary to "regular" reductions (or "regular" stmts in
5336    general), the following equation:
5337       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5338    does *NOT* necessarily hold for reduction patterns.  */
5339
5340 bool
5341 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5342                         gimple **vec_stmt, slp_tree slp_node)
5343 {
5344   tree vec_dest;
5345   tree scalar_dest;
5346   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5347   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5348   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5349   tree vectype_in = NULL_TREE;
5350   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5351   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5352   enum tree_code code, orig_code, epilog_reduc_code;
5353   machine_mode vec_mode;
5354   int op_type;
5355   optab optab, reduc_optab;
5356   tree new_temp = NULL_TREE;
5357   gimple *def_stmt;
5358   enum vect_def_type dt;
5359   gphi *new_phi = NULL;
5360   tree scalar_type;
5361   bool is_simple_use;
5362   gimple *orig_stmt;
5363   stmt_vec_info orig_stmt_info;
5364   tree expr = NULL_TREE;
5365   int i;
5366   int ncopies;
5367   int epilog_copies;
5368   stmt_vec_info prev_stmt_info, prev_phi_info;
5369   bool single_defuse_cycle = false;
5370   tree reduc_def = NULL_TREE;
5371   gimple *new_stmt = NULL;
5372   int j;
5373   tree ops[3];
5374   bool nested_cycle = false, found_nested_cycle_def = false;
5375   gimple *reduc_def_stmt = NULL;
5376   bool double_reduc = false, dummy;
5377   basic_block def_bb;
5378   struct loop * def_stmt_loop, *outer_loop = NULL;
5379   tree def_arg;
5380   gimple *def_arg_stmt;
5381   auto_vec<tree> vec_oprnds0;
5382   auto_vec<tree> vec_oprnds1;
5383   auto_vec<tree> vect_defs;
5384   auto_vec<gimple *> phis;
5385   int vec_num;
5386   tree def0, def1, tem, op0, op1 = NULL_TREE;
5387   bool first_p = true;
5388   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5389   gimple *cond_expr_induction_def_stmt = NULL;
5390
5391   /* In case of reduction chain we switch to the first stmt in the chain, but
5392      we don't update STMT_INFO, since only the last stmt is marked as reduction
5393      and has reduction properties.  */
5394   if (GROUP_FIRST_ELEMENT (stmt_info)
5395       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5396     {
5397       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5398       first_p = false;
5399     }
5400
5401   if (nested_in_vect_loop_p (loop, stmt))
5402     {
5403       outer_loop = loop;
5404       loop = loop->inner;
5405       nested_cycle = true;
5406     }
5407
5408   /* 1. Is vectorizable reduction?  */
5409   /* Not supportable if the reduction variable is used in the loop, unless
5410      it's a reduction chain.  */
5411   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5412       && !GROUP_FIRST_ELEMENT (stmt_info))
5413     return false;
5414
5415   /* Reductions that are not used even in an enclosing outer-loop,
5416      are expected to be "live" (used out of the loop).  */
5417   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5418       && !STMT_VINFO_LIVE_P (stmt_info))
5419     return false;
5420
5421   /* Make sure it was already recognized as a reduction computation.  */
5422   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5423       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5424     return false;
5425
5426   /* 2. Has this been recognized as a reduction pattern?
5427
5428      Check if STMT represents a pattern that has been recognized
5429      in earlier analysis stages.  For stmts that represent a pattern,
5430      the STMT_VINFO_RELATED_STMT field records the last stmt in
5431      the original sequence that constitutes the pattern.  */
5432
5433   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5434   if (orig_stmt)
5435     {
5436       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5437       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5438       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5439     }
5440
5441   /* 3. Check the operands of the operation.  The first operands are defined
5442         inside the loop body. The last operand is the reduction variable,
5443         which is defined by the loop-header-phi.  */
5444
5445   gcc_assert (is_gimple_assign (stmt));
5446
5447   /* Flatten RHS.  */
5448   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5449     {
5450     case GIMPLE_SINGLE_RHS:
5451       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5452       if (op_type == ternary_op)
5453         {
5454           tree rhs = gimple_assign_rhs1 (stmt);
5455           ops[0] = TREE_OPERAND (rhs, 0);
5456           ops[1] = TREE_OPERAND (rhs, 1);
5457           ops[2] = TREE_OPERAND (rhs, 2);
5458           code = TREE_CODE (rhs);
5459         }
5460       else
5461         return false;
5462       break;
5463
5464     case GIMPLE_BINARY_RHS:
5465       code = gimple_assign_rhs_code (stmt);
5466       op_type = TREE_CODE_LENGTH (code);
5467       gcc_assert (op_type == binary_op);
5468       ops[0] = gimple_assign_rhs1 (stmt);
5469       ops[1] = gimple_assign_rhs2 (stmt);
5470       break;
5471
5472     case GIMPLE_TERNARY_RHS:
5473       code = gimple_assign_rhs_code (stmt);
5474       op_type = TREE_CODE_LENGTH (code);
5475       gcc_assert (op_type == ternary_op);
5476       ops[0] = gimple_assign_rhs1 (stmt);
5477       ops[1] = gimple_assign_rhs2 (stmt);
5478       ops[2] = gimple_assign_rhs3 (stmt);
5479       break;
5480
5481     case GIMPLE_UNARY_RHS:
5482       return false;
5483
5484     default:
5485       gcc_unreachable ();
5486     }
5487   /* The default is that the reduction variable is the last in statement.  */
5488   int reduc_index = op_type - 1;
5489   if (code == MINUS_EXPR)
5490     reduc_index = 0;
5491
5492   if (code == COND_EXPR && slp_node)
5493     return false;
5494
5495   scalar_dest = gimple_assign_lhs (stmt);
5496   scalar_type = TREE_TYPE (scalar_dest);
5497   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5498       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5499     return false;
5500
5501   /* Do not try to vectorize bit-precision reductions.  */
5502   if ((TYPE_PRECISION (scalar_type)
5503        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5504     return false;
5505
5506   /* All uses but the last are expected to be defined in the loop.
5507      The last use is the reduction variable.  In case of nested cycle this
5508      assumption is not true: we use reduc_index to record the index of the
5509      reduction variable.  */
5510   for (i = 0; i < op_type; i++)
5511     {
5512       if (i == reduc_index)
5513         continue;
5514
5515       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5516       if (i == 0 && code == COND_EXPR)
5517         continue;
5518
5519       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5520                                           &def_stmt, &dt, &tem);
5521       if (!vectype_in)
5522         vectype_in = tem;
5523       gcc_assert (is_simple_use);
5524
5525       if (dt != vect_internal_def
5526           && dt != vect_external_def
5527           && dt != vect_constant_def
5528           && dt != vect_induction_def
5529           && !(dt == vect_nested_cycle && nested_cycle))
5530         return false;
5531
5532       if (dt == vect_nested_cycle)
5533         {
5534           found_nested_cycle_def = true;
5535           reduc_def_stmt = def_stmt;
5536           reduc_index = i;
5537         }
5538
5539       if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5540         cond_expr_induction_def_stmt = def_stmt;
5541     }
5542
5543   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5544                                       &def_stmt, &dt, &tem);
5545   if (!vectype_in)
5546     vectype_in = tem;
5547   gcc_assert (is_simple_use);
5548   if (!found_nested_cycle_def)
5549     reduc_def_stmt = def_stmt;
5550
5551   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5552     return false;
5553
5554   if (!(dt == vect_reduction_def
5555         || dt == vect_nested_cycle
5556         || ((dt == vect_internal_def || dt == vect_external_def
5557              || dt == vect_constant_def || dt == vect_induction_def)
5558             && nested_cycle && found_nested_cycle_def)))
5559     {
5560       /* For pattern recognized stmts, orig_stmt might be a reduction,
5561          but some helper statements for the pattern might not, or
5562          might be COND_EXPRs with reduction uses in the condition.  */
5563       gcc_assert (orig_stmt);
5564       return false;
5565     }
5566
5567   enum vect_reduction_type v_reduc_type;
5568   gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5569                                           !nested_cycle, &dummy, false,
5570                                           &v_reduc_type);
5571
5572   /* If we have a condition reduction, see if we can simplify it further.  */
5573   if (v_reduc_type == COND_REDUCTION
5574       && cond_expr_induction_def_stmt != NULL
5575       && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5576     {
5577       if (dump_enabled_p ())
5578         dump_printf_loc (MSG_NOTE, vect_location,
5579                          "condition expression based on integer induction.\n");
5580       STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5581     }
5582   else
5583    STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5584
5585   if (orig_stmt)
5586     gcc_assert (tmp == orig_stmt
5587                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5588   else
5589     /* We changed STMT to be the first stmt in reduction chain, hence we
5590        check that in this case the first element in the chain is STMT.  */
5591     gcc_assert (stmt == tmp
5592                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5593
5594   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5595     return false;
5596
5597   if (slp_node || PURE_SLP_STMT (stmt_info))
5598     ncopies = 1;
5599   else
5600     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5601                / TYPE_VECTOR_SUBPARTS (vectype_in));
5602
5603   gcc_assert (ncopies >= 1);
5604
5605   vec_mode = TYPE_MODE (vectype_in);
5606
5607   if (code == COND_EXPR)
5608     {
5609       /* Only call during the analysis stage, otherwise we'll lose
5610          STMT_VINFO_TYPE.  */
5611       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5612                                                 ops[reduc_index], 0, NULL))
5613         {
5614           if (dump_enabled_p ())
5615             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5616                              "unsupported condition in reduction\n");
5617           return false;
5618         }
5619     }
5620   else
5621     {
5622       /* 4. Supportable by target?  */
5623
5624       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5625           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5626         {
5627           /* Shifts and rotates are only supported by vectorizable_shifts,
5628              not vectorizable_reduction.  */
5629           if (dump_enabled_p ())
5630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631                              "unsupported shift or rotation.\n");
5632           return false;
5633         }
5634
5635       /* 4.1. check support for the operation in the loop  */
5636       optab = optab_for_tree_code (code, vectype_in, optab_default);
5637       if (!optab)
5638         {
5639           if (dump_enabled_p ())
5640             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5641                              "no optab.\n");
5642
5643           return false;
5644         }
5645
5646       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5647         {
5648           if (dump_enabled_p ())
5649             dump_printf (MSG_NOTE, "op not supported by target.\n");
5650
5651           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5652               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5653                   < vect_min_worthwhile_factor (code))
5654             return false;
5655
5656           if (dump_enabled_p ())
5657             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5658         }
5659
5660       /* Worthwhile without SIMD support?  */
5661       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5662           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5663              < vect_min_worthwhile_factor (code))
5664         {
5665           if (dump_enabled_p ())
5666             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667                              "not worthwhile without SIMD support.\n");
5668
5669           return false;
5670         }
5671     }
5672
5673   /* 4.2. Check support for the epilog operation.
5674
5675           If STMT represents a reduction pattern, then the type of the
5676           reduction variable may be different than the type of the rest
5677           of the arguments.  For example, consider the case of accumulation
5678           of shorts into an int accumulator; The original code:
5679                         S1: int_a = (int) short_a;
5680           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5681
5682           was replaced with:
5683                         STMT: int_acc = widen_sum <short_a, int_acc>
5684
5685           This means that:
5686           1. The tree-code that is used to create the vector operation in the
5687              epilog code (that reduces the partial results) is not the
5688              tree-code of STMT, but is rather the tree-code of the original
5689              stmt from the pattern that STMT is replacing.  I.e, in the example
5690              above we want to use 'widen_sum' in the loop, but 'plus' in the
5691              epilog.
5692           2. The type (mode) we use to check available target support
5693              for the vector operation to be created in the *epilog*, is
5694              determined by the type of the reduction variable (in the example
5695              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5696              However the type (mode) we use to check available target support
5697              for the vector operation to be created *inside the loop*, is
5698              determined by the type of the other arguments to STMT (in the
5699              example we'd check this: optab_handler (widen_sum_optab,
5700              vect_short_mode)).
5701
5702           This is contrary to "regular" reductions, in which the types of all
5703           the arguments are the same as the type of the reduction variable.
5704           For "regular" reductions we can therefore use the same vector type
5705           (and also the same tree-code) when generating the epilog code and
5706           when generating the code inside the loop.  */
5707
5708   if (orig_stmt)
5709     {
5710       /* This is a reduction pattern: get the vectype from the type of the
5711          reduction variable, and get the tree-code from orig_stmt.  */
5712       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5713                   == TREE_CODE_REDUCTION);
5714       orig_code = gimple_assign_rhs_code (orig_stmt);
5715       gcc_assert (vectype_out);
5716       vec_mode = TYPE_MODE (vectype_out);
5717     }
5718   else
5719     {
5720       /* Regular reduction: use the same vectype and tree-code as used for
5721          the vector code inside the loop can be used for the epilog code. */
5722       orig_code = code;
5723
5724       if (code == MINUS_EXPR)
5725         orig_code = PLUS_EXPR;
5726
5727       /* For simple condition reductions, replace with the actual expression
5728          we want to base our reduction around.  */
5729       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5730           == INTEGER_INDUC_COND_REDUCTION)
5731         orig_code = MAX_EXPR;
5732     }
5733
5734   if (nested_cycle)
5735     {
5736       def_bb = gimple_bb (reduc_def_stmt);
5737       def_stmt_loop = def_bb->loop_father;
5738       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5739                                        loop_preheader_edge (def_stmt_loop));
5740       if (TREE_CODE (def_arg) == SSA_NAME
5741           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5742           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5743           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5744           && vinfo_for_stmt (def_arg_stmt)
5745           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5746               == vect_double_reduction_def)
5747         double_reduc = true;
5748     }
5749
5750   epilog_reduc_code = ERROR_MARK;
5751
5752   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5753       || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5754                 == INTEGER_INDUC_COND_REDUCTION)
5755     {
5756       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5757         {
5758           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5759                                          optab_default);
5760           if (!reduc_optab)
5761             {
5762               if (dump_enabled_p ())
5763                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764                                  "no optab for reduction.\n");
5765
5766               epilog_reduc_code = ERROR_MARK;
5767             }
5768           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5769             {
5770               if (dump_enabled_p ())
5771                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5772                                  "reduc op not supported by target.\n");
5773
5774               epilog_reduc_code = ERROR_MARK;
5775             }
5776
5777           /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5778              generated in the epilog using multiple expressions.  This does not
5779              work for condition reductions.  */
5780           if (epilog_reduc_code == ERROR_MARK
5781               && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5782                         == INTEGER_INDUC_COND_REDUCTION)
5783             {
5784               if (dump_enabled_p ())
5785                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786                                  "no reduc code for scalar code.\n");
5787               return false;
5788             }
5789         }
5790       else
5791         {
5792           if (!nested_cycle || double_reduc)
5793             {
5794               if (dump_enabled_p ())
5795                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5796                                  "no reduc code for scalar code.\n");
5797
5798               return false;
5799             }
5800         }
5801     }
5802   else
5803     {
5804       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5805       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5806       cr_index_vector_type = build_vector_type
5807         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5808
5809       epilog_reduc_code = REDUC_MAX_EXPR;
5810       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5811                                    optab_default);
5812       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5813           == CODE_FOR_nothing)
5814         {
5815           if (dump_enabled_p ())
5816             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5817                              "reduc max op not supported by target.\n");
5818           return false;
5819         }
5820     }
5821
5822   if ((double_reduc
5823        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5824        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5825                 == INTEGER_INDUC_COND_REDUCTION)
5826       && ncopies > 1)
5827     {
5828       if (dump_enabled_p ())
5829         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5830                          "multiple types in double reduction or condition "
5831                          "reduction.\n");
5832       return false;
5833     }
5834
5835   /* In case of widenning multiplication by a constant, we update the type
5836      of the constant to be the type of the other operand.  We check that the
5837      constant fits the type in the pattern recognition pass.  */
5838   if (code == DOT_PROD_EXPR
5839       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5840     {
5841       if (TREE_CODE (ops[0]) == INTEGER_CST)
5842         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5843       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5844         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5845       else
5846         {
5847           if (dump_enabled_p ())
5848             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5849                              "invalid types in dot-prod\n");
5850
5851           return false;
5852         }
5853     }
5854
5855   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5856     {
5857       widest_int ni;
5858
5859       if (! max_loop_iterations (loop, &ni))
5860         {
5861           if (dump_enabled_p ())
5862             dump_printf_loc (MSG_NOTE, vect_location,
5863                              "loop count not known, cannot create cond "
5864                              "reduction.\n");
5865           return false;
5866         }
5867       /* Convert backedges to iterations.  */
5868       ni += 1;
5869
5870       /* The additional index will be the same type as the condition.  Check
5871          that the loop can fit into this less one (because we'll use up the
5872          zero slot for when there are no matches).  */
5873       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5874       if (wi::geu_p (ni, wi::to_widest (max_index)))
5875         {
5876           if (dump_enabled_p ())
5877             dump_printf_loc (MSG_NOTE, vect_location,
5878                              "loop size is greater than data size.\n");
5879           return false;
5880         }
5881     }
5882
5883   if (!vec_stmt) /* transformation not required.  */
5884     {
5885       if (first_p
5886           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5887                                          reduc_index))
5888         return false;
5889       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5890       return true;
5891     }
5892
5893   /** Transform.  **/
5894
5895   if (dump_enabled_p ())
5896     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5897
5898   /* FORNOW: Multiple types are not supported for condition.  */
5899   if (code == COND_EXPR)
5900     gcc_assert (ncopies == 1);
5901
5902   /* Create the destination vector  */
5903   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5904
5905   /* In case the vectorization factor (VF) is bigger than the number
5906      of elements that we can fit in a vectype (nunits), we have to generate
5907      more than one vector stmt - i.e - we need to "unroll" the
5908      vector stmt by a factor VF/nunits.  For more details see documentation
5909      in vectorizable_operation.  */
5910
5911   /* If the reduction is used in an outer loop we need to generate
5912      VF intermediate results, like so (e.g. for ncopies=2):
5913         r0 = phi (init, r0)
5914         r1 = phi (init, r1)
5915         r0 = x0 + r0;
5916         r1 = x1 + r1;
5917     (i.e. we generate VF results in 2 registers).
5918     In this case we have a separate def-use cycle for each copy, and therefore
5919     for each copy we get the vector def for the reduction variable from the
5920     respective phi node created for this copy.
5921
5922     Otherwise (the reduction is unused in the loop nest), we can combine
5923     together intermediate results, like so (e.g. for ncopies=2):
5924         r = phi (init, r)
5925         r = x0 + r;
5926         r = x1 + r;
5927    (i.e. we generate VF/2 results in a single register).
5928    In this case for each copy we get the vector def for the reduction variable
5929    from the vectorized reduction operation generated in the previous iteration.
5930   */
5931
5932   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5933     {
5934       single_defuse_cycle = true;
5935       epilog_copies = 1;
5936     }
5937   else
5938     epilog_copies = ncopies;
5939
5940   prev_stmt_info = NULL;
5941   prev_phi_info = NULL;
5942   if (slp_node)
5943     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5944   else
5945     {
5946       vec_num = 1;
5947       vec_oprnds0.create (1);
5948       if (op_type == ternary_op)
5949         vec_oprnds1.create (1);
5950     }
5951
5952   phis.create (vec_num);
5953   vect_defs.create (vec_num);
5954   if (!slp_node)
5955     vect_defs.quick_push (NULL_TREE);
5956
5957   for (j = 0; j < ncopies; j++)
5958     {
5959       if (j == 0 || !single_defuse_cycle)
5960         {
5961           for (i = 0; i < vec_num; i++)
5962             {
5963               /* Create the reduction-phi that defines the reduction
5964                  operand.  */
5965               new_phi = create_phi_node (vec_dest, loop->header);
5966               set_vinfo_for_stmt (new_phi,
5967                                   new_stmt_vec_info (new_phi, loop_vinfo));
5968                if (j == 0 || slp_node)
5969                  phis.quick_push (new_phi);
5970             }
5971         }
5972
5973       if (code == COND_EXPR)
5974         {
5975           gcc_assert (!slp_node);
5976           vectorizable_condition (stmt, gsi, vec_stmt,
5977                                   PHI_RESULT (phis[0]),
5978                                   reduc_index, NULL);
5979           /* Multiple types are not supported for condition.  */
5980           break;
5981         }
5982
5983       /* Handle uses.  */
5984       if (j == 0)
5985         {
5986           op0 = ops[!reduc_index];
5987           if (op_type == ternary_op)
5988             {
5989               if (reduc_index == 0)
5990                 op1 = ops[2];
5991               else
5992                 op1 = ops[1];
5993             }
5994
5995           if (slp_node)
5996             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5997                                slp_node, -1);
5998           else
5999             {
6000               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6001                                                             stmt);
6002               vec_oprnds0.quick_push (loop_vec_def0);
6003               if (op_type == ternary_op)
6004                {
6005                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6006                  vec_oprnds1.quick_push (loop_vec_def1);
6007                }
6008             }
6009         }
6010       else
6011         {
6012           if (!slp_node)
6013             {
6014               enum vect_def_type dt;
6015               gimple *dummy_stmt;
6016
6017               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6018                                   &dummy_stmt, &dt);
6019               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6020                                                               loop_vec_def0);
6021               vec_oprnds0[0] = loop_vec_def0;
6022               if (op_type == ternary_op)
6023                 {
6024                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6025                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6026                                                                 loop_vec_def1);
6027                   vec_oprnds1[0] = loop_vec_def1;
6028                 }
6029             }
6030
6031           if (single_defuse_cycle)
6032             reduc_def = gimple_assign_lhs (new_stmt);
6033
6034           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6035         }
6036
6037       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6038         {
6039           if (slp_node)
6040             reduc_def = PHI_RESULT (phis[i]);
6041           else
6042             {
6043               if (!single_defuse_cycle || j == 0)
6044                 reduc_def = PHI_RESULT (new_phi);
6045             }
6046
6047           def1 = ((op_type == ternary_op)
6048                   ? vec_oprnds1[i] : NULL);
6049           if (op_type == binary_op)
6050             {
6051               if (reduc_index == 0)
6052                 expr = build2 (code, vectype_out, reduc_def, def0);
6053               else
6054                 expr = build2 (code, vectype_out, def0, reduc_def);
6055             }
6056           else
6057             {
6058               if (reduc_index == 0)
6059                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6060               else
6061                 {
6062                   if (reduc_index == 1)
6063                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
6064                   else
6065                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
6066                 }
6067             }
6068
6069           new_stmt = gimple_build_assign (vec_dest, expr);
6070           new_temp = make_ssa_name (vec_dest, new_stmt);
6071           gimple_assign_set_lhs (new_stmt, new_temp);
6072           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6073
6074           if (slp_node)
6075             {
6076               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6077               vect_defs.quick_push (new_temp);
6078             }
6079           else
6080             vect_defs[0] = new_temp;
6081         }
6082
6083       if (slp_node)
6084         continue;
6085
6086       if (j == 0)
6087         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6088       else
6089         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6090
6091       prev_stmt_info = vinfo_for_stmt (new_stmt);
6092       prev_phi_info = vinfo_for_stmt (new_phi);
6093     }
6094
6095   tree indx_before_incr, indx_after_incr, cond_name = NULL;
6096
6097   /* Finalize the reduction-phi (set its arguments) and create the
6098      epilog reduction code.  */
6099   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6100     {
6101       new_temp = gimple_assign_lhs (*vec_stmt);
6102       vect_defs[0] = new_temp;
6103
6104       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6105          which is updated with the current index of the loop for every match of
6106          the original loop's cond_expr (VEC_STMT).  This results in a vector
6107          containing the last time the condition passed for that vector lane.
6108          The first match will be a 1 to allow 0 to be used for non-matching
6109          indexes.  If there are no matches at all then the vector will be all
6110          zeroes.  */
6111       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6112         {
6113           int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6114           int k;
6115
6116           gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6117
6118           /* First we create a simple vector induction variable which starts
6119              with the values {1,2,3,...} (SERIES_VECT) and increments by the
6120              vector size (STEP).  */
6121
6122           /* Create a {1,2,3,...} vector.  */
6123           tree *vtemp = XALLOCAVEC (tree, nunits_out);
6124           for (k = 0; k < nunits_out; ++k)
6125             vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6126           tree series_vect = build_vector (cr_index_vector_type, vtemp);
6127
6128           /* Create a vector of the step value.  */
6129           tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6130           tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6131
6132           /* Create an induction variable.  */
6133           gimple_stmt_iterator incr_gsi;
6134           bool insert_after;
6135           standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6136           create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6137                      insert_after, &indx_before_incr, &indx_after_incr);
6138
6139           /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6140              filled with zeros (VEC_ZERO).  */
6141
6142           /* Create a vector of 0s.  */
6143           tree zero = build_zero_cst (cr_index_scalar_type);
6144           tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6145
6146           /* Create a vector phi node.  */
6147           tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6148           new_phi = create_phi_node (new_phi_tree, loop->header);
6149           set_vinfo_for_stmt (new_phi,
6150                               new_stmt_vec_info (new_phi, loop_vinfo));
6151           add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6152                        UNKNOWN_LOCATION);
6153
6154           /* Now take the condition from the loops original cond_expr
6155              (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6156              every match uses values from the induction variable
6157              (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6158              (NEW_PHI_TREE).
6159              Finally, we update the phi (NEW_PHI_TREE) to take the value of
6160              the new cond_expr (INDEX_COND_EXPR).  */
6161
6162           /* Turn the condition from vec_stmt into an ssa name.  */
6163           gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6164           tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6165           tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6166           gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6167                                                        ccompare);
6168           gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6169           gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6170           update_stmt (*vec_stmt);
6171
6172           /* Create a conditional, where the condition is taken from vec_stmt
6173              (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
6174              and else is the phi (NEW_PHI_TREE).  */
6175           tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6176                                          ccompare_name, indx_before_incr,
6177                                          new_phi_tree);
6178           cond_name = make_ssa_name (cr_index_vector_type);
6179           gimple *index_condition = gimple_build_assign (cond_name,
6180                                                          index_cond_expr);
6181           gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6182           stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6183                                                             loop_vinfo);
6184           STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6185           set_vinfo_for_stmt (index_condition, index_vec_info);
6186
6187           /* Update the phi with the vec cond.  */
6188           add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6189                        UNKNOWN_LOCATION);
6190         }
6191     }
6192
6193   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6194                                     epilog_reduc_code, phis, reduc_index,
6195                                     double_reduc, slp_node, cond_name);
6196
6197   return true;
6198 }
6199
6200 /* Function vect_min_worthwhile_factor.
6201
6202    For a loop where we could vectorize the operation indicated by CODE,
6203    return the minimum vectorization factor that makes it worthwhile
6204    to use generic vectors.  */
6205 int
6206 vect_min_worthwhile_factor (enum tree_code code)
6207 {
6208   switch (code)
6209     {
6210     case PLUS_EXPR:
6211     case MINUS_EXPR:
6212     case NEGATE_EXPR:
6213       return 4;
6214
6215     case BIT_AND_EXPR:
6216     case BIT_IOR_EXPR:
6217     case BIT_XOR_EXPR:
6218     case BIT_NOT_EXPR:
6219       return 2;
6220
6221     default:
6222       return INT_MAX;
6223     }
6224 }
6225
6226
6227 /* Function vectorizable_induction
6228
6229    Check if PHI performs an induction computation that can be vectorized.
6230    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6231    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6232    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6233
6234 bool
6235 vectorizable_induction (gimple *phi,
6236                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6237                         gimple **vec_stmt)
6238 {
6239   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6240   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6241   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6242   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6243   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6244   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6245   tree vec_def;
6246
6247   gcc_assert (ncopies >= 1);
6248   /* FORNOW. These restrictions should be relaxed.  */
6249   if (nested_in_vect_loop_p (loop, phi))
6250     {
6251       imm_use_iterator imm_iter;
6252       use_operand_p use_p;
6253       gimple *exit_phi;
6254       edge latch_e;
6255       tree loop_arg;
6256
6257       if (ncopies > 1)
6258         {
6259           if (dump_enabled_p ())
6260             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261                              "multiple types in nested loop.\n");
6262           return false;
6263         }
6264
6265       exit_phi = NULL;
6266       latch_e = loop_latch_edge (loop->inner);
6267       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6268       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6269         {
6270           gimple *use_stmt = USE_STMT (use_p);
6271           if (is_gimple_debug (use_stmt))
6272             continue;
6273
6274           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6275             {
6276               exit_phi = use_stmt;
6277               break;
6278             }
6279         }
6280       if (exit_phi)
6281         {
6282           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6283           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6284                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6285             {
6286               if (dump_enabled_p ())
6287                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6288                                  "inner-loop induction only used outside "
6289                                  "of the outer vectorized loop.\n");
6290               return false;
6291             }
6292         }
6293     }
6294
6295   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6296     return false;
6297
6298   /* FORNOW: SLP not supported.  */
6299   if (STMT_SLP_TYPE (stmt_info))
6300     return false;
6301
6302   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6303
6304   if (gimple_code (phi) != GIMPLE_PHI)
6305     return false;
6306
6307   if (!vec_stmt) /* transformation not required.  */
6308     {
6309       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6310       if (dump_enabled_p ())
6311         dump_printf_loc (MSG_NOTE, vect_location,
6312                          "=== vectorizable_induction ===\n");
6313       vect_model_induction_cost (stmt_info, ncopies);
6314       return true;
6315     }
6316
6317   /** Transform.  **/
6318
6319   if (dump_enabled_p ())
6320     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6321
6322   vec_def = get_initial_def_for_induction (phi);
6323   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6324   return true;
6325 }
6326
6327 /* Function vectorizable_live_operation.
6328
6329    STMT computes a value that is used outside the loop.  Check if
6330    it can be supported.  */
6331
6332 bool
6333 vectorizable_live_operation (gimple *stmt,
6334                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6335                              gimple **vec_stmt)
6336 {
6337   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6338   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6339   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6340   tree op;
6341   gimple *def_stmt;
6342   ssa_op_iter iter;
6343
6344   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6345
6346   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6347     return false;
6348
6349   if (!is_gimple_assign (stmt))
6350     {
6351       if (gimple_call_internal_p (stmt)
6352           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6353           && gimple_call_lhs (stmt)
6354           && loop->simduid
6355           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6356           && loop->simduid
6357              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6358         {
6359           edge e = single_exit (loop);
6360           basic_block merge_bb = e->dest;
6361           imm_use_iterator imm_iter;
6362           use_operand_p use_p;
6363           tree lhs = gimple_call_lhs (stmt);
6364
6365           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6366             {
6367               gimple *use_stmt = USE_STMT (use_p);
6368               if (gimple_code (use_stmt) == GIMPLE_PHI
6369                   && gimple_bb (use_stmt) == merge_bb)
6370                 {
6371                   if (vec_stmt)
6372                     {
6373                       tree vfm1
6374                         = build_int_cst (unsigned_type_node,
6375                                          loop_vinfo->vectorization_factor - 1);
6376                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6377                     }
6378                   return true;
6379                 }
6380             }
6381         }
6382
6383       return false;
6384     }
6385
6386   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6387     return false;
6388
6389   /* FORNOW. CHECKME. */
6390   if (nested_in_vect_loop_p (loop, stmt))
6391     return false;
6392
6393   /* FORNOW: support only if all uses are invariant.  This means
6394      that the scalar operations can remain in place, unvectorized.
6395      The original last scalar value that they compute will be used.  */
6396   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6397     {
6398       enum vect_def_type dt = vect_uninitialized_def;
6399
6400       if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6401         {
6402           if (dump_enabled_p ())
6403             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6404                              "use not simple.\n");
6405           return false;
6406         }
6407
6408       if (dt != vect_external_def && dt != vect_constant_def)
6409         return false;
6410     }
6411
6412   /* No transformation is required for the cases we currently support.  */
6413   return true;
6414 }
6415
6416 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6417
6418 static void
6419 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6420 {
6421   ssa_op_iter op_iter;
6422   imm_use_iterator imm_iter;
6423   def_operand_p def_p;
6424   gimple *ustmt;
6425
6426   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6427     {
6428       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6429         {
6430           basic_block bb;
6431
6432           if (!is_gimple_debug (ustmt))
6433             continue;
6434
6435           bb = gimple_bb (ustmt);
6436
6437           if (!flow_bb_inside_loop_p (loop, bb))
6438             {
6439               if (gimple_debug_bind_p (ustmt))
6440                 {
6441                   if (dump_enabled_p ())
6442                     dump_printf_loc (MSG_NOTE, vect_location,
6443                                      "killing debug use\n");
6444
6445                   gimple_debug_bind_reset_value (ustmt);
6446                   update_stmt (ustmt);
6447                 }
6448               else
6449                 gcc_unreachable ();
6450             }
6451         }
6452     }
6453 }
6454
6455
6456 /* This function builds ni_name = number of iterations.  Statements
6457    are emitted on the loop preheader edge.  */
6458
6459 static tree
6460 vect_build_loop_niters (loop_vec_info loop_vinfo)
6461 {
6462   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6463   if (TREE_CODE (ni) == INTEGER_CST)
6464     return ni;
6465   else
6466     {
6467       tree ni_name, var;
6468       gimple_seq stmts = NULL;
6469       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6470
6471       var = create_tmp_var (TREE_TYPE (ni), "niters");
6472       ni_name = force_gimple_operand (ni, &stmts, false, var);
6473       if (stmts)
6474         gsi_insert_seq_on_edge_immediate (pe, stmts);
6475
6476       return ni_name;
6477     }
6478 }
6479
6480
6481 /* This function generates the following statements:
6482
6483    ni_name = number of iterations loop executes
6484    ratio = ni_name / vf
6485    ratio_mult_vf_name = ratio * vf
6486
6487    and places them on the loop preheader edge.  */
6488
6489 static void
6490 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6491                                  tree ni_name,
6492                                  tree *ratio_mult_vf_name_ptr,
6493                                  tree *ratio_name_ptr)
6494 {
6495   tree ni_minus_gap_name;
6496   tree var;
6497   tree ratio_name;
6498   tree ratio_mult_vf_name;
6499   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6500   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6501   tree log_vf;
6502
6503   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6504
6505   /* If epilogue loop is required because of data accesses with gaps, we
6506      subtract one iteration from the total number of iterations here for
6507      correct calculation of RATIO.  */
6508   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6509     {
6510       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6511                                        ni_name,
6512                                        build_one_cst (TREE_TYPE (ni_name)));
6513       if (!is_gimple_val (ni_minus_gap_name))
6514         {
6515           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6516           gimple *stmts = NULL;
6517           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6518                                                     true, var);
6519           gsi_insert_seq_on_edge_immediate (pe, stmts);
6520         }
6521     }
6522   else
6523     ni_minus_gap_name = ni_name;
6524
6525   /* Create: ratio = ni >> log2(vf) */
6526   /* ???  As we have ni == number of latch executions + 1, ni could
6527      have overflown to zero.  So avoid computing ratio based on ni
6528      but compute it using the fact that we know ratio will be at least
6529      one, thus via (ni - vf) >> log2(vf) + 1.  */
6530   ratio_name
6531     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6532                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6533                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6534                                              ni_minus_gap_name,
6535                                              build_int_cst
6536                                                (TREE_TYPE (ni_name), vf)),
6537                                 log_vf),
6538                    build_int_cst (TREE_TYPE (ni_name), 1));
6539   if (!is_gimple_val (ratio_name))
6540     {
6541       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6542       gimple *stmts = NULL;
6543       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6544       gsi_insert_seq_on_edge_immediate (pe, stmts);
6545     }
6546   *ratio_name_ptr = ratio_name;
6547
6548   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6549
6550   if (ratio_mult_vf_name_ptr)
6551     {
6552       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6553                                         ratio_name, log_vf);
6554       if (!is_gimple_val (ratio_mult_vf_name))
6555         {
6556           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6557           gimple *stmts = NULL;
6558           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6559                                                      true, var);
6560           gsi_insert_seq_on_edge_immediate (pe, stmts);
6561         }
6562       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6563     }
6564
6565   return;
6566 }
6567
6568
6569 /* Function vect_transform_loop.
6570
6571    The analysis phase has determined that the loop is vectorizable.
6572    Vectorize the loop - created vectorized stmts to replace the scalar
6573    stmts in the loop, and update the loop exit condition.  */
6574
6575 void
6576 vect_transform_loop (loop_vec_info loop_vinfo)
6577 {
6578   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6579   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6580   int nbbs = loop->num_nodes;
6581   int i;
6582   tree ratio = NULL;
6583   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6584   bool grouped_store;
6585   bool slp_scheduled = false;
6586   gimple *stmt, *pattern_stmt;
6587   gimple_seq pattern_def_seq = NULL;
6588   gimple_stmt_iterator pattern_def_si = gsi_none ();
6589   bool transform_pattern_stmt = false;
6590   bool check_profitability = false;
6591   int th;
6592   /* Record number of iterations before we started tampering with the profile. */
6593   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6594
6595   if (dump_enabled_p ())
6596     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6597
6598   /* If profile is inprecise, we have chance to fix it up.  */
6599   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6600     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6601
6602   /* Use the more conservative vectorization threshold.  If the number
6603      of iterations is constant assume the cost check has been performed
6604      by our caller.  If the threshold makes all loops profitable that
6605      run at least the vectorization factor number of times checking
6606      is pointless, too.  */
6607   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6608   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6609       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6610     {
6611       if (dump_enabled_p ())
6612         dump_printf_loc (MSG_NOTE, vect_location,
6613                          "Profitability threshold is %d loop iterations.\n",
6614                          th);
6615       check_profitability = true;
6616     }
6617
6618   /* Version the loop first, if required, so the profitability check
6619      comes first.  */
6620
6621   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6622       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6623     {
6624       vect_loop_versioning (loop_vinfo, th, check_profitability);
6625       check_profitability = false;
6626     }
6627
6628   tree ni_name = vect_build_loop_niters (loop_vinfo);
6629   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6630
6631   /* Peel the loop if there are data refs with unknown alignment.
6632      Only one data ref with unknown store is allowed.  */
6633
6634   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6635     {
6636       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6637                                      th, check_profitability);
6638       check_profitability = false;
6639       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6640          be re-computed.  */
6641       ni_name = NULL_TREE;
6642     }
6643
6644   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6645      compile time constant), or it is a constant that doesn't divide by the
6646      vectorization factor, then an epilog loop needs to be created.
6647      We therefore duplicate the loop: the original loop will be vectorized,
6648      and will compute the first (n/VF) iterations.  The second copy of the loop
6649      will remain scalar and will compute the remaining (n%VF) iterations.
6650      (VF is the vectorization factor).  */
6651
6652   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6653       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6654     {
6655       tree ratio_mult_vf;
6656       if (!ni_name)
6657         ni_name = vect_build_loop_niters (loop_vinfo);
6658       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6659                                        &ratio);
6660       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6661                                       th, check_profitability);
6662     }
6663   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6664     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6665                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6666   else
6667     {
6668       if (!ni_name)
6669         ni_name = vect_build_loop_niters (loop_vinfo);
6670       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6671     }
6672
6673   /* 1) Make sure the loop header has exactly two entries
6674      2) Make sure we have a preheader basic block.  */
6675
6676   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6677
6678   split_edge (loop_preheader_edge (loop));
6679
6680   /* FORNOW: the vectorizer supports only loops which body consist
6681      of one basic block (header + empty latch). When the vectorizer will
6682      support more involved loop forms, the order by which the BBs are
6683      traversed need to be reconsidered.  */
6684
6685   for (i = 0; i < nbbs; i++)
6686     {
6687       basic_block bb = bbs[i];
6688       stmt_vec_info stmt_info;
6689
6690       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6691            gsi_next (&si))
6692         {
6693           gphi *phi = si.phi ();
6694           if (dump_enabled_p ())
6695             {
6696               dump_printf_loc (MSG_NOTE, vect_location,
6697                                "------>vectorizing phi: ");
6698               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6699               dump_printf (MSG_NOTE, "\n");
6700             }
6701           stmt_info = vinfo_for_stmt (phi);
6702           if (!stmt_info)
6703             continue;
6704
6705           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6706             vect_loop_kill_debug_uses (loop, phi);
6707
6708           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6709               && !STMT_VINFO_LIVE_P (stmt_info))
6710             continue;
6711
6712           if (STMT_VINFO_VECTYPE (stmt_info)
6713               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6714                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6715               && dump_enabled_p ())
6716             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6717
6718           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6719             {
6720               if (dump_enabled_p ())
6721                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6722               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6723             }
6724         }
6725
6726       pattern_stmt = NULL;
6727       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6728            !gsi_end_p (si) || transform_pattern_stmt;)
6729         {
6730           bool is_store;
6731
6732           if (transform_pattern_stmt)
6733             stmt = pattern_stmt;
6734           else
6735             {
6736               stmt = gsi_stmt (si);
6737               /* During vectorization remove existing clobber stmts.  */
6738               if (gimple_clobber_p (stmt))
6739                 {
6740                   unlink_stmt_vdef (stmt);
6741                   gsi_remove (&si, true);
6742                   release_defs (stmt);
6743                   continue;
6744                 }
6745             }
6746
6747           if (dump_enabled_p ())
6748             {
6749               dump_printf_loc (MSG_NOTE, vect_location,
6750                                "------>vectorizing statement: ");
6751               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6752               dump_printf (MSG_NOTE, "\n");
6753             }
6754
6755           stmt_info = vinfo_for_stmt (stmt);
6756
6757           /* vector stmts created in the outer-loop during vectorization of
6758              stmts in an inner-loop may not have a stmt_info, and do not
6759              need to be vectorized.  */
6760           if (!stmt_info)
6761             {
6762               gsi_next (&si);
6763               continue;
6764             }
6765
6766           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6767             vect_loop_kill_debug_uses (loop, stmt);
6768
6769           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6770               && !STMT_VINFO_LIVE_P (stmt_info))
6771             {
6772               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6773                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6774                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6775                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6776                 {
6777                   stmt = pattern_stmt;
6778                   stmt_info = vinfo_for_stmt (stmt);
6779                 }
6780               else
6781                 {
6782                   gsi_next (&si);
6783                   continue;
6784                 }
6785             }
6786           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6787                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6788                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6789                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6790             transform_pattern_stmt = true;
6791
6792           /* If pattern statement has def stmts, vectorize them too.  */
6793           if (is_pattern_stmt_p (stmt_info))
6794             {
6795               if (pattern_def_seq == NULL)
6796                 {
6797                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6798                   pattern_def_si = gsi_start (pattern_def_seq);
6799                 }
6800               else if (!gsi_end_p (pattern_def_si))
6801                 gsi_next (&pattern_def_si);
6802               if (pattern_def_seq != NULL)
6803                 {
6804                   gimple *pattern_def_stmt = NULL;
6805                   stmt_vec_info pattern_def_stmt_info = NULL;
6806
6807                   while (!gsi_end_p (pattern_def_si))
6808                     {
6809                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6810                       pattern_def_stmt_info
6811                         = vinfo_for_stmt (pattern_def_stmt);
6812                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6813                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6814                         break;
6815                       gsi_next (&pattern_def_si);
6816                     }
6817
6818                   if (!gsi_end_p (pattern_def_si))
6819                     {
6820                       if (dump_enabled_p ())
6821                         {
6822                           dump_printf_loc (MSG_NOTE, vect_location,
6823                                            "==> vectorizing pattern def "
6824                                            "stmt: ");
6825                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6826                                             pattern_def_stmt, 0);
6827                           dump_printf (MSG_NOTE, "\n");
6828                         }
6829
6830                       stmt = pattern_def_stmt;
6831                       stmt_info = pattern_def_stmt_info;
6832                     }
6833                   else
6834                     {
6835                       pattern_def_si = gsi_none ();
6836                       transform_pattern_stmt = false;
6837                     }
6838                 }
6839               else
6840                 transform_pattern_stmt = false;
6841             }
6842
6843           if (STMT_VINFO_VECTYPE (stmt_info))
6844             {
6845               unsigned int nunits
6846                 = (unsigned int)
6847                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6848               if (!STMT_SLP_TYPE (stmt_info)
6849                   && nunits != (unsigned int) vectorization_factor
6850                   && dump_enabled_p ())
6851                   /* For SLP VF is set according to unrolling factor, and not
6852                      to vector size, hence for SLP this print is not valid.  */
6853                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6854             }
6855
6856           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6857              reached.  */
6858           if (STMT_SLP_TYPE (stmt_info))
6859             {
6860               if (!slp_scheduled)
6861                 {
6862                   slp_scheduled = true;
6863
6864                   if (dump_enabled_p ())
6865                     dump_printf_loc (MSG_NOTE, vect_location,
6866                                      "=== scheduling SLP instances ===\n");
6867
6868                   vect_schedule_slp (loop_vinfo);
6869                 }
6870
6871               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6872               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6873                 {
6874                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6875                     {
6876                       pattern_def_seq = NULL;
6877                       gsi_next (&si);
6878                     }
6879                   continue;
6880                 }
6881             }
6882
6883           /* -------- vectorize statement ------------ */
6884           if (dump_enabled_p ())
6885             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6886
6887           grouped_store = false;
6888           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6889           if (is_store)
6890             {
6891               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6892                 {
6893                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6894                      interleaving chain was completed - free all the stores in
6895                      the chain.  */
6896                   gsi_next (&si);
6897                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6898                 }
6899               else
6900                 {
6901                   /* Free the attached stmt_vec_info and remove the stmt.  */
6902                   gimple *store = gsi_stmt (si);
6903                   free_stmt_vec_info (store);
6904                   unlink_stmt_vdef (store);
6905                   gsi_remove (&si, true);
6906                   release_defs (store);
6907                 }
6908
6909               /* Stores can only appear at the end of pattern statements.  */
6910               gcc_assert (!transform_pattern_stmt);
6911               pattern_def_seq = NULL;
6912             }
6913           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6914             {
6915               pattern_def_seq = NULL;
6916               gsi_next (&si);
6917             }
6918         }                       /* stmts in BB */
6919     }                           /* BBs in loop */
6920
6921   slpeel_make_loop_iterate_ntimes (loop, ratio);
6922
6923   /* Reduce loop iterations by the vectorization factor.  */
6924   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6925                       expected_iterations / vectorization_factor);
6926   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6927       && loop->nb_iterations_upper_bound != 0)
6928     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6929   loop->nb_iterations_upper_bound
6930     = wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
6931                       vectorization_factor) - 1;
6932
6933   if (loop->any_estimate)
6934     {
6935       loop->nb_iterations_estimate
6936         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6937        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6938            && loop->nb_iterations_estimate != 0)
6939          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6940     }
6941
6942   if (dump_enabled_p ())
6943     {
6944       dump_printf_loc (MSG_NOTE, vect_location,
6945                        "LOOP VECTORIZED\n");
6946       if (loop->inner)
6947         dump_printf_loc (MSG_NOTE, vect_location,
6948                          "OUTER LOOP VECTORIZED\n");
6949       dump_printf (MSG_NOTE, "\n");
6950     }
6951
6952   /* Free SLP instances here because otherwise stmt reference counting
6953      won't work.  */
6954   slp_instance instance;
6955   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
6956     vect_free_slp_instance (instance);
6957   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
6958 }
6959
6960 /* The code below is trying to perform simple optimization - revert
6961    if-conversion for masked stores, i.e. if the mask of a store is zero
6962    do not perform it and all stored value producers also if possible.
6963    For example,
6964      for (i=0; i<n; i++)
6965        if (c[i])
6966         {
6967           p1[i] += 1;
6968           p2[i] = p3[i] +2;
6969         }
6970    this transformation will produce the following semi-hammock:
6971
6972    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6973      {
6974        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6975        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6976        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6977        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6978        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6979        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
6980      }
6981 */
6982
6983 void
6984 optimize_mask_stores (struct loop *loop)
6985 {
6986   basic_block *bbs = get_loop_body (loop);
6987   unsigned nbbs = loop->num_nodes;
6988   unsigned i;
6989   basic_block bb;
6990   gimple_stmt_iterator gsi;
6991   gimple *stmt;
6992   auto_vec<gimple *> worklist;
6993
6994   vect_location = find_loop_location (loop);
6995   /* Pick up all masked stores in loop if any.  */
6996   for (i = 0; i < nbbs; i++)
6997     {
6998       bb = bbs[i];
6999       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7000            gsi_next (&gsi))
7001         {
7002           stmt = gsi_stmt (gsi);
7003           if (is_gimple_call (stmt)
7004               && gimple_call_internal_p (stmt)
7005               && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7006             worklist.safe_push (stmt);
7007         }
7008     }
7009
7010   free (bbs);
7011   if (worklist.is_empty ())
7012     return;
7013
7014   /* Loop has masked stores.  */
7015   while (!worklist.is_empty ())
7016     {
7017       gimple *last, *last_store;
7018       edge e, efalse;
7019       tree mask;
7020       basic_block store_bb, join_bb;
7021       gimple_stmt_iterator gsi_to;
7022       tree vdef, new_vdef;
7023       gphi *phi;
7024       tree vectype;
7025       tree zero;
7026
7027       last = worklist.pop ();
7028       mask = gimple_call_arg (last, 2);
7029       bb = gimple_bb (last);
7030       /* Create new bb.  */
7031       e = split_block (bb, last);
7032       join_bb = e->dest;
7033       store_bb = create_empty_bb (bb);
7034       add_bb_to_loop (store_bb, loop);
7035       e->flags = EDGE_TRUE_VALUE;
7036       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7037       /* Put STORE_BB to likely part.  */
7038       efalse->probability = PROB_UNLIKELY;
7039       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7040       make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7041       if (dom_info_available_p (CDI_DOMINATORS))
7042         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7043       if (dump_enabled_p ())
7044         dump_printf_loc (MSG_NOTE, vect_location,
7045                          "Create new block %d to sink mask stores.",
7046                          store_bb->index);
7047       /* Create vector comparison with boolean result.  */
7048       vectype = TREE_TYPE (mask);
7049       zero = build_zero_cst (vectype);
7050       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7051       gsi = gsi_last_bb (bb);
7052       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7053       /* Create new PHI node for vdef of the last masked store:
7054          .MEM_2 = VDEF <.MEM_1>
7055          will be converted to
7056          .MEM.3 = VDEF <.MEM_1>
7057          and new PHI node will be created in join bb
7058          .MEM_2 = PHI <.MEM_1, .MEM_3>
7059       */
7060       vdef = gimple_vdef (last);
7061       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7062       gimple_set_vdef (last, new_vdef);
7063       phi = create_phi_node (vdef, join_bb);
7064       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7065
7066       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7067       while (true)
7068         {
7069           gimple_stmt_iterator gsi_from;
7070           gimple *stmt1 = NULL;
7071
7072           /* Move masked store to STORE_BB.  */
7073           last_store = last;
7074           gsi = gsi_for_stmt (last);
7075           gsi_from = gsi;
7076           /* Shift GSI to the previous stmt for further traversal.  */
7077           gsi_prev (&gsi);
7078           gsi_to = gsi_start_bb (store_bb);
7079           gsi_move_before (&gsi_from, &gsi_to);
7080           /* Setup GSI_TO to the non-empty block start.  */
7081           gsi_to = gsi_start_bb (store_bb);
7082           if (dump_enabled_p ())
7083             {
7084               dump_printf_loc (MSG_NOTE, vect_location,
7085                                "Move stmt to created bb\n");
7086               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7087             }
7088           /* Move all stored value producers if possible.  */
7089           while (!gsi_end_p (gsi))
7090             {
7091               tree lhs;
7092               imm_use_iterator imm_iter;
7093               use_operand_p use_p;
7094               bool res;
7095
7096               /* Skip debug statements.  */
7097               if (is_gimple_debug (gsi_stmt (gsi)))
7098                 {
7099                   gsi_prev (&gsi);
7100                   continue;
7101                 }
7102               stmt1 = gsi_stmt (gsi);
7103               /* Do not consider statements writing to memory or having
7104                  volatile operand.  */
7105               if (gimple_vdef (stmt1)
7106                   || gimple_has_volatile_ops (stmt1))
7107                 break;
7108               gsi_from = gsi;
7109               gsi_prev (&gsi);
7110               lhs = gimple_get_lhs (stmt1);
7111               if (!lhs)
7112                 break;
7113
7114               /* LHS of vectorized stmt must be SSA_NAME.  */
7115               if (TREE_CODE (lhs) != SSA_NAME)
7116                 break;
7117
7118               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7119                 {
7120                   /* Remove dead scalar statement.  */
7121                   if (has_zero_uses (lhs))
7122                     {
7123                       gsi_remove (&gsi_from, true);
7124                       continue;
7125                     }
7126                 }
7127
7128               /* Check that LHS does not have uses outside of STORE_BB.  */
7129               res = true;
7130               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7131                 {
7132                   gimple *use_stmt;
7133                   use_stmt = USE_STMT (use_p);
7134                   if (is_gimple_debug (use_stmt))
7135                     continue;
7136                   if (gimple_bb (use_stmt) != store_bb)
7137                     {
7138                       res = false;
7139                       break;
7140                     }
7141                 }
7142               if (!res)
7143                 break;
7144
7145               if (gimple_vuse (stmt1)
7146                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7147                 break;
7148
7149               /* Can move STMT1 to STORE_BB.  */
7150               if (dump_enabled_p ())
7151                 {
7152                   dump_printf_loc (MSG_NOTE, vect_location,
7153                                    "Move stmt to created bb\n");
7154                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7155                 }
7156               gsi_move_before (&gsi_from, &gsi_to);
7157               /* Shift GSI_TO for further insertion.  */
7158               gsi_prev (&gsi_to);
7159             }
7160           /* Put other masked stores with the same mask to STORE_BB.  */
7161           if (worklist.is_empty ()
7162               || gimple_call_arg (worklist.last (), 2) != mask
7163               || worklist.last () != stmt1)
7164             break;
7165           last = worklist.pop ();
7166         }
7167       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7168     }
7169 }