gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "cfgloop.h"
  45 #include "params.h"
  46 #include "tree-scalar-evolution.h"
  47 #include "tree-vectorizer.h"
  48 #include "gimple-fold.h"
  49 #include "cgraph.h"
  50
  51 /* Loop Vectorization Pass.
  52
  53    This pass tries to vectorize loops.
  54
  55    For example, the vectorizer transforms the following simple loop:
  56
  57         short a[N]; short b[N]; short c[N]; int i;
  58
  59         for (i=0; i<N; i++){
  60           a[i] = b[i] + c[i];
  61         }
  62
  63    as if it was manually vectorized by rewriting the source code into:
  64
  65         typedef int __attribute__((mode(V8HI))) v8hi;
  66         short a[N];  short b[N]; short c[N];   int i;
  67         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  68         v8hi va, vb, vc;
  69
  70         for (i=0; i<N/8; i++){
  71           vb = pb[i];
  72           vc = pc[i];
  73           va = vb + vc;
  74           pa[i] = va;
  75         }
  76
  77         The main entry to this pass is vectorize_loops(), in which
  78    the vectorizer applies a set of analyses on a given set of loops,
  79    followed by the actual vectorization transformation for the loops that
  80    had successfully passed the analysis phase.
  81         Throughout this pass we make a distinction between two types of
  82    data: scalars (which are represented by SSA_NAMES), and memory references
  83    ("data-refs").  These two types of data require different handling both
  84    during analysis and transformation. The types of data-refs that the
  85    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  86    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  87    accesses are required to have a simple (consecutive) access pattern.
  88
  89    Analysis phase:
  90    ===============
  91         The driver for the analysis phase is vect_analyze_loop().
  92    It applies a set of analyses, some of which rely on the scalar evolution
  93    analyzer (scev) developed by Sebastian Pop.
  94
  95         During the analysis phase the vectorizer records some information
  96    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  97    loop, as well as general information about the loop as a whole, which is
  98    recorded in a "loop_vec_info" struct attached to each loop.
  99
 100    Transformation phase:
 101    =====================
 102         The loop transformation phase scans all the stmts in the loop, and
 103    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 104    the loop that needs to be vectorized.  It inserts the vector code sequence
 105    just before the scalar stmt S, and records a pointer to the vector code
 106    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 107    attached to S).  This pointer will be used for the vectorization of following
 108    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 109    otherwise, we rely on dead code elimination for removing it.
 110
 111         For example, say stmt S1 was vectorized into stmt VS1:
 112
 113    VS1: vb = px[i];
 114    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 115    S2:  a = b;
 116
 117    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 118    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 119    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 120    resulting sequence would be:
 121
 122    VS1: vb = px[i];
 123    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 124    VS2: va = vb;
 125    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 126
 127         Operands that are not SSA_NAMEs, are data-refs that appear in
 128    load/store operations (like 'x[i]' in S1), and are handled differently.
 129
 130    Target modeling:
 131    =================
 132         Currently the only target specific information that is used is the
 133    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 134    Targets that can support different sizes of vectors, for now will need
 135    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 136    flexibility will be added in the future.
 137
 138         Since we only vectorize operations which vector form can be
 139    expressed using existing tree codes, to verify that an operation is
 140    supported, the vectorizer checks the relevant optab at the relevant
 141    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 142    the value found is CODE_FOR_nothing, then there's no target support, and
 143    we can't vectorize the stmt.
 144
 145    For additional information on this project see:
 146    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 147 */
 148
 149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 150
 151 /* Function vect_determine_vectorization_factor
 152
 153    Determine the vectorization factor (VF).  VF is the number of data elements
 154    that are operated upon in parallel in a single iteration of the vectorized
 155    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 156    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 157    elements can fit in a single vector register.
 158
 159    We currently support vectorization of loops in which all types operated upon
 160    are of the same size.  Therefore this function currently sets VF according to
 161    the size of the types operated upon, and fails if there are multiple sizes
 162    in the loop.
 163
 164    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 165    original loop:
 166         for (i=0; i<N; i++){
 167           a[i] = b[i] + c[i];
 168         }
 169
 170    vectorized loop:
 171         for (i=0; i<N; i+=VF){
 172           a[i:VF] = b[i:VF] + c[i:VF];
 173         }
 174 */
 175
 176 static bool
 177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 178 {
 179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 180   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 181   unsigned nbbs = loop->num_nodes;
 182   unsigned int vectorization_factor = 0;
 183   tree scalar_type;
 184   gphi *phi;
 185   tree vectype;
 186   unsigned int nunits;
 187   stmt_vec_info stmt_info;
 188   unsigned i;
 189   HOST_WIDE_INT dummy;
 190   gimple *stmt, *pattern_stmt = NULL;
 191   gimple_seq pattern_def_seq = NULL;
 192   gimple_stmt_iterator pattern_def_si = gsi_none ();
 193   bool analyze_pattern_stmt = false;
 194   bool bool_result;
 195   auto_vec<stmt_vec_info> mask_producers;
 196
 197   if (dump_enabled_p ())
 198     dump_printf_loc (MSG_NOTE, vect_location,
 199                      "=== vect_determine_vectorization_factor ===\n");
 200
 201   for (i = 0; i < nbbs; i++)
 202     {
 203       basic_block bb = bbs[i];
 204
 205       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 206            gsi_next (&si))
 207         {
 208           phi = si.phi ();
 209           stmt_info = vinfo_for_stmt (phi);
 210           if (dump_enabled_p ())
 211             {
 212               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 213               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 214               dump_printf (MSG_NOTE, "\n");
 215             }
 216
 217           gcc_assert (stmt_info);
 218
 219           if (STMT_VINFO_RELEVANT_P (stmt_info))
 220             {
 221               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 222               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 223
 224               if (dump_enabled_p ())
 225                 {
 226                   dump_printf_loc (MSG_NOTE, vect_location,
 227                                    "get vectype for scalar type:  ");
 228                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 229                   dump_printf (MSG_NOTE, "\n");
 230                 }
 231
 232               vectype = get_vectype_for_scalar_type (scalar_type);
 233               if (!vectype)
 234                 {
 235                   if (dump_enabled_p ())
 236                     {
 237                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 238                                        "not vectorized: unsupported "
 239                                        "data-type ");
 240                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 241                                          scalar_type);
 242                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 243                     }
 244                   return false;
 245                 }
 246               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 247
 248               if (dump_enabled_p ())
 249                 {
 250                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 251                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 252                   dump_printf (MSG_NOTE, "\n");
 253                 }
 254
 255               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 256               if (dump_enabled_p ())
 257                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 258                                  nunits);
 259
 260               if (!vectorization_factor
 261                   || (nunits > vectorization_factor))
 262                 vectorization_factor = nunits;
 263             }
 264         }
 265
 266       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 267            !gsi_end_p (si) || analyze_pattern_stmt;)
 268         {
 269           tree vf_vectype;
 270
 271           if (analyze_pattern_stmt)
 272             stmt = pattern_stmt;
 273           else
 274             stmt = gsi_stmt (si);
 275
 276           stmt_info = vinfo_for_stmt (stmt);
 277
 278           if (dump_enabled_p ())
 279             {
 280               dump_printf_loc (MSG_NOTE, vect_location,
 281                                "==> examining statement: ");
 282               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 283               dump_printf (MSG_NOTE, "\n");
 284             }
 285
 286           gcc_assert (stmt_info);
 287
 288           /* Skip stmts which do not need to be vectorized.  */
 289           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 290                && !STMT_VINFO_LIVE_P (stmt_info))
 291               || gimple_clobber_p (stmt))
 292             {
 293               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 294                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 295                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 296                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 297                 {
 298                   stmt = pattern_stmt;
 299                   stmt_info = vinfo_for_stmt (pattern_stmt);
 300                   if (dump_enabled_p ())
 301                     {
 302                       dump_printf_loc (MSG_NOTE, vect_location,
 303                                        "==> examining pattern statement: ");
 304                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 305                       dump_printf (MSG_NOTE, "\n");
 306                     }
 307                 }
 308               else
 309                 {
 310                   if (dump_enabled_p ())
 311                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 312                   gsi_next (&si);
 313                   continue;
 314                 }
 315             }
 316           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 317                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 318                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 319                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 320             analyze_pattern_stmt = true;
 321
 322           /* If a pattern statement has def stmts, analyze them too.  */
 323           if (is_pattern_stmt_p (stmt_info))
 324             {
 325               if (pattern_def_seq == NULL)
 326                 {
 327                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 328                   pattern_def_si = gsi_start (pattern_def_seq);
 329                 }
 330               else if (!gsi_end_p (pattern_def_si))
 331                 gsi_next (&pattern_def_si);
 332               if (pattern_def_seq != NULL)
 333                 {
 334                   gimple *pattern_def_stmt = NULL;
 335                   stmt_vec_info pattern_def_stmt_info = NULL;
 336
 337                   while (!gsi_end_p (pattern_def_si))
 338                     {
 339                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 340                       pattern_def_stmt_info
 341                         = vinfo_for_stmt (pattern_def_stmt);
 342                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 343                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 344                         break;
 345                       gsi_next (&pattern_def_si);
 346                     }
 347
 348                   if (!gsi_end_p (pattern_def_si))
 349                     {
 350                       if (dump_enabled_p ())
 351                         {
 352                           dump_printf_loc (MSG_NOTE, vect_location,
 353                                            "==> examining pattern def stmt: ");
 354                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 355                                             pattern_def_stmt, 0);
 356                           dump_printf (MSG_NOTE, "\n");
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 398                 }
 399               return false;
 400             }
 401
 402           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 403             {
 404               if (dump_enabled_p ())
 405                 {
 406                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 407                                    "not vectorized: vector stmt in loop:");
 408                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 409                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (is_gimple_call (stmt)
 431                   && gimple_call_internal_p (stmt)
 432                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 433                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 434               else
 435                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 436
 437               /* Bool ops don't participate in vectorization factor
 438                  computation.  For comparison use compared types to
 439                  compute a factor.  */
 440               if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info))
 443                     mask_producers.safe_push (stmt_info);
 444                   bool_result = true;
 445
 446                   if (gimple_code (stmt) == GIMPLE_ASSIGN
 447                       && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                          == tcc_comparison
 449                       && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
 450                          != BOOLEAN_TYPE)
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out scalar types if we stmt
 497              produces boolean vector.  Use result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 531                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 556           if (dump_enabled_p ())
 557             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 558           if (!vectorization_factor
 559               || (nunits > vectorization_factor))
 560             vectorization_factor = nunits;
 561
 562           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 563             {
 564               pattern_def_seq = NULL;
 565               gsi_next (&si);
 566             }
 567         }
 568     }
 569
 570   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 571   if (dump_enabled_p ())
 572     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 573                      vectorization_factor);
 574   if (vectorization_factor <= 1)
 575     {
 576       if (dump_enabled_p ())
 577         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 578                          "not vectorized: unsupported data-type\n");
 579       return false;
 580     }
 581   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 582
 583   for (i = 0; i < mask_producers.length (); i++)
 584     {
 585       tree mask_type = NULL;
 586
 587       stmt = STMT_VINFO_STMT (mask_producers[i]);
 588
 589       if (gimple_code (stmt) == GIMPLE_ASSIGN
 590           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 591           && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
 592         {
 593           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 594           mask_type = get_mask_type_for_scalar_type (scalar_type);
 595
 596           if (!mask_type)
 597             {
 598               if (dump_enabled_p ())
 599                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 600                                  "not vectorized: unsupported mask\n");
 601               return false;
 602             }
 603         }
 604       else
 605         {
 606           tree rhs;
 607           ssa_op_iter iter;
 608           gimple *def_stmt;
 609           enum vect_def_type dt;
 610
 611           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 612             {
 613               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 614                                        &def_stmt, &dt, &vectype))
 615                 {
 616                   if (dump_enabled_p ())
 617                     {
 618                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 619                                        "not vectorized: can't compute mask type "
 620                                        "for statement, ");
 621                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 622                                         0);
 623                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 624                     }
 625                   return false;
 626                 }
 627
 628               /* No vectype probably means external definition.
 629                  Allow it in case there is another operand which
 630                  allows to determine mask type.  */
 631               if (!vectype)
 632                 continue;
 633
 634               if (!mask_type)
 635                 mask_type = vectype;
 636               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 637                        != TYPE_VECTOR_SUBPARTS (vectype))
 638                 {
 639                   if (dump_enabled_p ())
 640                     {
 641                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 642                                        "not vectorized: different sized masks "
 643                                        "types in statement, ");
 644                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 645                                          mask_type);
 646                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 647                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 648                                          vectype);
 649                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 650                     }
 651                   return false;
 652                 }
 653               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 654                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 655                 {
 656                   if (dump_enabled_p ())
 657                     {
 658                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 659                                        "not vectorized: mixed mask and "
 660                                        "nonmask vector types in statement, ");
 661                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 662                                          mask_type);
 663                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 664                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 665                                          vectype);
 666                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 667                     }
 668                   return false;
 669                 }
 670             }
 671
 672           /* We may compare boolean value loaded as vector of integers.
 673              Fix mask_type in such case.  */
 674           if (mask_type
 675               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 676               && gimple_code (stmt) == GIMPLE_ASSIGN
 677               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 678             mask_type = build_same_sized_truth_vector_type (mask_type);
 679         }
 680
 681       /* No mask_type should mean loop invariant predicate.
 682          This is probably a subject for optimization in
 683          if-conversion.  */
 684       if (!mask_type)
 685         {
 686           if (dump_enabled_p ())
 687             {
 688               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 689                                "not vectorized: can't compute mask type "
 690                                "for statement, ");
 691               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 692                                 0);
 693               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 694             }
 695           return false;
 696         }
 697
 698       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 699     }
 700
 701   return true;
 702 }
 703
 704
 705 /* Function vect_is_simple_iv_evolution.
 706
 707    FORNOW: A simple evolution of an induction variables in the loop is
 708    considered a polynomial evolution.  */
 709
 710 static bool
 711 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 712                              tree * step)
 713 {
 714   tree init_expr;
 715   tree step_expr;
 716   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 717   basic_block bb;
 718
 719   /* When there is no evolution in this loop, the evolution function
 720      is not "simple".  */
 721   if (evolution_part == NULL_TREE)
 722     return false;
 723
 724   /* When the evolution is a polynomial of degree >= 2
 725      the evolution function is not "simple".  */
 726   if (tree_is_chrec (evolution_part))
 727     return false;
 728
 729   step_expr = evolution_part;
 730   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 731
 732   if (dump_enabled_p ())
 733     {
 734       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 735       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 736       dump_printf (MSG_NOTE, ",  init: ");
 737       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 738       dump_printf (MSG_NOTE, "\n");
 739     }
 740
 741   *init = init_expr;
 742   *step = step_expr;
 743
 744   if (TREE_CODE (step_expr) != INTEGER_CST
 745       && (TREE_CODE (step_expr) != SSA_NAME
 746           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 747               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 748           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 749               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 750                   || !flag_associative_math)))
 751       && (TREE_CODE (step_expr) != REAL_CST
 752           || !flag_associative_math))
 753     {
 754       if (dump_enabled_p ())
 755         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 756                          "step unknown.\n");
 757       return false;
 758     }
 759
 760   return true;
 761 }
 762
 763 /* Function vect_analyze_scalar_cycles_1.
 764
 765    Examine the cross iteration def-use cycles of scalar variables
 766    in LOOP.  LOOP_VINFO represents the loop that is now being
 767    considered for vectorization (can be LOOP, or an outer-loop
 768    enclosing LOOP).  */
 769
 770 static void
 771 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 772 {
 773   basic_block bb = loop->header;
 774   tree init, step;
 775   auto_vec<gimple *, 64> worklist;
 776   gphi_iterator gsi;
 777   bool double_reduc;
 778
 779   if (dump_enabled_p ())
 780     dump_printf_loc (MSG_NOTE, vect_location,
 781                      "=== vect_analyze_scalar_cycles ===\n");
 782
 783   /* First - identify all inductions.  Reduction detection assumes that all the
 784      inductions have been identified, therefore, this order must not be
 785      changed.  */
 786   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 787     {
 788       gphi *phi = gsi.phi ();
 789       tree access_fn = NULL;
 790       tree def = PHI_RESULT (phi);
 791       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 792
 793       if (dump_enabled_p ())
 794         {
 795           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 796           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 797           dump_printf (MSG_NOTE, "\n");
 798         }
 799
 800       /* Skip virtual phi's.  The data dependences that are associated with
 801          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 802       if (virtual_operand_p (def))
 803         continue;
 804
 805       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 806
 807       /* Analyze the evolution function.  */
 808       access_fn = analyze_scalar_evolution (loop, def);
 809       if (access_fn)
 810         {
 811           STRIP_NOPS (access_fn);
 812           if (dump_enabled_p ())
 813             {
 814               dump_printf_loc (MSG_NOTE, vect_location,
 815                                "Access function of PHI: ");
 816               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 817               dump_printf (MSG_NOTE, "\n");
 818             }
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 820             = initial_condition_in_loop_num (access_fn, loop->num);
 821           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 822             = evolution_part_in_loop_num (access_fn, loop->num);
 823         }
 824
 825       if (!access_fn
 826           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 827           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 828               && TREE_CODE (step) != INTEGER_CST))
 829         {
 830           worklist.safe_push (phi);
 831           continue;
 832         }
 833
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 835                   != NULL_TREE);
 836       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 837
 838       if (dump_enabled_p ())
 839         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 840       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 841     }
 842
 843
 844   /* Second - identify all reductions and nested cycles.  */
 845   while (worklist.length () > 0)
 846     {
 847       gimple *phi = worklist.pop ();
 848       tree def = PHI_RESULT (phi);
 849       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 850       gimple *reduc_stmt;
 851       bool nested_cycle;
 852
 853       if (dump_enabled_p ())
 854         {
 855           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 856           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 857           dump_printf (MSG_NOTE, "\n");
 858         }
 859
 860       gcc_assert (!virtual_operand_p (def)
 861                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 862
 863       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 864       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 865                                                 &double_reduc, false);
 866       if (reduc_stmt)
 867         {
 868           if (double_reduc)
 869             {
 870               if (dump_enabled_p ())
 871                 dump_printf_loc (MSG_NOTE, vect_location,
 872                                  "Detected double reduction.\n");
 873
 874               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 875               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 876                                                     vect_double_reduction_def;
 877             }
 878           else
 879             {
 880               if (nested_cycle)
 881                 {
 882                   if (dump_enabled_p ())
 883                     dump_printf_loc (MSG_NOTE, vect_location,
 884                                      "Detected vectorizable nested cycle.\n");
 885
 886                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 887                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 888                                                              vect_nested_cycle;
 889                 }
 890               else
 891                 {
 892                   if (dump_enabled_p ())
 893                     dump_printf_loc (MSG_NOTE, vect_location,
 894                                      "Detected reduction.\n");
 895
 896                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 897                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 898                                                            vect_reduction_def;
 899                   /* Store the reduction cycles for possible vectorization in
 900                      loop-aware SLP.  */
 901                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 902                 }
 903             }
 904         }
 905       else
 906         if (dump_enabled_p ())
 907           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 908                            "Unknown def-use cycle pattern.\n");
 909     }
 910 }
 911
 912
 913 /* Function vect_analyze_scalar_cycles.
 914
 915    Examine the cross iteration def-use cycles of scalar variables, by
 916    analyzing the loop-header PHIs of scalar variables.  Classify each
 917    cycle as one of the following: invariant, induction, reduction, unknown.
 918    We do that for the loop represented by LOOP_VINFO, and also to its
 919    inner-loop, if exists.
 920    Examples for scalar cycles:
 921
 922    Example1: reduction:
 923
 924               loop1:
 925               for (i=0; i<N; i++)
 926                  sum += a[i];
 927
 928    Example2: induction:
 929
 930               loop2:
 931               for (i=0; i<N; i++)
 932                  a[i] = i;  */
 933
 934 static void
 935 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 936 {
 937   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 938
 939   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 940
 941   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 942      Reductions in such inner-loop therefore have different properties than
 943      the reductions in the nest that gets vectorized:
 944      1. When vectorized, they are executed in the same order as in the original
 945         scalar loop, so we can't change the order of computation when
 946         vectorizing them.
 947      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 948         current checks are too strict.  */
 949
 950   if (loop->inner)
 951     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 952 }
 953
 954 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 955
 956 static void
 957 vect_fixup_reduc_chain (gimple *stmt)
 958 {
 959   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 960   gimple *stmtp;
 961   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 962               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 963   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 964   do
 965     {
 966       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 968       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 969       if (stmt)
 970         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 971           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 972     }
 973   while (stmt);
 974   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 975 }
 976
 977 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 978
 979 static void
 980 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 981 {
 982   gimple *first;
 983   unsigned i;
 984
 985   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 986     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 987       {
 988         vect_fixup_reduc_chain (first);
 989         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 990           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 991       }
 992 }
 993
 994 /* Function vect_get_loop_niters.
 995
 996    Determine how many iterations the loop is executed and place it
 997    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 998    in NUMBER_OF_ITERATIONSM1.
 999
1000    Return the loop exit condition.  */
1001
1002
1003 static gcond *
1004 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1005                       tree *number_of_iterationsm1)
1006 {
1007   tree niters;
1008
1009   if (dump_enabled_p ())
1010     dump_printf_loc (MSG_NOTE, vect_location,
1011                      "=== get_loop_niters ===\n");
1012
1013   niters = number_of_latch_executions (loop);
1014   *number_of_iterationsm1 = niters;
1015
1016   /* We want the number of loop header executions which is the number
1017      of latch executions plus one.
1018      ???  For UINT_MAX latch executions this number overflows to zero
1019      for loops like do { n++; } while (n != 0);  */
1020   if (niters && !chrec_contains_undetermined (niters))
1021     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1022                           build_int_cst (TREE_TYPE (niters), 1));
1023   *number_of_iterations = niters;
1024
1025   return get_loop_exit_condition (loop);
1026 }
1027
1028
1029 /* Function bb_in_loop_p
1030
1031    Used as predicate for dfs order traversal of the loop bbs.  */
1032
1033 static bool
1034 bb_in_loop_p (const_basic_block bb, const void *data)
1035 {
1036   const struct loop *const loop = (const struct loop *)data;
1037   if (flow_bb_inside_loop_p (loop, bb))
1038     return true;
1039   return false;
1040 }
1041
1042
1043 /* Function new_loop_vec_info.
1044
1045    Create and initialize a new loop_vec_info struct for LOOP, as well as
1046    stmt_vec_info structs for all the stmts in LOOP.  */
1047
1048 static loop_vec_info
1049 new_loop_vec_info (struct loop *loop)
1050 {
1051   loop_vec_info res;
1052   basic_block *bbs;
1053   gimple_stmt_iterator si;
1054   unsigned int i, nbbs;
1055
1056   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1057   res->kind = vec_info::loop;
1058   LOOP_VINFO_LOOP (res) = loop;
1059
1060   bbs = get_loop_body (loop);
1061
1062   /* Create/Update stmt_info for all stmts in the loop.  */
1063   for (i = 0; i < loop->num_nodes; i++)
1064     {
1065       basic_block bb = bbs[i];
1066
1067       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068         {
1069           gimple *phi = gsi_stmt (si);
1070           gimple_set_uid (phi, 0);
1071           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1072         }
1073
1074       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075         {
1076           gimple *stmt = gsi_stmt (si);
1077           gimple_set_uid (stmt, 0);
1078           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1079         }
1080     }
1081
1082   /* CHECKME: We want to visit all BBs before their successors (except for
1083      latch blocks, for which this assertion wouldn't hold).  In the simple
1084      case of the loop forms we allow, a dfs order of the BBs would the same
1085      as reversed postorder traversal, so we are safe.  */
1086
1087    free (bbs);
1088    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1089    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1090                               bbs, loop->num_nodes, loop);
1091    gcc_assert (nbbs == loop->num_nodes);
1092
1093   LOOP_VINFO_BBS (res) = bbs;
1094   LOOP_VINFO_NITERSM1 (res) = NULL;
1095   LOOP_VINFO_NITERS (res) = NULL;
1096   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1097   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1098   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1099   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1100   LOOP_VINFO_VECT_FACTOR (res) = 0;
1101   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1102   LOOP_VINFO_DATAREFS (res) = vNULL;
1103   LOOP_VINFO_DDRS (res) = vNULL;
1104   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1105   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1106   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1107   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1108   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1109   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1110   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1111   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1112   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1113   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1114   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1115   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1116
1117   return res;
1118 }
1119
1120
1121 /* Function destroy_loop_vec_info.
1122
1123    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1124    stmts in the loop.  */
1125
1126 void
1127 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1128 {
1129   struct loop *loop;
1130   basic_block *bbs;
1131   int nbbs;
1132   gimple_stmt_iterator si;
1133   int j;
1134   vec<slp_instance> slp_instances;
1135   slp_instance instance;
1136   bool swapped;
1137
1138   if (!loop_vinfo)
1139     return;
1140
1141   loop = LOOP_VINFO_LOOP (loop_vinfo);
1142
1143   bbs = LOOP_VINFO_BBS (loop_vinfo);
1144   nbbs = clean_stmts ? loop->num_nodes : 0;
1145   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1146
1147   for (j = 0; j < nbbs; j++)
1148     {
1149       basic_block bb = bbs[j];
1150       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1151         free_stmt_vec_info (gsi_stmt (si));
1152
1153       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1154         {
1155           gimple *stmt = gsi_stmt (si);
1156
1157           /* We may have broken canonical form by moving a constant
1158              into RHS1 of a commutative op.  Fix such occurrences.  */
1159           if (swapped && is_gimple_assign (stmt))
1160             {
1161               enum tree_code code = gimple_assign_rhs_code (stmt);
1162
1163               if ((code == PLUS_EXPR
1164                    || code == POINTER_PLUS_EXPR
1165                    || code == MULT_EXPR)
1166                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1167                 swap_ssa_operands (stmt,
1168                                    gimple_assign_rhs1_ptr (stmt),
1169                                    gimple_assign_rhs2_ptr (stmt));
1170             }
1171
1172           /* Free stmt_vec_info.  */
1173           free_stmt_vec_info (stmt);
1174           gsi_next (&si);
1175         }
1176     }
1177
1178   free (LOOP_VINFO_BBS (loop_vinfo));
1179   vect_destroy_datarefs (loop_vinfo);
1180   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1181   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1182   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1183   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1184   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1185   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1186   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1187     vect_free_slp_instance (instance);
1188
1189   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1190   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1191   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1192   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1193
1194   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1195   loop_vinfo->scalar_cost_vec.release ();
1196
1197   free (loop_vinfo);
1198   loop->aux = NULL;
1199 }
1200
1201
1202 /* Calculate the cost of one scalar iteration of the loop.  */
1203 static void
1204 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1205 {
1206   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1207   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1208   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1209   int innerloop_iters, i;
1210
1211   /* Count statements in scalar loop.  Using this as scalar cost for a single
1212      iteration for now.
1213
1214      TODO: Add outer loop support.
1215
1216      TODO: Consider assigning different costs to different scalar
1217      statements.  */
1218
1219   /* FORNOW.  */
1220   innerloop_iters = 1;
1221   if (loop->inner)
1222     innerloop_iters = 50; /* FIXME */
1223
1224   for (i = 0; i < nbbs; i++)
1225     {
1226       gimple_stmt_iterator si;
1227       basic_block bb = bbs[i];
1228
1229       if (bb->loop_father == loop->inner)
1230         factor = innerloop_iters;
1231       else
1232         factor = 1;
1233
1234       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1235         {
1236           gimple *stmt = gsi_stmt (si);
1237           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1238
1239           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1240             continue;
1241
1242           /* Skip stmts that are not vectorized inside the loop.  */
1243           if (stmt_info
1244               && !STMT_VINFO_RELEVANT_P (stmt_info)
1245               && (!STMT_VINFO_LIVE_P (stmt_info)
1246                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1247               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1248             continue;
1249
1250           vect_cost_for_stmt kind;
1251           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1252             {
1253               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1254                kind = scalar_load;
1255              else
1256                kind = scalar_store;
1257             }
1258           else
1259             kind = scalar_stmt;
1260
1261           scalar_single_iter_cost
1262             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1263                                  factor, kind, NULL, 0, vect_prologue);
1264         }
1265     }
1266   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1267     = scalar_single_iter_cost;
1268 }
1269
1270
1271 /* Function vect_analyze_loop_form_1.
1272
1273    Verify that certain CFG restrictions hold, including:
1274    - the loop has a pre-header
1275    - the loop has a single entry and exit
1276    - the loop exit condition is simple enough, and the number of iterations
1277      can be analyzed (a countable loop).  */
1278
1279 bool
1280 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1281                           tree *number_of_iterationsm1,
1282                           tree *number_of_iterations, gcond **inner_loop_cond)
1283 {
1284   if (dump_enabled_p ())
1285     dump_printf_loc (MSG_NOTE, vect_location,
1286                      "=== vect_analyze_loop_form ===\n");
1287
1288   /* Different restrictions apply when we are considering an inner-most loop,
1289      vs. an outer (nested) loop.
1290      (FORNOW. May want to relax some of these restrictions in the future).  */
1291
1292   if (!loop->inner)
1293     {
1294       /* Inner-most loop.  We currently require that the number of BBs is
1295          exactly 2 (the header and latch).  Vectorizable inner-most loops
1296          look like this:
1297
1298                         (pre-header)
1299                            |
1300                           header <--------+
1301                            | |            |
1302                            | +--> latch --+
1303                            |
1304                         (exit-bb)  */
1305
1306       if (loop->num_nodes != 2)
1307         {
1308           if (dump_enabled_p ())
1309             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310                              "not vectorized: control flow in loop.\n");
1311           return false;
1312         }
1313
1314       if (empty_block_p (loop->header))
1315         {
1316           if (dump_enabled_p ())
1317             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318                              "not vectorized: empty loop.\n");
1319           return false;
1320         }
1321     }
1322   else
1323     {
1324       struct loop *innerloop = loop->inner;
1325       edge entryedge;
1326
1327       /* Nested loop. We currently require that the loop is doubly-nested,
1328          contains a single inner loop, and the number of BBs is exactly 5.
1329          Vectorizable outer-loops look like this:
1330
1331                         (pre-header)
1332                            |
1333                           header <---+
1334                            |         |
1335                           inner-loop |
1336                            |         |
1337                           tail ------+
1338                            |
1339                         (exit-bb)
1340
1341          The inner-loop has the properties expected of inner-most loops
1342          as described above.  */
1343
1344       if ((loop->inner)->inner || (loop->inner)->next)
1345         {
1346           if (dump_enabled_p ())
1347             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348                              "not vectorized: multiple nested loops.\n");
1349           return false;
1350         }
1351
1352       if (loop->num_nodes != 5)
1353         {
1354           if (dump_enabled_p ())
1355             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1356                              "not vectorized: control flow in loop.\n");
1357           return false;
1358         }
1359
1360       entryedge = loop_preheader_edge (innerloop);
1361       if (entryedge->src != loop->header
1362           || !single_exit (innerloop)
1363           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1364         {
1365           if (dump_enabled_p ())
1366             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1367                              "not vectorized: unsupported outerloop form.\n");
1368           return false;
1369         }
1370
1371       /* Analyze the inner-loop.  */
1372       tree inner_niterm1, inner_niter;
1373       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1374                                       &inner_niterm1, &inner_niter, NULL))
1375         {
1376           if (dump_enabled_p ())
1377             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378                              "not vectorized: Bad inner loop.\n");
1379           return false;
1380         }
1381
1382       if (!expr_invariant_in_loop_p (loop, inner_niter))
1383         {
1384           if (dump_enabled_p ())
1385             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386                              "not vectorized: inner-loop count not"
1387                              " invariant.\n");
1388           return false;
1389         }
1390
1391       if (dump_enabled_p ())
1392         dump_printf_loc (MSG_NOTE, vect_location,
1393                          "Considering outer-loop vectorization.\n");
1394     }
1395
1396   if (!single_exit (loop)
1397       || EDGE_COUNT (loop->header->preds) != 2)
1398     {
1399       if (dump_enabled_p ())
1400         {
1401           if (!single_exit (loop))
1402             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1403                              "not vectorized: multiple exits.\n");
1404           else if (EDGE_COUNT (loop->header->preds) != 2)
1405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406                              "not vectorized: too many incoming edges.\n");
1407         }
1408       return false;
1409     }
1410
1411   /* We assume that the loop exit condition is at the end of the loop. i.e,
1412      that the loop is represented as a do-while (with a proper if-guard
1413      before the loop if needed), where the loop header contains all the
1414      executable statements, and the latch is empty.  */
1415   if (!empty_block_p (loop->latch)
1416       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1417     {
1418       if (dump_enabled_p ())
1419         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420                          "not vectorized: latch block not empty.\n");
1421       return false;
1422     }
1423
1424   /* Make sure there exists a single-predecessor exit bb:  */
1425   if (!single_pred_p (single_exit (loop)->dest))
1426     {
1427       edge e = single_exit (loop);
1428       if (!(e->flags & EDGE_ABNORMAL))
1429         {
1430           split_loop_exit_edge (e);
1431           if (dump_enabled_p ())
1432             dump_printf (MSG_NOTE, "split exit edge.\n");
1433         }
1434       else
1435         {
1436           if (dump_enabled_p ())
1437             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438                              "not vectorized: abnormal loop exit edge.\n");
1439           return false;
1440         }
1441     }
1442
1443   *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1444                                      number_of_iterationsm1);
1445   if (!*loop_cond)
1446     {
1447       if (dump_enabled_p ())
1448         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449                          "not vectorized: complicated exit condition.\n");
1450       return false;
1451     }
1452
1453   if (!*number_of_iterations
1454       || chrec_contains_undetermined (*number_of_iterations))
1455     {
1456       if (dump_enabled_p ())
1457         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1458                          "not vectorized: number of iterations cannot be "
1459                          "computed.\n");
1460       return false;
1461     }
1462
1463   if (integer_zerop (*number_of_iterations))
1464     {
1465       if (dump_enabled_p ())
1466         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467                          "not vectorized: number of iterations = 0.\n");
1468       return false;
1469     }
1470
1471   return true;
1472 }
1473
1474 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1475
1476 loop_vec_info
1477 vect_analyze_loop_form (struct loop *loop)
1478 {
1479   tree number_of_iterations, number_of_iterationsm1;
1480   gcond *loop_cond, *inner_loop_cond = NULL;
1481
1482   if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1483                                   &number_of_iterations, &inner_loop_cond))
1484     return NULL;
1485
1486   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1487   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1488   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1489   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1490
1491   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1492     {
1493       if (dump_enabled_p ())
1494         {
1495           dump_printf_loc (MSG_NOTE, vect_location,
1496                            "Symbolic number of iterations is ");
1497           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1498           dump_printf (MSG_NOTE, "\n");
1499         }
1500     }
1501
1502   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1503   if (inner_loop_cond)
1504     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1505       = loop_exit_ctrl_vec_info_type;
1506
1507   gcc_assert (!loop->aux);
1508   loop->aux = loop_vinfo;
1509   return loop_vinfo;
1510 }
1511
1512
1513
1514 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1515    statements update the vectorization factor.  */
1516
1517 static void
1518 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1519 {
1520   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1521   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1522   int nbbs = loop->num_nodes;
1523   unsigned int vectorization_factor;
1524   int i;
1525
1526   if (dump_enabled_p ())
1527     dump_printf_loc (MSG_NOTE, vect_location,
1528                      "=== vect_update_vf_for_slp ===\n");
1529
1530   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1531   gcc_assert (vectorization_factor != 0);
1532
1533   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1534      vectorization factor of the loop is the unrolling factor required by
1535      the SLP instances.  If that unrolling factor is 1, we say, that we
1536      perform pure SLP on loop - cross iteration parallelism is not
1537      exploited.  */
1538   bool only_slp_in_loop = true;
1539   for (i = 0; i < nbbs; i++)
1540     {
1541       basic_block bb = bbs[i];
1542       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1543            gsi_next (&si))
1544         {
1545           gimple *stmt = gsi_stmt (si);
1546           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1547           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1548               && STMT_VINFO_RELATED_STMT (stmt_info))
1549             {
1550               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1551               stmt_info = vinfo_for_stmt (stmt);
1552             }
1553           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1554                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1555               && !PURE_SLP_STMT (stmt_info))
1556             /* STMT needs both SLP and loop-based vectorization.  */
1557             only_slp_in_loop = false;
1558         }
1559     }
1560
1561   if (only_slp_in_loop)
1562     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1563   else
1564     vectorization_factor
1565       = least_common_multiple (vectorization_factor,
1566                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1567
1568   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1569   if (dump_enabled_p ())
1570     dump_printf_loc (MSG_NOTE, vect_location,
1571                      "Updating vectorization factor to %d\n",
1572                      vectorization_factor);
1573 }
1574
1575 /* Function vect_analyze_loop_operations.
1576
1577    Scan the loop stmts and make sure they are all vectorizable.  */
1578
1579 static bool
1580 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1581 {
1582   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1583   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1584   int nbbs = loop->num_nodes;
1585   int i;
1586   stmt_vec_info stmt_info;
1587   bool need_to_vectorize = false;
1588   bool ok;
1589
1590   if (dump_enabled_p ())
1591     dump_printf_loc (MSG_NOTE, vect_location,
1592                      "=== vect_analyze_loop_operations ===\n");
1593
1594   for (i = 0; i < nbbs; i++)
1595     {
1596       basic_block bb = bbs[i];
1597
1598       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1599            gsi_next (&si))
1600         {
1601           gphi *phi = si.phi ();
1602           ok = true;
1603
1604           stmt_info = vinfo_for_stmt (phi);
1605           if (dump_enabled_p ())
1606             {
1607               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1608               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1609               dump_printf (MSG_NOTE, "\n");
1610             }
1611           if (virtual_operand_p (gimple_phi_result (phi)))
1612             continue;
1613
1614           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1615              (i.e., a phi in the tail of the outer-loop).  */
1616           if (! is_loop_header_bb_p (bb))
1617             {
1618               /* FORNOW: we currently don't support the case that these phis
1619                  are not used in the outerloop (unless it is double reduction,
1620                  i.e., this phi is vect_reduction_def), cause this case
1621                  requires to actually do something here.  */
1622               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1623                    || STMT_VINFO_LIVE_P (stmt_info))
1624                   && STMT_VINFO_DEF_TYPE (stmt_info)
1625                      != vect_double_reduction_def)
1626                 {
1627                   if (dump_enabled_p ())
1628                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1629                                      "Unsupported loop-closed phi in "
1630                                      "outer-loop.\n");
1631                   return false;
1632                 }
1633
1634               /* If PHI is used in the outer loop, we check that its operand
1635                  is defined in the inner loop.  */
1636               if (STMT_VINFO_RELEVANT_P (stmt_info))
1637                 {
1638                   tree phi_op;
1639                   gimple *op_def_stmt;
1640
1641                   if (gimple_phi_num_args (phi) != 1)
1642                     return false;
1643
1644                   phi_op = PHI_ARG_DEF (phi, 0);
1645                   if (TREE_CODE (phi_op) != SSA_NAME)
1646                     return false;
1647
1648                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1649                   if (gimple_nop_p (op_def_stmt)
1650                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1651                       || !vinfo_for_stmt (op_def_stmt))
1652                     return false;
1653
1654                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1655                         != vect_used_in_outer
1656                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1657                            != vect_used_in_outer_by_reduction)
1658                     return false;
1659                 }
1660
1661               continue;
1662             }
1663
1664           gcc_assert (stmt_info);
1665
1666           if (STMT_VINFO_LIVE_P (stmt_info))
1667             {
1668               /* FORNOW: not yet supported.  */
1669               if (dump_enabled_p ())
1670                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1671                                  "not vectorized: value used after loop.\n");
1672               return false;
1673             }
1674
1675           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1676               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1677             {
1678               /* A scalar-dependence cycle that we don't support.  */
1679               if (dump_enabled_p ())
1680                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1681                                  "not vectorized: scalar dependence cycle.\n");
1682               return false;
1683             }
1684
1685           if (STMT_VINFO_RELEVANT_P (stmt_info))
1686             {
1687               need_to_vectorize = true;
1688               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1689                 ok = vectorizable_induction (phi, NULL, NULL);
1690             }
1691
1692           if (!ok)
1693             {
1694               if (dump_enabled_p ())
1695                 {
1696                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697                                    "not vectorized: relevant phi not "
1698                                    "supported: ");
1699                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1700                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1701                 }
1702               return false;
1703             }
1704         }
1705
1706       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1707            gsi_next (&si))
1708         {
1709           gimple *stmt = gsi_stmt (si);
1710           if (!gimple_clobber_p (stmt)
1711               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1712             return false;
1713         }
1714     } /* bbs */
1715
1716   /* All operations in the loop are either irrelevant (deal with loop
1717      control, or dead), or only used outside the loop and can be moved
1718      out of the loop (e.g. invariants, inductions).  The loop can be
1719      optimized away by scalar optimizations.  We're better off not
1720      touching this loop.  */
1721   if (!need_to_vectorize)
1722     {
1723       if (dump_enabled_p ())
1724         dump_printf_loc (MSG_NOTE, vect_location,
1725                          "All the computation can be taken out of the loop.\n");
1726       if (dump_enabled_p ())
1727         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1728                          "not vectorized: redundant loop. no profit to "
1729                          "vectorize.\n");
1730       return false;
1731     }
1732
1733   return true;
1734 }
1735
1736
1737 /* Function vect_analyze_loop_2.
1738
1739    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1740    for it.  The different analyses will record information in the
1741    loop_vec_info struct.  */
1742 static bool
1743 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1744 {
1745   bool ok;
1746   int max_vf = MAX_VECTORIZATION_FACTOR;
1747   int min_vf = 2;
1748   unsigned int n_stmts = 0;
1749
1750   /* The first group of checks is independent of the vector size.  */
1751   fatal = true;
1752
1753   /* Find all data references in the loop (which correspond to vdefs/vuses)
1754      and analyze their evolution in the loop.  */
1755
1756   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1757
1758   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1759   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1760     {
1761       if (dump_enabled_p ())
1762         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763                          "not vectorized: loop contains function calls"
1764                          " or data references that cannot be analyzed\n");
1765       return false;
1766     }
1767
1768   for (unsigned i = 0; i < loop->num_nodes; i++)
1769     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1770          !gsi_end_p (gsi); gsi_next (&gsi))
1771       {
1772         gimple *stmt = gsi_stmt (gsi);
1773         if (is_gimple_debug (stmt))
1774           continue;
1775         ++n_stmts;
1776         if (!find_data_references_in_stmt (loop, stmt,
1777                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1778           {
1779             if (is_gimple_call (stmt) && loop->safelen)
1780               {
1781                 tree fndecl = gimple_call_fndecl (stmt), op;
1782                 if (fndecl != NULL_TREE)
1783                   {
1784                     cgraph_node *node = cgraph_node::get (fndecl);
1785                     if (node != NULL && node->simd_clones != NULL)
1786                       {
1787                         unsigned int j, n = gimple_call_num_args (stmt);
1788                         for (j = 0; j < n; j++)
1789                           {
1790                             op = gimple_call_arg (stmt, j);
1791                             if (DECL_P (op)
1792                                 || (REFERENCE_CLASS_P (op)
1793                                     && get_base_address (op)))
1794                               break;
1795                           }
1796                         op = gimple_call_lhs (stmt);
1797                         /* Ignore #pragma omp declare simd functions
1798                            if they don't have data references in the
1799                            call stmt itself.  */
1800                         if (j == n
1801                             && !(op
1802                                  && (DECL_P (op)
1803                                      || (REFERENCE_CLASS_P (op)
1804                                          && get_base_address (op)))))
1805                           continue;
1806                       }
1807                   }
1808               }
1809             if (dump_enabled_p ())
1810               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1811                                "not vectorized: loop contains function "
1812                                "calls or data references that cannot "
1813                                "be analyzed\n");
1814             return false;
1815           }
1816       }
1817
1818   /* Analyze the data references and also adjust the minimal
1819      vectorization factor according to the loads and stores.  */
1820
1821   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1822   if (!ok)
1823     {
1824       if (dump_enabled_p ())
1825         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826                          "bad data references.\n");
1827       return false;
1828     }
1829
1830   /* Classify all cross-iteration scalar data-flow cycles.
1831      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1832   vect_analyze_scalar_cycles (loop_vinfo);
1833
1834   vect_pattern_recog (loop_vinfo);
1835
1836   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1837
1838   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1839      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1840
1841   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1842   if (!ok)
1843     {
1844       if (dump_enabled_p ())
1845         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846                          "bad data access.\n");
1847       return false;
1848     }
1849
1850   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1851
1852   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1853   if (!ok)
1854     {
1855       if (dump_enabled_p ())
1856         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1857                          "unexpected pattern.\n");
1858       return false;
1859     }
1860
1861   /* While the rest of the analysis below depends on it in some way.  */
1862   fatal = false;
1863
1864   /* Analyze data dependences between the data-refs in the loop
1865      and adjust the maximum vectorization factor according to
1866      the dependences.
1867      FORNOW: fail at the first data dependence that we encounter.  */
1868
1869   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1870   if (!ok
1871       || max_vf < min_vf)
1872     {
1873       if (dump_enabled_p ())
1874             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875                              "bad data dependence.\n");
1876       return false;
1877     }
1878
1879   ok = vect_determine_vectorization_factor (loop_vinfo);
1880   if (!ok)
1881     {
1882       if (dump_enabled_p ())
1883         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884                          "can't determine vectorization factor.\n");
1885       return false;
1886     }
1887   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1888     {
1889       if (dump_enabled_p ())
1890         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891                          "bad data dependence.\n");
1892       return false;
1893     }
1894
1895   /* Compute the scalar iteration cost.  */
1896   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1897
1898   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1899   HOST_WIDE_INT estimated_niter;
1900   unsigned th;
1901   int min_scalar_loop_bound;
1902
1903   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1904   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1905   if (!ok)
1906     return false;
1907
1908   /* If there are any SLP instances mark them as pure_slp.  */
1909   bool slp = vect_make_slp_decision (loop_vinfo);
1910   if (slp)
1911     {
1912       /* Find stmts that need to be both vectorized and SLPed.  */
1913       vect_detect_hybrid_slp (loop_vinfo);
1914
1915       /* Update the vectorization factor based on the SLP decision.  */
1916       vect_update_vf_for_slp (loop_vinfo);
1917     }
1918
1919   /* This is the point where we can re-start analysis with SLP forced off.  */
1920 start_over:
1921
1922   /* Now the vectorization factor is final.  */
1923   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1924   gcc_assert (vectorization_factor != 0);
1925
1926   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1927     dump_printf_loc (MSG_NOTE, vect_location,
1928                      "vectorization_factor = %d, niters = "
1929                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1930                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1931
1932   HOST_WIDE_INT max_niter
1933     = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1934   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1935        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1936       || (max_niter != -1
1937           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1938     {
1939       if (dump_enabled_p ())
1940         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941                          "not vectorized: iteration count smaller than "
1942                          "vectorization factor.\n");
1943       return false;
1944     }
1945
1946   /* Analyze the alignment of the data-refs in the loop.
1947      Fail if a data reference is found that cannot be vectorized.  */
1948
1949   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1950   if (!ok)
1951     {
1952       if (dump_enabled_p ())
1953         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1954                          "bad data alignment.\n");
1955       return false;
1956     }
1957
1958   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1959      It is important to call pruning after vect_analyze_data_ref_accesses,
1960      since we use grouping information gathered by interleaving analysis.  */
1961   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1962   if (!ok)
1963     {
1964       if (dump_enabled_p ())
1965         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1966                          "number of versioning for alias "
1967                          "run-time tests exceeds %d "
1968                          "(--param vect-max-version-for-alias-checks)\n",
1969                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1970       return false;
1971     }
1972
1973   /* This pass will decide on using loop versioning and/or loop peeling in
1974      order to enhance the alignment of data references in the loop.  */
1975   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1976   if (!ok)
1977     {
1978       if (dump_enabled_p ())
1979         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980                          "bad data alignment.\n");
1981       return false;
1982     }
1983
1984   if (slp)
1985     {
1986       /* Analyze operations in the SLP instances.  Note this may
1987          remove unsupported SLP instances which makes the above
1988          SLP kind detection invalid.  */
1989       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1990       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1991                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1992       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1993         goto again;
1994     }
1995
1996   /* Scan all the remaining operations in the loop that are not subject
1997      to SLP and make sure they are vectorizable.  */
1998   ok = vect_analyze_loop_operations (loop_vinfo);
1999   if (!ok)
2000     {
2001       if (dump_enabled_p ())
2002         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003                          "bad operation or unsupported loop bound.\n");
2004       return false;
2005     }
2006
2007   /* Analyze cost.  Decide if worth while to vectorize.  */
2008   int min_profitable_estimate, min_profitable_iters;
2009   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2010                                       &min_profitable_estimate);
2011
2012   if (min_profitable_iters < 0)
2013     {
2014       if (dump_enabled_p ())
2015         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016                          "not vectorized: vectorization not profitable.\n");
2017       if (dump_enabled_p ())
2018         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2019                          "not vectorized: vector version will never be "
2020                          "profitable.\n");
2021       goto again;
2022     }
2023
2024   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2025                             * vectorization_factor) - 1);
2026
2027   /* Use the cost model only if it is more conservative than user specified
2028      threshold.  */
2029   th = (unsigned) min_scalar_loop_bound;
2030   if (min_profitable_iters
2031       && (!min_scalar_loop_bound
2032           || min_profitable_iters > min_scalar_loop_bound))
2033     th = (unsigned) min_profitable_iters;
2034
2035   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2036
2037   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2038       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2039     {
2040       if (dump_enabled_p ())
2041         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2042                          "not vectorized: vectorization not profitable.\n");
2043       if (dump_enabled_p ())
2044         dump_printf_loc (MSG_NOTE, vect_location,
2045                          "not vectorized: iteration count smaller than user "
2046                          "specified loop bound parameter or minimum profitable "
2047                          "iterations (whichever is more conservative).\n");
2048       goto again;
2049     }
2050
2051   estimated_niter
2052     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2053   if (estimated_niter != -1
2054       && ((unsigned HOST_WIDE_INT) estimated_niter
2055           <= MAX (th, (unsigned)min_profitable_estimate)))
2056     {
2057       if (dump_enabled_p ())
2058         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059                          "not vectorized: estimated iteration count too "
2060                          "small.\n");
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_NOTE, vect_location,
2063                          "not vectorized: estimated iteration count smaller "
2064                          "than specified loop bound parameter or minimum "
2065                          "profitable iterations (whichever is more "
2066                          "conservative).\n");
2067       goto again;
2068     }
2069
2070   /* Decide whether we need to create an epilogue loop to handle
2071      remaining scalar iterations.  */
2072   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2073         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2074        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2075
2076   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2077       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2078     {
2079       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2080                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2081           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2082         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2083     }
2084   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2085            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2086                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2087                /* In case of versioning, check if the maximum number of
2088                   iterations is greater than th.  If they are identical,
2089                   the epilogue is unnecessary.  */
2090                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2091                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2092                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2093     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2094
2095   /* If an epilogue loop is required make sure we can create one.  */
2096   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2097       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2098     {
2099       if (dump_enabled_p ())
2100         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2101       if (!vect_can_advance_ivs_p (loop_vinfo)
2102           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2103                                            single_exit (LOOP_VINFO_LOOP
2104                                                          (loop_vinfo))))
2105         {
2106           if (dump_enabled_p ())
2107             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108                              "not vectorized: can't create required "
2109                              "epilog loop\n");
2110           goto again;
2111         }
2112     }
2113
2114   gcc_assert (vectorization_factor
2115               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2116
2117   /* Ok to vectorize!  */
2118   return true;
2119
2120 again:
2121   /* Try again with SLP forced off but if we didn't do any SLP there is
2122      no point in re-trying.  */
2123   if (!slp)
2124     return false;
2125
2126   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2127      via interleaving or lane instructions or if there were any SLP
2128      reductions.  */
2129   slp_instance instance;
2130   slp_tree node;
2131   unsigned i, j;
2132   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2133     {
2134       stmt_vec_info vinfo;
2135       vinfo = vinfo_for_stmt
2136           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2137       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2138         return false;
2139       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2140       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2141       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2142       if (! vect_store_lanes_supported (vectype, size)
2143           && ! vect_grouped_store_supported (vectype, size))
2144         return false;
2145       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2146         {
2147           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2148           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2149           size = STMT_VINFO_GROUP_SIZE (vinfo);
2150           vectype = STMT_VINFO_VECTYPE (vinfo);
2151           if (! vect_load_lanes_supported (vectype, size)
2152               && ! vect_grouped_load_supported (vectype, size))
2153             return false;
2154         }
2155     }
2156
2157   if (dump_enabled_p ())
2158     dump_printf_loc (MSG_NOTE, vect_location,
2159                      "re-trying with SLP disabled\n");
2160
2161   /* Roll back state appropriately.  No SLP this time.  */
2162   slp = false;
2163   /* Restore vectorization factor as it were without SLP.  */
2164   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2165   /* Free the SLP instances.  */
2166   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2167     vect_free_slp_instance (instance);
2168   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2169   /* Reset SLP type to loop_vect on all stmts.  */
2170   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2171     {
2172       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2173       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2174            !gsi_end_p (si); gsi_next (&si))
2175         {
2176           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2177           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2178             {
2179               gcc_assert (STMT_SLP_TYPE (stmt_info) == loop_vect);
2180               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2181             }
2182           STMT_SLP_TYPE (stmt_info) = loop_vect;
2183         }
2184     }
2185   /* Free optimized alias test DDRS.  */
2186   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2187   /* Reset target cost data.  */
2188   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2189   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2190     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2191   /* Reset assorted flags.  */
2192   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2193   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2194
2195   goto start_over;
2196 }
2197
2198 /* Function vect_analyze_loop.
2199
2200    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2201    for it.  The different analyses will record information in the
2202    loop_vec_info struct.  */
2203 loop_vec_info
2204 vect_analyze_loop (struct loop *loop)
2205 {
2206   loop_vec_info loop_vinfo;
2207   unsigned int vector_sizes;
2208
2209   /* Autodetect first vector size we try.  */
2210   current_vector_size = 0;
2211   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2212
2213   if (dump_enabled_p ())
2214     dump_printf_loc (MSG_NOTE, vect_location,
2215                      "===== analyze_loop_nest =====\n");
2216
2217   if (loop_outer (loop)
2218       && loop_vec_info_for_loop (loop_outer (loop))
2219       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2220     {
2221       if (dump_enabled_p ())
2222         dump_printf_loc (MSG_NOTE, vect_location,
2223                          "outer-loop already vectorized.\n");
2224       return NULL;
2225     }
2226
2227   while (1)
2228     {
2229       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2230       loop_vinfo = vect_analyze_loop_form (loop);
2231       if (!loop_vinfo)
2232         {
2233           if (dump_enabled_p ())
2234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235                              "bad loop form.\n");
2236           return NULL;
2237         }
2238
2239       bool fatal = false;
2240       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2241         {
2242           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2243
2244           return loop_vinfo;
2245         }
2246
2247       destroy_loop_vec_info (loop_vinfo, true);
2248
2249       vector_sizes &= ~current_vector_size;
2250       if (fatal
2251           || vector_sizes == 0
2252           || current_vector_size == 0)
2253         return NULL;
2254
2255       /* Try the next biggest vector size.  */
2256       current_vector_size = 1 << floor_log2 (vector_sizes);
2257       if (dump_enabled_p ())
2258         dump_printf_loc (MSG_NOTE, vect_location,
2259                          "***** Re-trying analysis with "
2260                          "vector size %d\n", current_vector_size);
2261     }
2262 }
2263
2264
2265 /* Function reduction_code_for_scalar_code
2266
2267    Input:
2268    CODE - tree_code of a reduction operations.
2269
2270    Output:
2271    REDUC_CODE - the corresponding tree-code to be used to reduce the
2272       vector of partial results into a single scalar result, or ERROR_MARK
2273       if the operation is a supported reduction operation, but does not have
2274       such a tree-code.
2275
2276    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2277
2278 static bool
2279 reduction_code_for_scalar_code (enum tree_code code,
2280                                 enum tree_code *reduc_code)
2281 {
2282   switch (code)
2283     {
2284       case MAX_EXPR:
2285         *reduc_code = REDUC_MAX_EXPR;
2286         return true;
2287
2288       case MIN_EXPR:
2289         *reduc_code = REDUC_MIN_EXPR;
2290         return true;
2291
2292       case PLUS_EXPR:
2293         *reduc_code = REDUC_PLUS_EXPR;
2294         return true;
2295
2296       case MULT_EXPR:
2297       case MINUS_EXPR:
2298       case BIT_IOR_EXPR:
2299       case BIT_XOR_EXPR:
2300       case BIT_AND_EXPR:
2301         *reduc_code = ERROR_MARK;
2302         return true;
2303
2304       default:
2305        return false;
2306     }
2307 }
2308
2309
2310 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2311    STMT is printed with a message MSG. */
2312
2313 static void
2314 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2315 {
2316   dump_printf_loc (msg_type, vect_location, "%s", msg);
2317   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2318   dump_printf (msg_type, "\n");
2319 }
2320
2321
2322 /* Detect SLP reduction of the form:
2323
2324    #a1 = phi <a5, a0>
2325    a2 = operation (a1)
2326    a3 = operation (a2)
2327    a4 = operation (a3)
2328    a5 = operation (a4)
2329
2330    #a = phi <a5>
2331
2332    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2333    FIRST_STMT is the first reduction stmt in the chain
2334    (a2 = operation (a1)).
2335
2336    Return TRUE if a reduction chain was detected.  */
2337
2338 static bool
2339 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2340                        gimple *first_stmt)
2341 {
2342   struct loop *loop = (gimple_bb (phi))->loop_father;
2343   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2344   enum tree_code code;
2345   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2346   stmt_vec_info use_stmt_info, current_stmt_info;
2347   tree lhs;
2348   imm_use_iterator imm_iter;
2349   use_operand_p use_p;
2350   int nloop_uses, size = 0, n_out_of_loop_uses;
2351   bool found = false;
2352
2353   if (loop != vect_loop)
2354     return false;
2355
2356   lhs = PHI_RESULT (phi);
2357   code = gimple_assign_rhs_code (first_stmt);
2358   while (1)
2359     {
2360       nloop_uses = 0;
2361       n_out_of_loop_uses = 0;
2362       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2363         {
2364           gimple *use_stmt = USE_STMT (use_p);
2365           if (is_gimple_debug (use_stmt))
2366             continue;
2367
2368           /* Check if we got back to the reduction phi.  */
2369           if (use_stmt == phi)
2370             {
2371               loop_use_stmt = use_stmt;
2372               found = true;
2373               break;
2374             }
2375
2376           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2377             {
2378               loop_use_stmt = use_stmt;
2379               nloop_uses++;
2380             }
2381            else
2382              n_out_of_loop_uses++;
2383
2384            /* There are can be either a single use in the loop or two uses in
2385               phi nodes.  */
2386            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2387              return false;
2388         }
2389
2390       if (found)
2391         break;
2392
2393       /* We reached a statement with no loop uses.  */
2394       if (nloop_uses == 0)
2395         return false;
2396
2397       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2398       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2399         return false;
2400
2401       if (!is_gimple_assign (loop_use_stmt)
2402           || code != gimple_assign_rhs_code (loop_use_stmt)
2403           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2404         return false;
2405
2406       /* Insert USE_STMT into reduction chain.  */
2407       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2408       if (current_stmt)
2409         {
2410           current_stmt_info = vinfo_for_stmt (current_stmt);
2411           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2412           GROUP_FIRST_ELEMENT (use_stmt_info)
2413             = GROUP_FIRST_ELEMENT (current_stmt_info);
2414         }
2415       else
2416         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2417
2418       lhs = gimple_assign_lhs (loop_use_stmt);
2419       current_stmt = loop_use_stmt;
2420       size++;
2421    }
2422
2423   if (!found || loop_use_stmt != phi || size < 2)
2424     return false;
2425
2426   /* Swap the operands, if needed, to make the reduction operand be the second
2427      operand.  */
2428   lhs = PHI_RESULT (phi);
2429   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2430   while (next_stmt)
2431     {
2432       if (gimple_assign_rhs2 (next_stmt) == lhs)
2433         {
2434           tree op = gimple_assign_rhs1 (next_stmt);
2435           gimple *def_stmt = NULL;
2436
2437           if (TREE_CODE (op) == SSA_NAME)
2438             def_stmt = SSA_NAME_DEF_STMT (op);
2439
2440           /* Check that the other def is either defined in the loop
2441              ("vect_internal_def"), or it's an induction (defined by a
2442              loop-header phi-node).  */
2443           if (def_stmt
2444               && gimple_bb (def_stmt)
2445               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2446               && (is_gimple_assign (def_stmt)
2447                   || is_gimple_call (def_stmt)
2448                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2449                            == vect_induction_def
2450                   || (gimple_code (def_stmt) == GIMPLE_PHI
2451                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2452                                   == vect_internal_def
2453                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2454             {
2455               lhs = gimple_assign_lhs (next_stmt);
2456               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2457               continue;
2458             }
2459
2460           return false;
2461         }
2462       else
2463         {
2464           tree op = gimple_assign_rhs2 (next_stmt);
2465           gimple *def_stmt = NULL;
2466
2467           if (TREE_CODE (op) == SSA_NAME)
2468             def_stmt = SSA_NAME_DEF_STMT (op);
2469
2470           /* Check that the other def is either defined in the loop
2471             ("vect_internal_def"), or it's an induction (defined by a
2472             loop-header phi-node).  */
2473           if (def_stmt
2474               && gimple_bb (def_stmt)
2475               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2476               && (is_gimple_assign (def_stmt)
2477                   || is_gimple_call (def_stmt)
2478                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2479                               == vect_induction_def
2480                   || (gimple_code (def_stmt) == GIMPLE_PHI
2481                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2482                                   == vect_internal_def
2483                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2484             {
2485               if (dump_enabled_p ())
2486                 {
2487                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2488                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2489                   dump_printf (MSG_NOTE, "\n");
2490                 }
2491
2492               swap_ssa_operands (next_stmt,
2493                                  gimple_assign_rhs1_ptr (next_stmt),
2494                                  gimple_assign_rhs2_ptr (next_stmt));
2495               update_stmt (next_stmt);
2496
2497               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2498                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2499             }
2500           else
2501             return false;
2502         }
2503
2504       lhs = gimple_assign_lhs (next_stmt);
2505       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2506     }
2507
2508   /* Save the chain for further analysis in SLP detection.  */
2509   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2510   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2511   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2512
2513   return true;
2514 }
2515
2516
2517 /* Function vect_is_simple_reduction_1
2518
2519    (1) Detect a cross-iteration def-use cycle that represents a simple
2520    reduction computation.  We look for the following pattern:
2521
2522    loop_header:
2523      a1 = phi < a0, a2 >
2524      a3 = ...
2525      a2 = operation (a3, a1)
2526
2527    or
2528
2529    a3 = ...
2530    loop_header:
2531      a1 = phi < a0, a2 >
2532      a2 = operation (a3, a1)
2533
2534    such that:
2535    1. operation is commutative and associative and it is safe to
2536       change the order of the computation (if CHECK_REDUCTION is true)
2537    2. no uses for a2 in the loop (a2 is used out of the loop)
2538    3. no uses of a1 in the loop besides the reduction operation
2539    4. no uses of a1 outside the loop.
2540
2541    Conditions 1,4 are tested here.
2542    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2543
2544    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2545    nested cycles, if CHECK_REDUCTION is false.
2546
2547    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2548    reductions:
2549
2550      a1 = phi < a0, a2 >
2551      inner loop (def of a3)
2552      a2 = phi < a3 >
2553
2554    (4) Detect condition expressions, ie:
2555      for (int i = 0; i < N; i++)
2556        if (a[i] < val)
2557         ret_val = a[i];
2558
2559 */
2560
2561 static gimple *
2562 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2563                           bool check_reduction, bool *double_reduc,
2564                           bool need_wrapping_integral_overflow,
2565                           enum vect_reduction_type *v_reduc_type)
2566 {
2567   struct loop *loop = (gimple_bb (phi))->loop_father;
2568   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2569   edge latch_e = loop_latch_edge (loop);
2570   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2571   gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2572   enum tree_code orig_code, code;
2573   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2574   tree type;
2575   int nloop_uses;
2576   tree name;
2577   imm_use_iterator imm_iter;
2578   use_operand_p use_p;
2579   bool phi_def;
2580
2581   *double_reduc = false;
2582   *v_reduc_type = TREE_CODE_REDUCTION;
2583
2584   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2585      otherwise, we assume outer loop vectorization.  */
2586   gcc_assert ((check_reduction && loop == vect_loop)
2587               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2588
2589   name = PHI_RESULT (phi);
2590   /* ???  If there are no uses of the PHI result the inner loop reduction
2591      won't be detected as possibly double-reduction by vectorizable_reduction
2592      because that tries to walk the PHI arg from the preheader edge which
2593      can be constant.  See PR60382.  */
2594   if (has_zero_uses (name))
2595     return NULL;
2596   nloop_uses = 0;
2597   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2598     {
2599       gimple *use_stmt = USE_STMT (use_p);
2600       if (is_gimple_debug (use_stmt))
2601         continue;
2602
2603       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2604         {
2605           if (dump_enabled_p ())
2606             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2607                              "intermediate value used outside loop.\n");
2608
2609           return NULL;
2610         }
2611
2612       nloop_uses++;
2613       if (nloop_uses > 1)
2614         {
2615           if (dump_enabled_p ())
2616             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617                              "reduction used in loop.\n");
2618           return NULL;
2619         }
2620     }
2621
2622   if (TREE_CODE (loop_arg) != SSA_NAME)
2623     {
2624       if (dump_enabled_p ())
2625         {
2626           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627                            "reduction: not ssa_name: ");
2628           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2629           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2630         }
2631       return NULL;
2632     }
2633
2634   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2635   if (!def_stmt)
2636     {
2637       if (dump_enabled_p ())
2638         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2639                          "reduction: no def_stmt.\n");
2640       return NULL;
2641     }
2642
2643   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2644     {
2645       if (dump_enabled_p ())
2646         {
2647           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2648           dump_printf (MSG_NOTE, "\n");
2649         }
2650       return NULL;
2651     }
2652
2653   if (is_gimple_assign (def_stmt))
2654     {
2655       name = gimple_assign_lhs (def_stmt);
2656       phi_def = false;
2657     }
2658   else
2659     {
2660       name = PHI_RESULT (def_stmt);
2661       phi_def = true;
2662     }
2663
2664   nloop_uses = 0;
2665   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2666     {
2667       gimple *use_stmt = USE_STMT (use_p);
2668       if (is_gimple_debug (use_stmt))
2669         continue;
2670       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2671         nloop_uses++;
2672       if (nloop_uses > 1)
2673         {
2674           if (dump_enabled_p ())
2675             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2676                              "reduction used in loop.\n");
2677           return NULL;
2678         }
2679     }
2680
2681   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2682      defined in the inner loop.  */
2683   if (phi_def)
2684     {
2685       op1 = PHI_ARG_DEF (def_stmt, 0);
2686
2687       if (gimple_phi_num_args (def_stmt) != 1
2688           || TREE_CODE (op1) != SSA_NAME)
2689         {
2690           if (dump_enabled_p ())
2691             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2692                              "unsupported phi node definition.\n");
2693
2694           return NULL;
2695         }
2696
2697       def1 = SSA_NAME_DEF_STMT (op1);
2698       if (gimple_bb (def1)
2699           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2700           && loop->inner
2701           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2702           && is_gimple_assign (def1))
2703         {
2704           if (dump_enabled_p ())
2705             report_vect_op (MSG_NOTE, def_stmt,
2706                             "detected double reduction: ");
2707
2708           *double_reduc = true;
2709           return def_stmt;
2710         }
2711
2712       return NULL;
2713     }
2714
2715   code = orig_code = gimple_assign_rhs_code (def_stmt);
2716
2717   /* We can handle "res -= x[i]", which is non-associative by
2718      simply rewriting this into "res += -x[i]".  Avoid changing
2719      gimple instruction for the first simple tests and only do this
2720      if we're allowed to change code at all.  */
2721   if (code == MINUS_EXPR
2722       && (op1 = gimple_assign_rhs1 (def_stmt))
2723       && TREE_CODE (op1) == SSA_NAME
2724       && SSA_NAME_DEF_STMT (op1) == phi)
2725     code = PLUS_EXPR;
2726
2727   if (check_reduction)
2728     {
2729       if (code == COND_EXPR)
2730         *v_reduc_type = COND_REDUCTION;
2731       else if (!commutative_tree_code (code) || !associative_tree_code (code))
2732         {
2733           if (dump_enabled_p ())
2734             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2735                             "reduction: not commutative/associative: ");
2736           return NULL;
2737         }
2738     }
2739
2740   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2741     {
2742       if (code != COND_EXPR)
2743         {
2744           if (dump_enabled_p ())
2745             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2746                             "reduction: not binary operation: ");
2747
2748           return NULL;
2749         }
2750
2751       op3 = gimple_assign_rhs1 (def_stmt);
2752       if (COMPARISON_CLASS_P (op3))
2753         {
2754           op4 = TREE_OPERAND (op3, 1);
2755           op3 = TREE_OPERAND (op3, 0);
2756         }
2757
2758       op1 = gimple_assign_rhs2 (def_stmt);
2759       op2 = gimple_assign_rhs3 (def_stmt);
2760
2761       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2762         {
2763           if (dump_enabled_p ())
2764             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2765                             "reduction: uses not ssa_names: ");
2766
2767           return NULL;
2768         }
2769     }
2770   else
2771     {
2772       op1 = gimple_assign_rhs1 (def_stmt);
2773       op2 = gimple_assign_rhs2 (def_stmt);
2774
2775       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2776         {
2777           if (dump_enabled_p ())
2778             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2779                             "reduction: uses not ssa_names: ");
2780
2781           return NULL;
2782         }
2783    }
2784
2785   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2786   if ((TREE_CODE (op1) == SSA_NAME
2787        && !types_compatible_p (type,TREE_TYPE (op1)))
2788       || (TREE_CODE (op2) == SSA_NAME
2789           && !types_compatible_p (type, TREE_TYPE (op2)))
2790       || (op3 && TREE_CODE (op3) == SSA_NAME
2791           && !types_compatible_p (type, TREE_TYPE (op3)))
2792       || (op4 && TREE_CODE (op4) == SSA_NAME
2793           && !types_compatible_p (type, TREE_TYPE (op4))))
2794     {
2795       if (dump_enabled_p ())
2796         {
2797           dump_printf_loc (MSG_NOTE, vect_location,
2798                            "reduction: multiple types: operation type: ");
2799           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2800           dump_printf (MSG_NOTE, ", operands types: ");
2801           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2802                              TREE_TYPE (op1));
2803           dump_printf (MSG_NOTE, ",");
2804           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2805                              TREE_TYPE (op2));
2806           if (op3)
2807             {
2808               dump_printf (MSG_NOTE, ",");
2809               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2810                                  TREE_TYPE (op3));
2811             }
2812
2813           if (op4)
2814             {
2815               dump_printf (MSG_NOTE, ",");
2816               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2817                                  TREE_TYPE (op4));
2818             }
2819           dump_printf (MSG_NOTE, "\n");
2820         }
2821
2822       return NULL;
2823     }
2824
2825   /* Check that it's ok to change the order of the computation.
2826      Generally, when vectorizing a reduction we change the order of the
2827      computation.  This may change the behavior of the program in some
2828      cases, so we need to check that this is ok.  One exception is when
2829      vectorizing an outer-loop: the inner-loop is executed sequentially,
2830      and therefore vectorizing reductions in the inner-loop during
2831      outer-loop vectorization is safe.  */
2832
2833   if (*v_reduc_type != COND_REDUCTION)
2834     {
2835       /* CHECKME: check for !flag_finite_math_only too?  */
2836       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2837           && check_reduction)
2838         {
2839           /* Changing the order of operations changes the semantics.  */
2840           if (dump_enabled_p ())
2841             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2842                         "reduction: unsafe fp math optimization: ");
2843           return NULL;
2844         }
2845       else if (INTEGRAL_TYPE_P (type) && check_reduction)
2846         {
2847           if (!operation_no_trapping_overflow (type, code))
2848             {
2849               /* Changing the order of operations changes the semantics.  */
2850               if (dump_enabled_p ())
2851                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2852                                 "reduction: unsafe int math optimization"
2853                                 " (overflow traps): ");
2854               return NULL;
2855             }
2856           if (need_wrapping_integral_overflow
2857               && !TYPE_OVERFLOW_WRAPS (type)
2858               && operation_can_overflow (code))
2859             {
2860               /* Changing the order of operations changes the semantics.  */
2861               if (dump_enabled_p ())
2862                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2863                                 "reduction: unsafe int math optimization"
2864                                 " (overflow doesn't wrap): ");
2865               return NULL;
2866             }
2867         }
2868       else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2869         {
2870           /* Changing the order of operations changes the semantics.  */
2871           if (dump_enabled_p ())
2872           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2873                           "reduction: unsafe fixed-point math optimization: ");
2874           return NULL;
2875         }
2876     }
2877
2878   /* Reduction is safe. We're dealing with one of the following:
2879      1) integer arithmetic and no trapv
2880      2) floating point arithmetic, and special flags permit this optimization
2881      3) nested cycle (i.e., outer loop vectorization).  */
2882   if (TREE_CODE (op1) == SSA_NAME)
2883     def1 = SSA_NAME_DEF_STMT (op1);
2884
2885   if (TREE_CODE (op2) == SSA_NAME)
2886     def2 = SSA_NAME_DEF_STMT (op2);
2887
2888   if (code != COND_EXPR
2889       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2890     {
2891       if (dump_enabled_p ())
2892         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2893       return NULL;
2894     }
2895
2896   /* Check that one def is the reduction def, defined by PHI,
2897      the other def is either defined in the loop ("vect_internal_def"),
2898      or it's an induction (defined by a loop-header phi-node).  */
2899
2900   if (def2 && def2 == phi
2901       && (code == COND_EXPR
2902           || !def1 || gimple_nop_p (def1)
2903           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2904           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2905               && (is_gimple_assign (def1)
2906                   || is_gimple_call (def1)
2907                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2908                       == vect_induction_def
2909                   || (gimple_code (def1) == GIMPLE_PHI
2910                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2911                           == vect_internal_def
2912                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2913     {
2914       if (dump_enabled_p ())
2915         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2916       return def_stmt;
2917     }
2918
2919   if (def1 && def1 == phi
2920       && (code == COND_EXPR
2921           || !def2 || gimple_nop_p (def2)
2922           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2923           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2924               && (is_gimple_assign (def2)
2925                   || is_gimple_call (def2)
2926                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2927                       == vect_induction_def
2928                   || (gimple_code (def2) == GIMPLE_PHI
2929                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2930                           == vect_internal_def
2931                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2932     {
2933       if (check_reduction
2934           && orig_code != MINUS_EXPR)
2935         {
2936           if (code == COND_EXPR)
2937             {
2938               /* No current known use where this case would be useful.  */
2939               if (dump_enabled_p ())
2940                 report_vect_op (MSG_NOTE, def_stmt,
2941                                 "detected reduction: cannot currently swap "
2942                                 "operands for cond_expr");
2943               return NULL;
2944             }
2945
2946           /* Swap operands (just for simplicity - so that the rest of the code
2947              can assume that the reduction variable is always the last (second)
2948              argument).  */
2949           if (dump_enabled_p ())
2950             report_vect_op (MSG_NOTE, def_stmt,
2951                             "detected reduction: need to swap operands: ");
2952
2953           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2954                              gimple_assign_rhs2_ptr (def_stmt));
2955
2956           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2957             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2958         }
2959       else
2960         {
2961           if (dump_enabled_p ())
2962             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2963         }
2964
2965       return def_stmt;
2966     }
2967
2968   /* Try to find SLP reduction chain.  */
2969   if (check_reduction && code != COND_EXPR
2970       && vect_is_slp_reduction (loop_info, phi, def_stmt))
2971     {
2972       if (dump_enabled_p ())
2973         report_vect_op (MSG_NOTE, def_stmt,
2974                         "reduction: detected reduction chain: ");
2975
2976       return def_stmt;
2977     }
2978
2979   if (dump_enabled_p ())
2980     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2981                     "reduction: unknown pattern: ");
2982
2983   return NULL;
2984 }
2985
2986 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2987    in-place if it enables detection of more reductions.  Arguments
2988    as there.  */
2989
2990 gimple *
2991 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2992                              bool check_reduction, bool *double_reduc,
2993                              bool need_wrapping_integral_overflow)
2994 {
2995   enum vect_reduction_type v_reduc_type;
2996   return vect_is_simple_reduction (loop_info, phi, check_reduction,
2997                                    double_reduc,
2998                                    need_wrapping_integral_overflow,
2999                                    &v_reduc_type);
3000 }
3001
3002 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3003 int
3004 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3005                              int *peel_iters_epilogue,
3006                              stmt_vector_for_cost *scalar_cost_vec,
3007                              stmt_vector_for_cost *prologue_cost_vec,
3008                              stmt_vector_for_cost *epilogue_cost_vec)
3009 {
3010   int retval = 0;
3011   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3012
3013   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3014     {
3015       *peel_iters_epilogue = vf/2;
3016       if (dump_enabled_p ())
3017         dump_printf_loc (MSG_NOTE, vect_location,
3018                          "cost model: epilogue peel iters set to vf/2 "
3019                          "because loop iterations are unknown .\n");
3020
3021       /* If peeled iterations are known but number of scalar loop
3022          iterations are unknown, count a taken branch per peeled loop.  */
3023       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3024                                  NULL, 0, vect_prologue);
3025       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3026                                  NULL, 0, vect_epilogue);
3027     }
3028   else
3029     {
3030       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3031       peel_iters_prologue = niters < peel_iters_prologue ?
3032                             niters : peel_iters_prologue;
3033       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3034       /* If we need to peel for gaps, but no peeling is required, we have to
3035          peel VF iterations.  */
3036       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3037         *peel_iters_epilogue = vf;
3038     }
3039
3040   stmt_info_for_cost *si;
3041   int j;
3042   if (peel_iters_prologue)
3043     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3044       retval += record_stmt_cost (prologue_cost_vec,
3045                                   si->count * peel_iters_prologue,
3046                                   si->kind, NULL, si->misalign,
3047                                   vect_prologue);
3048   if (*peel_iters_epilogue)
3049     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3050       retval += record_stmt_cost (epilogue_cost_vec,
3051                                   si->count * *peel_iters_epilogue,
3052                                   si->kind, NULL, si->misalign,
3053                                   vect_epilogue);
3054
3055   return retval;
3056 }
3057
3058 /* Function vect_estimate_min_profitable_iters
3059
3060    Return the number of iterations required for the vector version of the
3061    loop to be profitable relative to the cost of the scalar version of the
3062    loop.  */
3063
3064 static void
3065 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3066                                     int *ret_min_profitable_niters,
3067                                     int *ret_min_profitable_estimate)
3068 {
3069   int min_profitable_iters;
3070   int min_profitable_estimate;
3071   int peel_iters_prologue;
3072   int peel_iters_epilogue;
3073   unsigned vec_inside_cost = 0;
3074   int vec_outside_cost = 0;
3075   unsigned vec_prologue_cost = 0;
3076   unsigned vec_epilogue_cost = 0;
3077   int scalar_single_iter_cost = 0;
3078   int scalar_outside_cost = 0;
3079   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3080   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3081   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3082
3083   /* Cost model disabled.  */
3084   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3085     {
3086       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3087       *ret_min_profitable_niters = 0;
3088       *ret_min_profitable_estimate = 0;
3089       return;
3090     }
3091
3092   /* Requires loop versioning tests to handle misalignment.  */
3093   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3094     {
3095       /*  FIXME: Make cost depend on complexity of individual check.  */
3096       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3097       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3098                             vect_prologue);
3099       dump_printf (MSG_NOTE,
3100                    "cost model: Adding cost of checks for loop "
3101                    "versioning to treat misalignment.\n");
3102     }
3103
3104   /* Requires loop versioning with alias checks.  */
3105   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3106     {
3107       /*  FIXME: Make cost depend on complexity of individual check.  */
3108       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3109       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3110                             vect_prologue);
3111       dump_printf (MSG_NOTE,
3112                    "cost model: Adding cost of checks for loop "
3113                    "versioning aliasing.\n");
3114     }
3115
3116   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3117       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3118     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3119                           vect_prologue);
3120
3121   /* Count statements in scalar loop.  Using this as scalar cost for a single
3122      iteration for now.
3123
3124      TODO: Add outer loop support.
3125
3126      TODO: Consider assigning different costs to different scalar
3127      statements.  */
3128
3129   scalar_single_iter_cost
3130     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3131
3132   /* Add additional cost for the peeled instructions in prologue and epilogue
3133      loop.
3134
3135      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3136      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3137
3138      TODO: Build an expression that represents peel_iters for prologue and
3139      epilogue to be used in a run-time test.  */
3140
3141   if (npeel  < 0)
3142     {
3143       peel_iters_prologue = vf/2;
3144       dump_printf (MSG_NOTE, "cost model: "
3145                    "prologue peel iters set to vf/2.\n");
3146
3147       /* If peeling for alignment is unknown, loop bound of main loop becomes
3148          unknown.  */
3149       peel_iters_epilogue = vf/2;
3150       dump_printf (MSG_NOTE, "cost model: "
3151                    "epilogue peel iters set to vf/2 because "
3152                    "peeling for alignment is unknown.\n");
3153
3154       /* If peeled iterations are unknown, count a taken branch and a not taken
3155          branch per peeled loop. Even if scalar loop iterations are known,
3156          vector iterations are not known since peeled prologue iterations are
3157          not known. Hence guards remain the same.  */
3158       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3159                             NULL, 0, vect_prologue);
3160       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3161                             NULL, 0, vect_prologue);
3162       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3163                             NULL, 0, vect_epilogue);
3164       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3165                             NULL, 0, vect_epilogue);
3166       stmt_info_for_cost *si;
3167       int j;
3168       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3169         {
3170           struct _stmt_vec_info *stmt_info
3171             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3172           (void) add_stmt_cost (target_cost_data,
3173                                 si->count * peel_iters_prologue,
3174                                 si->kind, stmt_info, si->misalign,
3175                                 vect_prologue);
3176           (void) add_stmt_cost (target_cost_data,
3177                                 si->count * peel_iters_epilogue,
3178                                 si->kind, stmt_info, si->misalign,
3179                                 vect_epilogue);
3180         }
3181     }
3182   else
3183     {
3184       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3185       stmt_info_for_cost *si;
3186       int j;
3187       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3188
3189       prologue_cost_vec.create (2);
3190       epilogue_cost_vec.create (2);
3191       peel_iters_prologue = npeel;
3192
3193       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3194                                           &peel_iters_epilogue,
3195                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3196                                             (loop_vinfo),
3197                                           &prologue_cost_vec,
3198                                           &epilogue_cost_vec);
3199
3200       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3201         {
3202           struct _stmt_vec_info *stmt_info
3203             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3204           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3205                                 si->misalign, vect_prologue);
3206         }
3207
3208       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3209         {
3210           struct _stmt_vec_info *stmt_info
3211             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3212           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3213                                 si->misalign, vect_epilogue);
3214         }
3215
3216       prologue_cost_vec.release ();
3217       epilogue_cost_vec.release ();
3218     }
3219
3220   /* FORNOW: The scalar outside cost is incremented in one of the
3221      following ways:
3222
3223      1. The vectorizer checks for alignment and aliasing and generates
3224      a condition that allows dynamic vectorization.  A cost model
3225      check is ANDED with the versioning condition.  Hence scalar code
3226      path now has the added cost of the versioning check.
3227
3228        if (cost > th & versioning_check)
3229          jmp to vector code
3230
3231      Hence run-time scalar is incremented by not-taken branch cost.
3232
3233      2. The vectorizer then checks if a prologue is required.  If the
3234      cost model check was not done before during versioning, it has to
3235      be done before the prologue check.
3236
3237        if (cost <= th)
3238          prologue = scalar_iters
3239        if (prologue == 0)
3240          jmp to vector code
3241        else
3242          execute prologue
3243        if (prologue == num_iters)
3244          go to exit
3245
3246      Hence the run-time scalar cost is incremented by a taken branch,
3247      plus a not-taken branch, plus a taken branch cost.
3248
3249      3. The vectorizer then checks if an epilogue is required.  If the
3250      cost model check was not done before during prologue check, it
3251      has to be done with the epilogue check.
3252
3253        if (prologue == 0)
3254          jmp to vector code
3255        else
3256          execute prologue
3257        if (prologue == num_iters)
3258          go to exit
3259        vector code:
3260          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3261            jmp to epilogue
3262
3263      Hence the run-time scalar cost should be incremented by 2 taken
3264      branches.
3265
3266      TODO: The back end may reorder the BBS's differently and reverse
3267      conditions/branch directions.  Change the estimates below to
3268      something more reasonable.  */
3269
3270   /* If the number of iterations is known and we do not do versioning, we can
3271      decide whether to vectorize at compile time.  Hence the scalar version
3272      do not carry cost model guard costs.  */
3273   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3274       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3275       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3276     {
3277       /* Cost model check occurs at versioning.  */
3278       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3279           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3280         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3281       else
3282         {
3283           /* Cost model check occurs at prologue generation.  */
3284           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3285             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3286               + vect_get_stmt_cost (cond_branch_not_taken);
3287           /* Cost model check occurs at epilogue generation.  */
3288           else
3289             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3290         }
3291     }
3292
3293   /* Complete the target-specific cost calculations.  */
3294   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3295                &vec_inside_cost, &vec_epilogue_cost);
3296
3297   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3298
3299   if (dump_enabled_p ())
3300     {
3301       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3302       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3303                    vec_inside_cost);
3304       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3305                    vec_prologue_cost);
3306       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3307                    vec_epilogue_cost);
3308       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3309                    scalar_single_iter_cost);
3310       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3311                    scalar_outside_cost);
3312       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3313                    vec_outside_cost);
3314       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3315                    peel_iters_prologue);
3316       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3317                    peel_iters_epilogue);
3318     }
3319
3320   /* Calculate number of iterations required to make the vector version
3321      profitable, relative to the loop bodies only.  The following condition
3322      must hold true:
3323      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3324      where
3325      SIC = scalar iteration cost, VIC = vector iteration cost,
3326      VOC = vector outside cost, VF = vectorization factor,
3327      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3328      SOC = scalar outside cost for run time cost model check.  */
3329
3330   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3331     {
3332       if (vec_outside_cost <= 0)
3333         min_profitable_iters = 1;
3334       else
3335         {
3336           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3337                                   - vec_inside_cost * peel_iters_prologue
3338                                   - vec_inside_cost * peel_iters_epilogue)
3339                                  / ((scalar_single_iter_cost * vf)
3340                                     - vec_inside_cost);
3341
3342           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3343               <= (((int) vec_inside_cost * min_profitable_iters)
3344                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3345             min_profitable_iters++;
3346         }
3347     }
3348   /* vector version will never be profitable.  */
3349   else
3350     {
3351       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3352         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3353                     "did not happen for a simd loop");
3354
3355       if (dump_enabled_p ())
3356         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3357                          "cost model: the vector iteration cost = %d "
3358                          "divided by the scalar iteration cost = %d "
3359                          "is greater or equal to the vectorization factor = %d"
3360                          ".\n",
3361                          vec_inside_cost, scalar_single_iter_cost, vf);
3362       *ret_min_profitable_niters = -1;
3363       *ret_min_profitable_estimate = -1;
3364       return;
3365     }
3366
3367   dump_printf (MSG_NOTE,
3368                "  Calculated minimum iters for profitability: %d\n",
3369                min_profitable_iters);
3370
3371   min_profitable_iters =
3372         min_profitable_iters < vf ? vf : min_profitable_iters;
3373
3374   /* Because the condition we create is:
3375      if (niters <= min_profitable_iters)
3376        then skip the vectorized loop.  */
3377   min_profitable_iters--;
3378
3379   if (dump_enabled_p ())
3380     dump_printf_loc (MSG_NOTE, vect_location,
3381                      "  Runtime profitability threshold = %d\n",
3382                      min_profitable_iters);
3383
3384   *ret_min_profitable_niters = min_profitable_iters;
3385
3386   /* Calculate number of iterations required to make the vector version
3387      profitable, relative to the loop bodies only.
3388
3389      Non-vectorized variant is SIC * niters and it must win over vector
3390      variant on the expected loop trip count.  The following condition must hold true:
3391      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3392
3393   if (vec_outside_cost <= 0)
3394     min_profitable_estimate = 1;
3395   else
3396     {
3397       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3398                                  - vec_inside_cost * peel_iters_prologue
3399                                  - vec_inside_cost * peel_iters_epilogue)
3400                                  / ((scalar_single_iter_cost * vf)
3401                                    - vec_inside_cost);
3402     }
3403   min_profitable_estimate --;
3404   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3405   if (dump_enabled_p ())
3406     dump_printf_loc (MSG_NOTE, vect_location,
3407                      "  Static estimate profitability threshold = %d\n",
3408                       min_profitable_iters);
3409
3410   *ret_min_profitable_estimate = min_profitable_estimate;
3411 }
3412
3413 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3414    vector elements (not bits) for a vector of mode MODE.  */
3415 static void
3416 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3417                               unsigned char *sel)
3418 {
3419   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3420
3421   for (i = 0; i < nelt; i++)
3422     sel[i] = (i + offset) & (2*nelt - 1);
3423 }
3424
3425 /* Checks whether the target supports whole-vector shifts for vectors of mode
3426    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3427    it supports vec_perm_const with masks for all necessary shift amounts.  */
3428 static bool
3429 have_whole_vector_shift (enum machine_mode mode)
3430 {
3431   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3432     return true;
3433
3434   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3435     return false;
3436
3437   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3438   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3439
3440   for (i = nelt/2; i >= 1; i/=2)
3441     {
3442       calc_vec_perm_mask_for_shift (mode, i, sel);
3443       if (!can_vec_perm_p (mode, false, sel))
3444         return false;
3445     }
3446   return true;
3447 }
3448
3449 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3450
3451 static tree
3452 get_reduction_op (gimple *stmt, int reduc_index)
3453 {
3454   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3455     {
3456     case GIMPLE_SINGLE_RHS:
3457       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3458                   == ternary_op);
3459       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3460     case GIMPLE_UNARY_RHS:
3461       return gimple_assign_rhs1 (stmt);
3462     case GIMPLE_BINARY_RHS:
3463       return (reduc_index
3464               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3465     case GIMPLE_TERNARY_RHS:
3466       return gimple_op (stmt, reduc_index + 1);
3467     default:
3468       gcc_unreachable ();
3469     }
3470 }
3471
3472 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3473    functions. Design better to avoid maintenance issues.  */
3474
3475 /* Function vect_model_reduction_cost.
3476
3477    Models cost for a reduction operation, including the vector ops
3478    generated within the strip-mine loop, the initial definition before
3479    the loop, and the epilogue code that must be generated.  */
3480
3481 static bool
3482 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3483                            int ncopies, int reduc_index)
3484 {
3485   int prologue_cost = 0, epilogue_cost = 0;
3486   enum tree_code code;
3487   optab optab;
3488   tree vectype;
3489   gimple *stmt, *orig_stmt;
3490   tree reduction_op;
3491   machine_mode mode;
3492   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3493   struct loop *loop = NULL;
3494   void *target_cost_data;
3495
3496   if (loop_vinfo)
3497     {
3498       loop = LOOP_VINFO_LOOP (loop_vinfo);
3499       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3500     }
3501   else
3502     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3503
3504   /* Condition reductions generate two reductions in the loop.  */
3505   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3506     ncopies *= 2;
3507
3508   /* Cost of reduction op inside loop.  */
3509   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3510                                         stmt_info, 0, vect_body);
3511   stmt = STMT_VINFO_STMT (stmt_info);
3512
3513   reduction_op = get_reduction_op (stmt, reduc_index);
3514
3515   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3516   if (!vectype)
3517     {
3518       if (dump_enabled_p ())
3519         {
3520           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3521                            "unsupported data-type ");
3522           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3523                              TREE_TYPE (reduction_op));
3524           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3525         }
3526       return false;
3527    }
3528
3529   mode = TYPE_MODE (vectype);
3530   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3531
3532   if (!orig_stmt)
3533     orig_stmt = STMT_VINFO_STMT (stmt_info);
3534
3535   code = gimple_assign_rhs_code (orig_stmt);
3536
3537   /* Add in cost for initial definition.
3538      For cond reduction we have four vectors: initial index, step, initial
3539      result of the data reduction, initial value of the index reduction.  */
3540   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3541                        == COND_REDUCTION ? 4 : 1;
3542   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3543                                   scalar_to_vec, stmt_info, 0,
3544                                   vect_prologue);
3545
3546   /* Determine cost of epilogue code.
3547
3548      We have a reduction operator that will reduce the vector in one statement.
3549      Also requires scalar extract.  */
3550
3551   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3552     {
3553       if (reduc_code != ERROR_MARK)
3554         {
3555           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3556             {
3557               /* An EQ stmt and an COND_EXPR stmt.  */
3558               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3559                                               vector_stmt, stmt_info, 0,
3560                                               vect_epilogue);
3561               /* Reduction of the max index and a reduction of the found
3562                  values.  */
3563               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3564                                               vec_to_scalar, stmt_info, 0,
3565                                               vect_epilogue);
3566               /* A broadcast of the max value.  */
3567               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3568                                               scalar_to_vec, stmt_info, 0,
3569                                               vect_epilogue);
3570             }
3571           else
3572             {
3573               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3574                                               stmt_info, 0, vect_epilogue);
3575               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3576                                               vec_to_scalar, stmt_info, 0,
3577                                               vect_epilogue);
3578             }
3579         }
3580       else
3581         {
3582           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3583           tree bitsize =
3584             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3585           int element_bitsize = tree_to_uhwi (bitsize);
3586           int nelements = vec_size_in_bits / element_bitsize;
3587
3588           optab = optab_for_tree_code (code, vectype, optab_default);
3589
3590           /* We have a whole vector shift available.  */
3591           if (VECTOR_MODE_P (mode)
3592               && optab_handler (optab, mode) != CODE_FOR_nothing
3593               && have_whole_vector_shift (mode))
3594             {
3595               /* Final reduction via vector shifts and the reduction operator.
3596                  Also requires scalar extract.  */
3597               epilogue_cost += add_stmt_cost (target_cost_data,
3598                                               exact_log2 (nelements) * 2,
3599                                               vector_stmt, stmt_info, 0,
3600                                               vect_epilogue);
3601               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3602                                               vec_to_scalar, stmt_info, 0,
3603                                               vect_epilogue);
3604             }
3605           else
3606             /* Use extracts and reduction op for final reduction.  For N
3607                elements, we have N extracts and N-1 reduction ops.  */
3608             epilogue_cost += add_stmt_cost (target_cost_data,
3609                                             nelements + nelements - 1,
3610                                             vector_stmt, stmt_info, 0,
3611                                             vect_epilogue);
3612         }
3613     }
3614
3615   if (dump_enabled_p ())
3616     dump_printf (MSG_NOTE,
3617                  "vect_model_reduction_cost: inside_cost = %d, "
3618                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3619                  prologue_cost, epilogue_cost);
3620
3621   return true;
3622 }
3623
3624
3625 /* Function vect_model_induction_cost.
3626
3627    Models cost for induction operations.  */
3628
3629 static void
3630 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3631 {
3632   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3633   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3634   unsigned inside_cost, prologue_cost;
3635
3636   /* loop cost for vec_loop.  */
3637   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3638                                stmt_info, 0, vect_body);
3639
3640   /* prologue cost for vec_init and vec_step.  */
3641   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3642                                  stmt_info, 0, vect_prologue);
3643
3644   if (dump_enabled_p ())
3645     dump_printf_loc (MSG_NOTE, vect_location,
3646                      "vect_model_induction_cost: inside_cost = %d, "
3647                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3648 }
3649
3650
3651 /* Function get_initial_def_for_induction
3652
3653    Input:
3654    STMT - a stmt that performs an induction operation in the loop.
3655    IV_PHI - the initial value of the induction variable
3656
3657    Output:
3658    Return a vector variable, initialized with the first VF values of
3659    the induction variable.  E.g., for an iv with IV_PHI='X' and
3660    evolution S, for a vector of 4 units, we want to return:
3661    [X, X + S, X + 2*S, X + 3*S].  */
3662
3663 static tree
3664 get_initial_def_for_induction (gimple *iv_phi)
3665 {
3666   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3667   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3668   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3669   tree vectype;
3670   int nunits;
3671   edge pe = loop_preheader_edge (loop);
3672   struct loop *iv_loop;
3673   basic_block new_bb;
3674   tree new_vec, vec_init, vec_step, t;
3675   tree new_name;
3676   gimple *new_stmt;
3677   gphi *induction_phi;
3678   tree induc_def, vec_def, vec_dest;
3679   tree init_expr, step_expr;
3680   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3681   int i;
3682   int ncopies;
3683   tree expr;
3684   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3685   bool nested_in_vect_loop = false;
3686   gimple_seq stmts;
3687   imm_use_iterator imm_iter;
3688   use_operand_p use_p;
3689   gimple *exit_phi;
3690   edge latch_e;
3691   tree loop_arg;
3692   gimple_stmt_iterator si;
3693   basic_block bb = gimple_bb (iv_phi);
3694   tree stepvectype;
3695   tree resvectype;
3696
3697   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3698   if (nested_in_vect_loop_p (loop, iv_phi))
3699     {
3700       nested_in_vect_loop = true;
3701       iv_loop = loop->inner;
3702     }
3703   else
3704     iv_loop = loop;
3705   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3706
3707   latch_e = loop_latch_edge (iv_loop);
3708   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3709
3710   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3711   gcc_assert (step_expr != NULL_TREE);
3712
3713   pe = loop_preheader_edge (iv_loop);
3714   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3715                                      loop_preheader_edge (iv_loop));
3716
3717   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3718   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3719   gcc_assert (vectype);
3720   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3721   ncopies = vf / nunits;
3722
3723   gcc_assert (phi_info);
3724   gcc_assert (ncopies >= 1);
3725
3726   /* Convert the step to the desired type.  */
3727   stmts = NULL;
3728   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3729   if (stmts)
3730     {
3731       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3732       gcc_assert (!new_bb);
3733     }
3734
3735   /* Find the first insertion point in the BB.  */
3736   si = gsi_after_labels (bb);
3737
3738   /* Create the vector that holds the initial_value of the induction.  */
3739   if (nested_in_vect_loop)
3740     {
3741       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3742          been created during vectorization of previous stmts.  We obtain it
3743          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3744       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3745       /* If the initial value is not of proper type, convert it.  */
3746       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3747         {
3748           new_stmt
3749             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3750                                                           vect_simple_var,
3751                                                           "vec_iv_"),
3752                                    VIEW_CONVERT_EXPR,
3753                                    build1 (VIEW_CONVERT_EXPR, vectype,
3754                                            vec_init));
3755           vec_init = gimple_assign_lhs (new_stmt);
3756           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3757                                                  new_stmt);
3758           gcc_assert (!new_bb);
3759           set_vinfo_for_stmt (new_stmt,
3760                               new_stmt_vec_info (new_stmt, loop_vinfo));
3761         }
3762     }
3763   else
3764     {
3765       vec<constructor_elt, va_gc> *v;
3766
3767       /* iv_loop is the loop to be vectorized. Create:
3768          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3769       stmts = NULL;
3770       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3771
3772       vec_alloc (v, nunits);
3773       bool constant_p = is_gimple_min_invariant (new_name);
3774       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3775       for (i = 1; i < nunits; i++)
3776         {
3777           /* Create: new_name_i = new_name + step_expr  */
3778           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3779                                    new_name, step_expr);
3780           if (!is_gimple_min_invariant (new_name))
3781             constant_p = false;
3782           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3783         }
3784       if (stmts)
3785         {
3786           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3787           gcc_assert (!new_bb);
3788         }
3789
3790       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3791       if (constant_p)
3792         new_vec = build_vector_from_ctor (vectype, v);
3793       else
3794         new_vec = build_constructor (vectype, v);
3795       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3796     }
3797
3798
3799   /* Create the vector that holds the step of the induction.  */
3800   if (nested_in_vect_loop)
3801     /* iv_loop is nested in the loop to be vectorized. Generate:
3802        vec_step = [S, S, S, S]  */
3803     new_name = step_expr;
3804   else
3805     {
3806       /* iv_loop is the loop to be vectorized. Generate:
3807           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3808       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3809         {
3810           expr = build_int_cst (integer_type_node, vf);
3811           expr = fold_convert (TREE_TYPE (step_expr), expr);
3812         }
3813       else
3814         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3815       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3816                               expr, step_expr);
3817       if (TREE_CODE (step_expr) == SSA_NAME)
3818         new_name = vect_init_vector (iv_phi, new_name,
3819                                      TREE_TYPE (step_expr), NULL);
3820     }
3821
3822   t = unshare_expr (new_name);
3823   gcc_assert (CONSTANT_CLASS_P (new_name)
3824               || TREE_CODE (new_name) == SSA_NAME);
3825   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3826   gcc_assert (stepvectype);
3827   new_vec = build_vector_from_val (stepvectype, t);
3828   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3829
3830
3831   /* Create the following def-use cycle:
3832      loop prolog:
3833          vec_init = ...
3834          vec_step = ...
3835      loop:
3836          vec_iv = PHI <vec_init, vec_loop>
3837          ...
3838          STMT
3839          ...
3840          vec_loop = vec_iv + vec_step;  */
3841
3842   /* Create the induction-phi that defines the induction-operand.  */
3843   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3844   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3845   set_vinfo_for_stmt (induction_phi,
3846                       new_stmt_vec_info (induction_phi, loop_vinfo));
3847   induc_def = PHI_RESULT (induction_phi);
3848
3849   /* Create the iv update inside the loop  */
3850   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3851   vec_def = make_ssa_name (vec_dest, new_stmt);
3852   gimple_assign_set_lhs (new_stmt, vec_def);
3853   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3854   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3855
3856   /* Set the arguments of the phi node:  */
3857   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3858   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3859                UNKNOWN_LOCATION);
3860
3861
3862   /* In case that vectorization factor (VF) is bigger than the number
3863      of elements that we can fit in a vectype (nunits), we have to generate
3864      more than one vector stmt - i.e - we need to "unroll" the
3865      vector stmt by a factor VF/nunits.  For more details see documentation
3866      in vectorizable_operation.  */
3867
3868   if (ncopies > 1)
3869     {
3870       stmt_vec_info prev_stmt_vinfo;
3871       /* FORNOW. This restriction should be relaxed.  */
3872       gcc_assert (!nested_in_vect_loop);
3873
3874       /* Create the vector that holds the step of the induction.  */
3875       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3876         {
3877           expr = build_int_cst (integer_type_node, nunits);
3878           expr = fold_convert (TREE_TYPE (step_expr), expr);
3879         }
3880       else
3881         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3882       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3883                               expr, step_expr);
3884       if (TREE_CODE (step_expr) == SSA_NAME)
3885         new_name = vect_init_vector (iv_phi, new_name,
3886                                      TREE_TYPE (step_expr), NULL);
3887       t = unshare_expr (new_name);
3888       gcc_assert (CONSTANT_CLASS_P (new_name)
3889                   || TREE_CODE (new_name) == SSA_NAME);
3890       new_vec = build_vector_from_val (stepvectype, t);
3891       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3892
3893       vec_def = induc_def;
3894       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3895       for (i = 1; i < ncopies; i++)
3896         {
3897           /* vec_i = vec_prev + vec_step  */
3898           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3899                                           vec_def, vec_step);
3900           vec_def = make_ssa_name (vec_dest, new_stmt);
3901           gimple_assign_set_lhs (new_stmt, vec_def);
3902
3903           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3904           if (!useless_type_conversion_p (resvectype, vectype))
3905             {
3906               new_stmt
3907                 = gimple_build_assign
3908                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3909                                                 "vec_iv_"),
3910                          VIEW_CONVERT_EXPR,
3911                          build1 (VIEW_CONVERT_EXPR, resvectype,
3912                                  gimple_assign_lhs (new_stmt)));
3913               gimple_assign_set_lhs (new_stmt,
3914                                      make_ssa_name
3915                                        (gimple_assign_lhs (new_stmt), new_stmt));
3916               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3917             }
3918           set_vinfo_for_stmt (new_stmt,
3919                               new_stmt_vec_info (new_stmt, loop_vinfo));
3920           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3921           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3922         }
3923     }
3924
3925   if (nested_in_vect_loop)
3926     {
3927       /* Find the loop-closed exit-phi of the induction, and record
3928          the final vector of induction results:  */
3929       exit_phi = NULL;
3930       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3931         {
3932           gimple *use_stmt = USE_STMT (use_p);
3933           if (is_gimple_debug (use_stmt))
3934             continue;
3935
3936           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3937             {
3938               exit_phi = use_stmt;
3939               break;
3940             }
3941         }
3942       if (exit_phi)
3943         {
3944           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3945           /* FORNOW. Currently not supporting the case that an inner-loop induction
3946              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3947           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3948                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3949
3950           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3951           if (dump_enabled_p ())
3952             {
3953               dump_printf_loc (MSG_NOTE, vect_location,
3954                                "vector of inductions after inner-loop:");
3955               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3956               dump_printf (MSG_NOTE, "\n");
3957             }
3958         }
3959     }
3960
3961
3962   if (dump_enabled_p ())
3963     {
3964       dump_printf_loc (MSG_NOTE, vect_location,
3965                        "transform induction: created def-use cycle: ");
3966       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3967       dump_printf (MSG_NOTE, "\n");
3968       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3969                         SSA_NAME_DEF_STMT (vec_def), 0);
3970       dump_printf (MSG_NOTE, "\n");
3971     }
3972
3973   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3974   if (!useless_type_conversion_p (resvectype, vectype))
3975     {
3976       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3977                                                              vect_simple_var,
3978                                                              "vec_iv_"),
3979                                       VIEW_CONVERT_EXPR,
3980                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3981                                               induc_def));
3982       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3983       gimple_assign_set_lhs (new_stmt, induc_def);
3984       si = gsi_after_labels (bb);
3985       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3986       set_vinfo_for_stmt (new_stmt,
3987                           new_stmt_vec_info (new_stmt, loop_vinfo));
3988       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3989         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3990     }
3991
3992   return induc_def;
3993 }
3994
3995
3996 /* Function get_initial_def_for_reduction
3997
3998    Input:
3999    STMT - a stmt that performs a reduction operation in the loop.
4000    INIT_VAL - the initial value of the reduction variable
4001
4002    Output:
4003    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4004         of the reduction (used for adjusting the epilog - see below).
4005    Return a vector variable, initialized according to the operation that STMT
4006         performs. This vector will be used as the initial value of the
4007         vector of partial results.
4008
4009    Option1 (adjust in epilog): Initialize the vector as follows:
4010      add/bit or/xor:    [0,0,...,0,0]
4011      mult/bit and:      [1,1,...,1,1]
4012      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4013    and when necessary (e.g. add/mult case) let the caller know
4014    that it needs to adjust the result by init_val.
4015
4016    Option2: Initialize the vector as follows:
4017      add/bit or/xor:    [init_val,0,0,...,0]
4018      mult/bit and:      [init_val,1,1,...,1]
4019      min/max/cond_expr: [init_val,init_val,...,init_val]
4020    and no adjustments are needed.
4021
4022    For example, for the following code:
4023
4024    s = init_val;
4025    for (i=0;i<n;i++)
4026      s = s + a[i];
4027
4028    STMT is 's = s + a[i]', and the reduction variable is 's'.
4029    For a vector of 4 units, we want to return either [0,0,0,init_val],
4030    or [0,0,0,0] and let the caller know that it needs to adjust
4031    the result at the end by 'init_val'.
4032
4033    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4034    initialization vector is simpler (same element in all entries), if
4035    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4036
4037    A cost model should help decide between these two schemes.  */
4038
4039 tree
4040 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4041                                tree *adjustment_def)
4042 {
4043   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4044   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4045   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4046   tree scalar_type = TREE_TYPE (init_val);
4047   tree vectype = get_vectype_for_scalar_type (scalar_type);
4048   int nunits;
4049   enum tree_code code = gimple_assign_rhs_code (stmt);
4050   tree def_for_init;
4051   tree init_def;
4052   tree *elts;
4053   int i;
4054   bool nested_in_vect_loop = false;
4055   tree init_value;
4056   REAL_VALUE_TYPE real_init_val = dconst0;
4057   int int_init_val = 0;
4058   gimple *def_stmt = NULL;
4059
4060   gcc_assert (vectype);
4061   nunits = TYPE_VECTOR_SUBPARTS (vectype);
4062
4063   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4064               || SCALAR_FLOAT_TYPE_P (scalar_type));
4065
4066   if (nested_in_vect_loop_p (loop, stmt))
4067     nested_in_vect_loop = true;
4068   else
4069     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4070
4071   /* In case of double reduction we only create a vector variable to be put
4072      in the reduction phi node.  The actual statement creation is done in
4073      vect_create_epilog_for_reduction.  */
4074   if (adjustment_def && nested_in_vect_loop
4075       && TREE_CODE (init_val) == SSA_NAME
4076       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4077       && gimple_code (def_stmt) == GIMPLE_PHI
4078       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4079       && vinfo_for_stmt (def_stmt)
4080       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4081           == vect_double_reduction_def)
4082     {
4083       *adjustment_def = NULL;
4084       return vect_create_destination_var (init_val, vectype);
4085     }
4086
4087   if (TREE_CONSTANT (init_val))
4088     {
4089       if (SCALAR_FLOAT_TYPE_P (scalar_type))
4090         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
4091       else
4092         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
4093     }
4094   else
4095     init_value = init_val;
4096
4097   switch (code)
4098     {
4099       case WIDEN_SUM_EXPR:
4100       case DOT_PROD_EXPR:
4101       case SAD_EXPR:
4102       case PLUS_EXPR:
4103       case MINUS_EXPR:
4104       case BIT_IOR_EXPR:
4105       case BIT_XOR_EXPR:
4106       case MULT_EXPR:
4107       case BIT_AND_EXPR:
4108         /* ADJUSMENT_DEF is NULL when called from
4109            vect_create_epilog_for_reduction to vectorize double reduction.  */
4110         if (adjustment_def)
4111           {
4112             if (nested_in_vect_loop)
4113               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
4114             else
4115               *adjustment_def = init_val;
4116           }
4117
4118         if (code == MULT_EXPR)
4119           {
4120             real_init_val = dconst1;
4121             int_init_val = 1;
4122           }
4123
4124         if (code == BIT_AND_EXPR)
4125           int_init_val = -1;
4126
4127         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4128           def_for_init = build_real (scalar_type, real_init_val);
4129         else
4130           def_for_init = build_int_cst (scalar_type, int_init_val);
4131
4132         /* Create a vector of '0' or '1' except the first element.  */
4133         elts = XALLOCAVEC (tree, nunits);
4134         for (i = nunits - 2; i >= 0; --i)
4135           elts[i + 1] = def_for_init;
4136
4137         /* Option1: the first element is '0' or '1' as well.  */
4138         if (adjustment_def)
4139           {
4140             elts[0] = def_for_init;
4141             init_def = build_vector (vectype, elts);
4142             break;
4143           }
4144
4145         /* Option2: the first element is INIT_VAL.  */
4146         elts[0] = init_val;
4147         if (TREE_CONSTANT (init_val))
4148           init_def = build_vector (vectype, elts);
4149         else
4150           {
4151             vec<constructor_elt, va_gc> *v;
4152             vec_alloc (v, nunits);
4153             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4154             for (i = 1; i < nunits; ++i)
4155               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4156             init_def = build_constructor (vectype, v);
4157           }
4158
4159         break;
4160
4161       case MIN_EXPR:
4162       case MAX_EXPR:
4163       case COND_EXPR:
4164         if (adjustment_def)
4165           {
4166             *adjustment_def = NULL_TREE;
4167             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4168               {
4169                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4170                 break;
4171               }
4172           }
4173         init_def = build_vector_from_val (vectype, init_value);
4174         break;
4175
4176       default:
4177         gcc_unreachable ();
4178     }
4179
4180   return init_def;
4181 }
4182
4183 /* Function vect_create_epilog_for_reduction
4184
4185    Create code at the loop-epilog to finalize the result of a reduction
4186    computation.
4187
4188    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4189      reduction statements.
4190    STMT is the scalar reduction stmt that is being vectorized.
4191    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4192      number of elements that we can fit in a vectype (nunits).  In this case
4193      we have to generate more than one vector stmt - i.e - we need to "unroll"
4194      the vector stmt by a factor VF/nunits.  For more details see documentation
4195      in vectorizable_operation.
4196    REDUC_CODE is the tree-code for the epilog reduction.
4197    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4198      computation.
4199    REDUC_INDEX is the index of the operand in the right hand side of the
4200      statement that is defined by REDUCTION_PHI.
4201    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4202    SLP_NODE is an SLP node containing a group of reduction statements. The
4203      first one in this group is STMT.
4204    INDUCTION_INDEX is the index of the loop for condition reductions.
4205      Otherwise it is undefined.
4206
4207    This function:
4208    1. Creates the reduction def-use cycles: sets the arguments for
4209       REDUCTION_PHIS:
4210       The loop-entry argument is the vectorized initial-value of the reduction.
4211       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4212       sums.
4213    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4214       by applying the operation specified by REDUC_CODE if available, or by
4215       other means (whole-vector shifts or a scalar loop).
4216       The function also creates a new phi node at the loop exit to preserve
4217       loop-closed form, as illustrated below.
4218
4219      The flow at the entry to this function:
4220
4221         loop:
4222           vec_def = phi <null, null>            # REDUCTION_PHI
4223           VECT_DEF = vector_stmt                # vectorized form of STMT
4224           s_loop = scalar_stmt                  # (scalar) STMT
4225         loop_exit:
4226           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4227           use <s_out0>
4228           use <s_out0>
4229
4230      The above is transformed by this function into:
4231
4232         loop:
4233           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4234           VECT_DEF = vector_stmt                # vectorized form of STMT
4235           s_loop = scalar_stmt                  # (scalar) STMT
4236         loop_exit:
4237           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4238           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4239           v_out2 = reduce <v_out1>
4240           s_out3 = extract_field <v_out2, 0>
4241           s_out4 = adjust_result <s_out3>
4242           use <s_out4>
4243           use <s_out4>
4244 */
4245
4246 static void
4247 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4248                                   int ncopies, enum tree_code reduc_code,
4249                                   vec<gimple *> reduction_phis,
4250                                   int reduc_index, bool double_reduc,
4251                                   slp_tree slp_node, tree induction_index)
4252 {
4253   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4254   stmt_vec_info prev_phi_info;
4255   tree vectype;
4256   machine_mode mode;
4257   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4258   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4259   basic_block exit_bb;
4260   tree scalar_dest;
4261   tree scalar_type;
4262   gimple *new_phi = NULL, *phi;
4263   gimple_stmt_iterator exit_gsi;
4264   tree vec_dest;
4265   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4266   gimple *epilog_stmt = NULL;
4267   enum tree_code code = gimple_assign_rhs_code (stmt);
4268   gimple *exit_phi;
4269   tree bitsize;
4270   tree adjustment_def = NULL;
4271   tree vec_initial_def = NULL;
4272   tree reduction_op, expr, def, initial_def = NULL;
4273   tree orig_name, scalar_result;
4274   imm_use_iterator imm_iter, phi_imm_iter;
4275   use_operand_p use_p, phi_use_p;
4276   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4277   bool nested_in_vect_loop = false;
4278   auto_vec<gimple *> new_phis;
4279   auto_vec<gimple *> inner_phis;
4280   enum vect_def_type dt = vect_unknown_def_type;
4281   int j, i;
4282   auto_vec<tree> scalar_results;
4283   unsigned int group_size = 1, k, ratio;
4284   auto_vec<tree> vec_initial_defs;
4285   auto_vec<gimple *> phis;
4286   bool slp_reduc = false;
4287   tree new_phi_result;
4288   gimple *inner_phi = NULL;
4289
4290   if (slp_node)
4291     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4292
4293   if (nested_in_vect_loop_p (loop, stmt))
4294     {
4295       outer_loop = loop;
4296       loop = loop->inner;
4297       nested_in_vect_loop = true;
4298       gcc_assert (!slp_node);
4299     }
4300
4301   reduction_op = get_reduction_op (stmt, reduc_index);
4302
4303   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4304   gcc_assert (vectype);
4305   mode = TYPE_MODE (vectype);
4306
4307   /* 1. Create the reduction def-use cycle:
4308      Set the arguments of REDUCTION_PHIS, i.e., transform
4309
4310         loop:
4311           vec_def = phi <null, null>            # REDUCTION_PHI
4312           VECT_DEF = vector_stmt                # vectorized form of STMT
4313           ...
4314
4315      into:
4316
4317         loop:
4318           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4319           VECT_DEF = vector_stmt                # vectorized form of STMT
4320           ...
4321
4322      (in case of SLP, do it for all the phis). */
4323
4324   /* Get the loop-entry arguments.  */
4325   if (slp_node)
4326     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4327                        NULL, slp_node, reduc_index);
4328   else
4329     {
4330       /* Get at the scalar def before the loop, that defines the initial value
4331          of the reduction variable.  */
4332       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4333       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4334                                            loop_preheader_edge (loop));
4335       vec_initial_defs.create (1);
4336       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4337                                                        &adjustment_def);
4338       vec_initial_defs.quick_push (vec_initial_def);
4339     }
4340
4341   /* Set phi nodes arguments.  */
4342   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4343     {
4344       tree vec_init_def, def;
4345       gimple_seq stmts;
4346       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4347                                            true, NULL_TREE);
4348       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4349       def = vect_defs[i];
4350       for (j = 0; j < ncopies; j++)
4351         {
4352           /* Set the loop-entry arg of the reduction-phi.  */
4353
4354           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4355               == INTEGER_INDUC_COND_REDUCTION)
4356             {
4357               /* Initialise the reduction phi to zero.  This prevents initial
4358                  values of non-zero interferring with the reduction op.  */
4359               gcc_assert (ncopies == 1);
4360               gcc_assert (i == 0);
4361
4362               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4363               tree zero_vec = build_zero_cst (vec_init_def_type);
4364
4365               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4366                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4367             }
4368           else
4369             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4370                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4371
4372           /* Set the loop-latch arg for the reduction-phi.  */
4373           if (j > 0)
4374             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4375
4376           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4377                        UNKNOWN_LOCATION);
4378
4379           if (dump_enabled_p ())
4380             {
4381               dump_printf_loc (MSG_NOTE, vect_location,
4382                                "transform reduction: created def-use cycle: ");
4383               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4384               dump_printf (MSG_NOTE, "\n");
4385               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4386               dump_printf (MSG_NOTE, "\n");
4387             }
4388
4389           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4390         }
4391     }
4392
4393   /* 2. Create epilog code.
4394         The reduction epilog code operates across the elements of the vector
4395         of partial results computed by the vectorized loop.
4396         The reduction epilog code consists of:
4397
4398         step 1: compute the scalar result in a vector (v_out2)
4399         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4400         step 3: adjust the scalar result (s_out3) if needed.
4401
4402         Step 1 can be accomplished using one the following three schemes:
4403           (scheme 1) using reduc_code, if available.
4404           (scheme 2) using whole-vector shifts, if available.
4405           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4406                      combined.
4407
4408           The overall epilog code looks like this:
4409
4410           s_out0 = phi <s_loop>         # original EXIT_PHI
4411           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4412           v_out2 = reduce <v_out1>              # step 1
4413           s_out3 = extract_field <v_out2, 0>    # step 2
4414           s_out4 = adjust_result <s_out3>       # step 3
4415
4416           (step 3 is optional, and steps 1 and 2 may be combined).
4417           Lastly, the uses of s_out0 are replaced by s_out4.  */
4418
4419
4420   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4421          v_out1 = phi <VECT_DEF>
4422          Store them in NEW_PHIS.  */
4423
4424   exit_bb = single_exit (loop)->dest;
4425   prev_phi_info = NULL;
4426   new_phis.create (vect_defs.length ());
4427   FOR_EACH_VEC_ELT (vect_defs, i, def)
4428     {
4429       for (j = 0; j < ncopies; j++)
4430         {
4431           tree new_def = copy_ssa_name (def);
4432           phi = create_phi_node (new_def, exit_bb);
4433           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4434           if (j == 0)
4435             new_phis.quick_push (phi);
4436           else
4437             {
4438               def = vect_get_vec_def_for_stmt_copy (dt, def);
4439               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4440             }
4441
4442           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4443           prev_phi_info = vinfo_for_stmt (phi);
4444         }
4445     }
4446
4447   /* The epilogue is created for the outer-loop, i.e., for the loop being
4448      vectorized.  Create exit phis for the outer loop.  */
4449   if (double_reduc)
4450     {
4451       loop = outer_loop;
4452       exit_bb = single_exit (loop)->dest;
4453       inner_phis.create (vect_defs.length ());
4454       FOR_EACH_VEC_ELT (new_phis, i, phi)
4455         {
4456           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4457           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4458           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4459                            PHI_RESULT (phi));
4460           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4461                                                             loop_vinfo));
4462           inner_phis.quick_push (phi);
4463           new_phis[i] = outer_phi;
4464           prev_phi_info = vinfo_for_stmt (outer_phi);
4465           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4466             {
4467               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4468               new_result = copy_ssa_name (PHI_RESULT (phi));
4469               outer_phi = create_phi_node (new_result, exit_bb);
4470               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4471                                PHI_RESULT (phi));
4472               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4473                                                                 loop_vinfo));
4474               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4475               prev_phi_info = vinfo_for_stmt (outer_phi);
4476             }
4477         }
4478     }
4479
4480   exit_gsi = gsi_after_labels (exit_bb);
4481
4482   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4483          (i.e. when reduc_code is not available) and in the final adjustment
4484          code (if needed).  Also get the original scalar reduction variable as
4485          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4486          represents a reduction pattern), the tree-code and scalar-def are
4487          taken from the original stmt that the pattern-stmt (STMT) replaces.
4488          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4489          are taken from STMT.  */
4490
4491   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4492   if (!orig_stmt)
4493     {
4494       /* Regular reduction  */
4495       orig_stmt = stmt;
4496     }
4497   else
4498     {
4499       /* Reduction pattern  */
4500       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4501       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4502       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4503     }
4504
4505   code = gimple_assign_rhs_code (orig_stmt);
4506   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4507      partial results are added and not subtracted.  */
4508   if (code == MINUS_EXPR)
4509     code = PLUS_EXPR;
4510
4511   scalar_dest = gimple_assign_lhs (orig_stmt);
4512   scalar_type = TREE_TYPE (scalar_dest);
4513   scalar_results.create (group_size);
4514   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4515   bitsize = TYPE_SIZE (scalar_type);
4516
4517   /* In case this is a reduction in an inner-loop while vectorizing an outer
4518      loop - we don't need to extract a single scalar result at the end of the
4519      inner-loop (unless it is double reduction, i.e., the use of reduction is
4520      outside the outer-loop).  The final vector of partial results will be used
4521      in the vectorized outer-loop, or reduced to a scalar result at the end of
4522      the outer-loop.  */
4523   if (nested_in_vect_loop && !double_reduc)
4524     goto vect_finalize_reduction;
4525
4526   /* SLP reduction without reduction chain, e.g.,
4527      # a1 = phi <a2, a0>
4528      # b1 = phi <b2, b0>
4529      a2 = operation (a1)
4530      b2 = operation (b1)  */
4531   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4532
4533   /* In case of reduction chain, e.g.,
4534      # a1 = phi <a3, a0>
4535      a2 = operation (a1)
4536      a3 = operation (a2),
4537
4538      we may end up with more than one vector result.  Here we reduce them to
4539      one vector.  */
4540   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4541     {
4542       tree first_vect = PHI_RESULT (new_phis[0]);
4543       tree tmp;
4544       gassign *new_vec_stmt = NULL;
4545
4546       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4547       for (k = 1; k < new_phis.length (); k++)
4548         {
4549           gimple *next_phi = new_phis[k];
4550           tree second_vect = PHI_RESULT (next_phi);
4551
4552           tmp = build2 (code, vectype,  first_vect, second_vect);
4553           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4554           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4555           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4556           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4557         }
4558
4559       new_phi_result = first_vect;
4560       if (new_vec_stmt)
4561         {
4562           new_phis.truncate (0);
4563           new_phis.safe_push (new_vec_stmt);
4564         }
4565     }
4566   else
4567     new_phi_result = PHI_RESULT (new_phis[0]);
4568
4569   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4570     {
4571       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4572          various data values where the condition matched and another vector
4573          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4574          need to extract the last matching index (which will be the index with
4575          highest value) and use this to index into the data vector.
4576          For the case where there were no matches, the data vector will contain
4577          all default values and the index vector will be all zeros.  */
4578
4579       /* Get various versions of the type of the vector of indexes.  */
4580       tree index_vec_type = TREE_TYPE (induction_index);
4581       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4582       tree index_scalar_type = TREE_TYPE (index_vec_type);
4583       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4584         (index_vec_type);
4585
4586       /* Get an unsigned integer version of the type of the data vector.  */
4587       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4588       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4589       tree vectype_unsigned = build_vector_type
4590         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4591
4592       /* First we need to create a vector (ZERO_VEC) of zeros and another
4593          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4594          can create using a MAX reduction and then expanding.
4595          In the case where the loop never made any matches, the max index will
4596          be zero.  */
4597
4598       /* Vector of {0, 0, 0,...}.  */
4599       tree zero_vec = make_ssa_name (vectype);
4600       tree zero_vec_rhs = build_zero_cst (vectype);
4601       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4602       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4603
4604       /* Find maximum value from the vector of found indexes.  */
4605       tree max_index = make_ssa_name (index_scalar_type);
4606       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4607                                                     induction_index);
4608       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4609
4610       /* Vector of {max_index, max_index, max_index,...}.  */
4611       tree max_index_vec = make_ssa_name (index_vec_type);
4612       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4613                                                       max_index);
4614       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4615                                                         max_index_vec_rhs);
4616       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4617
4618       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4619          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4620          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4621          otherwise.  Only one value should match, resulting in a vector
4622          (VEC_COND) with one data value and the rest zeros.
4623          In the case where the loop never made any matches, every index will
4624          match, resulting in a vector with all data values (which will all be
4625          the default value).  */
4626
4627       /* Compare the max index vector to the vector of found indexes to find
4628          the position of the max value.  */
4629       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4630       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4631                                                       induction_index,
4632                                                       max_index_vec);
4633       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4634
4635       /* Use the compare to choose either values from the data vector or
4636          zero.  */
4637       tree vec_cond = make_ssa_name (vectype);
4638       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4639                                                    vec_compare, new_phi_result,
4640                                                    zero_vec);
4641       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4642
4643       /* Finally we need to extract the data value from the vector (VEC_COND)
4644          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4645          reduction, but because this doesn't exist, we can use a MAX reduction
4646          instead.  The data value might be signed or a float so we need to cast
4647          it first.
4648          In the case where the loop never made any matches, the data values are
4649          all identical, and so will reduce down correctly.  */
4650
4651       /* Make the matched data values unsigned.  */
4652       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4653       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4654                                        vec_cond);
4655       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4656                                                         VIEW_CONVERT_EXPR,
4657                                                         vec_cond_cast_rhs);
4658       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4659
4660       /* Reduce down to a scalar value.  */
4661       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4662       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4663                                       optab_default);
4664       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4665                   != CODE_FOR_nothing);
4666       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4667                                                      REDUC_MAX_EXPR,
4668                                                      vec_cond_cast);
4669       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4670
4671       /* Convert the reduced value back to the result type and set as the
4672          result.  */
4673       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4674                                      data_reduc);
4675       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4676       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4677       gimple_assign_set_lhs (epilog_stmt, new_temp);
4678       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4679       scalar_results.safe_push (new_temp);
4680     }
4681
4682   /* 2.3 Create the reduction code, using one of the three schemes described
4683          above. In SLP we simply need to extract all the elements from the
4684          vector (without reducing them), so we use scalar shifts.  */
4685   else if (reduc_code != ERROR_MARK && !slp_reduc)
4686     {
4687       tree tmp;
4688       tree vec_elem_type;
4689
4690       /*** Case 1:  Create:
4691            v_out2 = reduc_expr <v_out1>  */
4692
4693       if (dump_enabled_p ())
4694         dump_printf_loc (MSG_NOTE, vect_location,
4695                          "Reduce using direct vector reduction.\n");
4696
4697       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4698       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4699         {
4700           tree tmp_dest =
4701               vect_create_destination_var (scalar_dest, vec_elem_type);
4702           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4703           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4704           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4705           gimple_assign_set_lhs (epilog_stmt, new_temp);
4706           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4707
4708           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4709         }
4710       else
4711         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4712
4713       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4714       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4715       gimple_assign_set_lhs (epilog_stmt, new_temp);
4716       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4717
4718       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4719           == INTEGER_INDUC_COND_REDUCTION)
4720         {
4721           /* Earlier we set the initial value to be zero.  Check the result
4722              and if it is zero then replace with the original initial
4723              value.  */
4724           tree zero = build_zero_cst (scalar_type);
4725           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4726
4727           tmp = make_ssa_name (new_scalar_dest);
4728           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4729                                              initial_def, new_temp);
4730           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4731           new_temp = tmp;
4732         }
4733
4734       scalar_results.safe_push (new_temp);
4735     }
4736   else
4737     {
4738       bool reduce_with_shift = have_whole_vector_shift (mode);
4739       int element_bitsize = tree_to_uhwi (bitsize);
4740       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4741       tree vec_temp;
4742
4743       /* Regardless of whether we have a whole vector shift, if we're
4744          emulating the operation via tree-vect-generic, we don't want
4745          to use it.  Only the first round of the reduction is likely
4746          to still be profitable via emulation.  */
4747       /* ??? It might be better to emit a reduction tree code here, so that
4748          tree-vect-generic can expand the first round via bit tricks.  */
4749       if (!VECTOR_MODE_P (mode))
4750         reduce_with_shift = false;
4751       else
4752         {
4753           optab optab = optab_for_tree_code (code, vectype, optab_default);
4754           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4755             reduce_with_shift = false;
4756         }
4757
4758       if (reduce_with_shift && !slp_reduc)
4759         {
4760           int nelements = vec_size_in_bits / element_bitsize;
4761           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4762
4763           int elt_offset;
4764
4765           tree zero_vec = build_zero_cst (vectype);
4766           /*** Case 2: Create:
4767              for (offset = nelements/2; offset >= 1; offset/=2)
4768                 {
4769                   Create:  va' = vec_shift <va, offset>
4770                   Create:  va = vop <va, va'>
4771                 }  */
4772
4773           tree rhs;
4774
4775           if (dump_enabled_p ())
4776             dump_printf_loc (MSG_NOTE, vect_location,
4777                              "Reduce using vector shifts\n");
4778
4779           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4780           new_temp = new_phi_result;
4781           for (elt_offset = nelements / 2;
4782                elt_offset >= 1;
4783                elt_offset /= 2)
4784             {
4785               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4786               tree mask = vect_gen_perm_mask_any (vectype, sel);
4787               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4788                                                  new_temp, zero_vec, mask);
4789               new_name = make_ssa_name (vec_dest, epilog_stmt);
4790               gimple_assign_set_lhs (epilog_stmt, new_name);
4791               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4792
4793               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4794                                                  new_temp);
4795               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4796               gimple_assign_set_lhs (epilog_stmt, new_temp);
4797               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4798             }
4799
4800           /* 2.4  Extract the final scalar result.  Create:
4801              s_out3 = extract_field <v_out2, bitpos>  */
4802
4803           if (dump_enabled_p ())
4804             dump_printf_loc (MSG_NOTE, vect_location,
4805                              "extract scalar result\n");
4806
4807           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4808                         bitsize, bitsize_zero_node);
4809           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4810           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4811           gimple_assign_set_lhs (epilog_stmt, new_temp);
4812           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4813           scalar_results.safe_push (new_temp);
4814         }
4815       else
4816         {
4817           /*** Case 3: Create:
4818              s = extract_field <v_out2, 0>
4819              for (offset = element_size;
4820                   offset < vector_size;
4821                   offset += element_size;)
4822                {
4823                  Create:  s' = extract_field <v_out2, offset>
4824                  Create:  s = op <s, s'>  // For non SLP cases
4825                }  */
4826
4827           if (dump_enabled_p ())
4828             dump_printf_loc (MSG_NOTE, vect_location,
4829                              "Reduce using scalar code.\n");
4830
4831           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4832           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4833             {
4834               int bit_offset;
4835               if (gimple_code (new_phi) == GIMPLE_PHI)
4836                 vec_temp = PHI_RESULT (new_phi);
4837               else
4838                 vec_temp = gimple_assign_lhs (new_phi);
4839               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4840                             bitsize_zero_node);
4841               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4842               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4843               gimple_assign_set_lhs (epilog_stmt, new_temp);
4844               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4845
4846               /* In SLP we don't need to apply reduction operation, so we just
4847                  collect s' values in SCALAR_RESULTS.  */
4848               if (slp_reduc)
4849                 scalar_results.safe_push (new_temp);
4850
4851               for (bit_offset = element_bitsize;
4852                    bit_offset < vec_size_in_bits;
4853                    bit_offset += element_bitsize)
4854                 {
4855                   tree bitpos = bitsize_int (bit_offset);
4856                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4857                                      bitsize, bitpos);
4858
4859                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4860                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4861                   gimple_assign_set_lhs (epilog_stmt, new_name);
4862                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4863
4864                   if (slp_reduc)
4865                     {
4866                       /* In SLP we don't need to apply reduction operation, so
4867                          we just collect s' values in SCALAR_RESULTS.  */
4868                       new_temp = new_name;
4869                       scalar_results.safe_push (new_name);
4870                     }
4871                   else
4872                     {
4873                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4874                                                          new_name, new_temp);
4875                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4876                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4877                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4878                     }
4879                 }
4880             }
4881
4882           /* The only case where we need to reduce scalar results in SLP, is
4883              unrolling.  If the size of SCALAR_RESULTS is greater than
4884              GROUP_SIZE, we reduce them combining elements modulo
4885              GROUP_SIZE.  */
4886           if (slp_reduc)
4887             {
4888               tree res, first_res, new_res;
4889               gimple *new_stmt;
4890
4891               /* Reduce multiple scalar results in case of SLP unrolling.  */
4892               for (j = group_size; scalar_results.iterate (j, &res);
4893                    j++)
4894                 {
4895                   first_res = scalar_results[j % group_size];
4896                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4897                                                   first_res, res);
4898                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4899                   gimple_assign_set_lhs (new_stmt, new_res);
4900                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4901                   scalar_results[j % group_size] = new_res;
4902                 }
4903             }
4904           else
4905             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4906             scalar_results.safe_push (new_temp);
4907         }
4908     }
4909
4910 vect_finalize_reduction:
4911
4912   if (double_reduc)
4913     loop = loop->inner;
4914
4915   /* 2.5 Adjust the final result by the initial value of the reduction
4916          variable. (When such adjustment is not needed, then
4917          'adjustment_def' is zero).  For example, if code is PLUS we create:
4918          new_temp = loop_exit_def + adjustment_def  */
4919
4920   if (adjustment_def)
4921     {
4922       gcc_assert (!slp_reduc);
4923       if (nested_in_vect_loop)
4924         {
4925           new_phi = new_phis[0];
4926           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4927           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4928           new_dest = vect_create_destination_var (scalar_dest, vectype);
4929         }
4930       else
4931         {
4932           new_temp = scalar_results[0];
4933           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4934           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4935           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4936         }
4937
4938       epilog_stmt = gimple_build_assign (new_dest, expr);
4939       new_temp = make_ssa_name (new_dest, epilog_stmt);
4940       gimple_assign_set_lhs (epilog_stmt, new_temp);
4941       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942       if (nested_in_vect_loop)
4943         {
4944           set_vinfo_for_stmt (epilog_stmt,
4945                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
4946           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4947                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4948
4949           if (!double_reduc)
4950             scalar_results.quick_push (new_temp);
4951           else
4952             scalar_results[0] = new_temp;
4953         }
4954       else
4955         scalar_results[0] = new_temp;
4956
4957       new_phis[0] = epilog_stmt;
4958     }
4959
4960   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4961           phis with new adjusted scalar results, i.e., replace use <s_out0>
4962           with use <s_out4>.
4963
4964      Transform:
4965         loop_exit:
4966           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4967           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4968           v_out2 = reduce <v_out1>
4969           s_out3 = extract_field <v_out2, 0>
4970           s_out4 = adjust_result <s_out3>
4971           use <s_out0>
4972           use <s_out0>
4973
4974      into:
4975
4976         loop_exit:
4977           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4978           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4979           v_out2 = reduce <v_out1>
4980           s_out3 = extract_field <v_out2, 0>
4981           s_out4 = adjust_result <s_out3>
4982           use <s_out4>
4983           use <s_out4> */
4984
4985
4986   /* In SLP reduction chain we reduce vector results into one vector if
4987      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4988      the last stmt in the reduction chain, since we are looking for the loop
4989      exit phi node.  */
4990   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4991     {
4992       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4993       /* Handle reduction patterns.  */
4994       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4995         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4996
4997       scalar_dest = gimple_assign_lhs (dest_stmt);
4998       group_size = 1;
4999     }
5000
5001   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5002      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5003      need to match SCALAR_RESULTS with corresponding statements.  The first
5004      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5005      the first vector stmt, etc.
5006      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5007   if (group_size > new_phis.length ())
5008     {
5009       ratio = group_size / new_phis.length ();
5010       gcc_assert (!(group_size % new_phis.length ()));
5011     }
5012   else
5013     ratio = 1;
5014
5015   for (k = 0; k < group_size; k++)
5016     {
5017       if (k % ratio == 0)
5018         {
5019           epilog_stmt = new_phis[k / ratio];
5020           reduction_phi = reduction_phis[k / ratio];
5021           if (double_reduc)
5022             inner_phi = inner_phis[k / ratio];
5023         }
5024
5025       if (slp_reduc)
5026         {
5027           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5028
5029           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5030           /* SLP statements can't participate in patterns.  */
5031           gcc_assert (!orig_stmt);
5032           scalar_dest = gimple_assign_lhs (current_stmt);
5033         }
5034
5035       phis.create (3);
5036       /* Find the loop-closed-use at the loop exit of the original scalar
5037          result.  (The reduction result is expected to have two immediate uses -
5038          one at the latch block, and one at the loop exit).  */
5039       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5040         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5041             && !is_gimple_debug (USE_STMT (use_p)))
5042           phis.safe_push (USE_STMT (use_p));
5043
5044       /* While we expect to have found an exit_phi because of loop-closed-ssa
5045          form we can end up without one if the scalar cycle is dead.  */
5046
5047       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5048         {
5049           if (outer_loop)
5050             {
5051               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5052               gphi *vect_phi;
5053
5054               /* FORNOW. Currently not supporting the case that an inner-loop
5055                  reduction is not used in the outer-loop (but only outside the
5056                  outer-loop), unless it is double reduction.  */
5057               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5058                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5059                           || double_reduc);
5060
5061               if (double_reduc)
5062                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5063               else
5064                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5065               if (!double_reduc
5066                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5067                       != vect_double_reduction_def)
5068                 continue;
5069
5070               /* Handle double reduction:
5071
5072                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5073                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5074                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5075                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5076
5077                  At that point the regular reduction (stmt2 and stmt3) is
5078                  already vectorized, as well as the exit phi node, stmt4.
5079                  Here we vectorize the phi node of double reduction, stmt1, and
5080                  update all relevant statements.  */
5081
5082               /* Go through all the uses of s2 to find double reduction phi
5083                  node, i.e., stmt1 above.  */
5084               orig_name = PHI_RESULT (exit_phi);
5085               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5086                 {
5087                   stmt_vec_info use_stmt_vinfo;
5088                   stmt_vec_info new_phi_vinfo;
5089                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5090                   basic_block bb = gimple_bb (use_stmt);
5091                   gimple *use;
5092
5093                   /* Check that USE_STMT is really double reduction phi
5094                      node.  */
5095                   if (gimple_code (use_stmt) != GIMPLE_PHI
5096                       || gimple_phi_num_args (use_stmt) != 2
5097                       || bb->loop_father != outer_loop)
5098                     continue;
5099                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5100                   if (!use_stmt_vinfo
5101                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5102                           != vect_double_reduction_def)
5103                     continue;
5104
5105                   /* Create vector phi node for double reduction:
5106                      vs1 = phi <vs0, vs2>
5107                      vs1 was created previously in this function by a call to
5108                        vect_get_vec_def_for_operand and is stored in
5109                        vec_initial_def;
5110                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5111                      vs0 is created here.  */
5112
5113                   /* Create vector phi node.  */
5114                   vect_phi = create_phi_node (vec_initial_def, bb);
5115                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5116                                     loop_vec_info_for_loop (outer_loop));
5117                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5118
5119                   /* Create vs0 - initial def of the double reduction phi.  */
5120                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5121                                              loop_preheader_edge (outer_loop));
5122                   init_def = get_initial_def_for_reduction (stmt,
5123                                                           preheader_arg, NULL);
5124                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5125                                                     vectype, NULL);
5126
5127                   /* Update phi node arguments with vs0 and vs2.  */
5128                   add_phi_arg (vect_phi, vect_phi_init,
5129                                loop_preheader_edge (outer_loop),
5130                                UNKNOWN_LOCATION);
5131                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5132                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5133                   if (dump_enabled_p ())
5134                     {
5135                       dump_printf_loc (MSG_NOTE, vect_location,
5136                                        "created double reduction phi node: ");
5137                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5138                       dump_printf (MSG_NOTE, "\n");
5139                     }
5140
5141                   vect_phi_res = PHI_RESULT (vect_phi);
5142
5143                   /* Replace the use, i.e., set the correct vs1 in the regular
5144                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5145                      loop is redundant.  */
5146                   use = reduction_phi;
5147                   for (j = 0; j < ncopies; j++)
5148                     {
5149                       edge pr_edge = loop_preheader_edge (loop);
5150                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5151                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5152                     }
5153                 }
5154             }
5155         }
5156
5157       phis.release ();
5158       if (nested_in_vect_loop)
5159         {
5160           if (double_reduc)
5161             loop = outer_loop;
5162           else
5163             continue;
5164         }
5165
5166       phis.create (3);
5167       /* Find the loop-closed-use at the loop exit of the original scalar
5168          result.  (The reduction result is expected to have two immediate uses,
5169          one at the latch block, and one at the loop exit).  For double
5170          reductions we are looking for exit phis of the outer loop.  */
5171       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5172         {
5173           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5174             {
5175               if (!is_gimple_debug (USE_STMT (use_p)))
5176                 phis.safe_push (USE_STMT (use_p));
5177             }
5178           else
5179             {
5180               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5181                 {
5182                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5183
5184                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5185                     {
5186                       if (!flow_bb_inside_loop_p (loop,
5187                                              gimple_bb (USE_STMT (phi_use_p)))
5188                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5189                         phis.safe_push (USE_STMT (phi_use_p));
5190                     }
5191                 }
5192             }
5193         }
5194
5195       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5196         {
5197           /* Replace the uses:  */
5198           orig_name = PHI_RESULT (exit_phi);
5199           scalar_result = scalar_results[k];
5200           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5201             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5202               SET_USE (use_p, scalar_result);
5203         }
5204
5205       phis.release ();
5206     }
5207 }
5208
5209
5210 /* Function is_nonwrapping_integer_induction.
5211
5212    Check if STMT (which is part of loop LOOP) both increments and
5213    does not cause overflow.  */
5214
5215 static bool
5216 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5217 {
5218   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5219   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5220   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5221   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5222   widest_int ni, max_loop_value, lhs_max;
5223   bool overflow = false;
5224
5225   /* Make sure the loop is integer based.  */
5226   if (TREE_CODE (base) != INTEGER_CST
5227       || TREE_CODE (step) != INTEGER_CST)
5228     return false;
5229
5230   /* Check that the induction increments.  */
5231   if (tree_int_cst_sgn (step) == -1)
5232     return false;
5233
5234   /* Check that the max size of the loop will not wrap.  */
5235
5236   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5237     return true;
5238
5239   if (! max_stmt_executions (loop, &ni))
5240     return false;
5241
5242   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5243                             &overflow);
5244   if (overflow)
5245     return false;
5246
5247   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5248                             TYPE_SIGN (lhs_type), &overflow);
5249   if (overflow)
5250     return false;
5251
5252   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5253           <= TYPE_PRECISION (lhs_type));
5254 }
5255
5256 /* Function vectorizable_reduction.
5257
5258    Check if STMT performs a reduction operation that can be vectorized.
5259    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5260    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5261    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5262
5263    This function also handles reduction idioms (patterns) that have been
5264    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5265    of this form:
5266      X = pattern_expr (arg0, arg1, ..., X)
5267    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5268    sequence that had been detected and replaced by the pattern-stmt (STMT).
5269
5270    This function also handles reduction of condition expressions, for example:
5271      for (int i = 0; i < N; i++)
5272        if (a[i] < value)
5273          last = a[i];
5274    This is handled by vectorising the loop and creating an additional vector
5275    containing the loop indexes for which "a[i] < value" was true.  In the
5276    function epilogue this is reduced to a single max value and then used to
5277    index into the vector of results.
5278
5279    In some cases of reduction patterns, the type of the reduction variable X is
5280    different than the type of the other arguments of STMT.
5281    In such cases, the vectype that is used when transforming STMT into a vector
5282    stmt is different than the vectype that is used to determine the
5283    vectorization factor, because it consists of a different number of elements
5284    than the actual number of elements that are being operated upon in parallel.
5285
5286    For example, consider an accumulation of shorts into an int accumulator.
5287    On some targets it's possible to vectorize this pattern operating on 8
5288    shorts at a time (hence, the vectype for purposes of determining the
5289    vectorization factor should be V8HI); on the other hand, the vectype that
5290    is used to create the vector form is actually V4SI (the type of the result).
5291
5292    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5293    indicates what is the actual level of parallelism (V8HI in the example), so
5294    that the right vectorization factor would be derived.  This vectype
5295    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5296    be used to create the vectorized stmt.  The right vectype for the vectorized
5297    stmt is obtained from the type of the result X:
5298         get_vectype_for_scalar_type (TREE_TYPE (X))
5299
5300    This means that, contrary to "regular" reductions (or "regular" stmts in
5301    general), the following equation:
5302       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5303    does *NOT* necessarily hold for reduction patterns.  */
5304
5305 bool
5306 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5307                         gimple **vec_stmt, slp_tree slp_node)
5308 {
5309   tree vec_dest;
5310   tree scalar_dest;
5311   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5312   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5313   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5314   tree vectype_in = NULL_TREE;
5315   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5316   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5317   enum tree_code code, orig_code, epilog_reduc_code;
5318   machine_mode vec_mode;
5319   int op_type;
5320   optab optab, reduc_optab;
5321   tree new_temp = NULL_TREE;
5322   gimple *def_stmt;
5323   enum vect_def_type dt;
5324   gphi *new_phi = NULL;
5325   tree scalar_type;
5326   bool is_simple_use;
5327   gimple *orig_stmt;
5328   stmt_vec_info orig_stmt_info;
5329   tree expr = NULL_TREE;
5330   int i;
5331   int ncopies;
5332   int epilog_copies;
5333   stmt_vec_info prev_stmt_info, prev_phi_info;
5334   bool single_defuse_cycle = false;
5335   tree reduc_def = NULL_TREE;
5336   gimple *new_stmt = NULL;
5337   int j;
5338   tree ops[3];
5339   bool nested_cycle = false, found_nested_cycle_def = false;
5340   gimple *reduc_def_stmt = NULL;
5341   bool double_reduc = false, dummy;
5342   basic_block def_bb;
5343   struct loop * def_stmt_loop, *outer_loop = NULL;
5344   tree def_arg;
5345   gimple *def_arg_stmt;
5346   auto_vec<tree> vec_oprnds0;
5347   auto_vec<tree> vec_oprnds1;
5348   auto_vec<tree> vect_defs;
5349   auto_vec<gimple *> phis;
5350   int vec_num;
5351   tree def0, def1, tem, op0, op1 = NULL_TREE;
5352   bool first_p = true;
5353   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5354   gimple *cond_expr_induction_def_stmt = NULL;
5355
5356   /* In case of reduction chain we switch to the first stmt in the chain, but
5357      we don't update STMT_INFO, since only the last stmt is marked as reduction
5358      and has reduction properties.  */
5359   if (GROUP_FIRST_ELEMENT (stmt_info)
5360       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5361     {
5362       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5363       first_p = false;
5364     }
5365
5366   if (nested_in_vect_loop_p (loop, stmt))
5367     {
5368       outer_loop = loop;
5369       loop = loop->inner;
5370       nested_cycle = true;
5371     }
5372
5373   /* 1. Is vectorizable reduction?  */
5374   /* Not supportable if the reduction variable is used in the loop, unless
5375      it's a reduction chain.  */
5376   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5377       && !GROUP_FIRST_ELEMENT (stmt_info))
5378     return false;
5379
5380   /* Reductions that are not used even in an enclosing outer-loop,
5381      are expected to be "live" (used out of the loop).  */
5382   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5383       && !STMT_VINFO_LIVE_P (stmt_info))
5384     return false;
5385
5386   /* Make sure it was already recognized as a reduction computation.  */
5387   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5388       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5389     return false;
5390
5391   /* 2. Has this been recognized as a reduction pattern?
5392
5393      Check if STMT represents a pattern that has been recognized
5394      in earlier analysis stages.  For stmts that represent a pattern,
5395      the STMT_VINFO_RELATED_STMT field records the last stmt in
5396      the original sequence that constitutes the pattern.  */
5397
5398   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5399   if (orig_stmt)
5400     {
5401       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5402       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5403       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5404     }
5405
5406   /* 3. Check the operands of the operation.  The first operands are defined
5407         inside the loop body. The last operand is the reduction variable,
5408         which is defined by the loop-header-phi.  */
5409
5410   gcc_assert (is_gimple_assign (stmt));
5411
5412   /* Flatten RHS.  */
5413   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5414     {
5415     case GIMPLE_SINGLE_RHS:
5416       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5417       if (op_type == ternary_op)
5418         {
5419           tree rhs = gimple_assign_rhs1 (stmt);
5420           ops[0] = TREE_OPERAND (rhs, 0);
5421           ops[1] = TREE_OPERAND (rhs, 1);
5422           ops[2] = TREE_OPERAND (rhs, 2);
5423           code = TREE_CODE (rhs);
5424         }
5425       else
5426         return false;
5427       break;
5428
5429     case GIMPLE_BINARY_RHS:
5430       code = gimple_assign_rhs_code (stmt);
5431       op_type = TREE_CODE_LENGTH (code);
5432       gcc_assert (op_type == binary_op);
5433       ops[0] = gimple_assign_rhs1 (stmt);
5434       ops[1] = gimple_assign_rhs2 (stmt);
5435       break;
5436
5437     case GIMPLE_TERNARY_RHS:
5438       code = gimple_assign_rhs_code (stmt);
5439       op_type = TREE_CODE_LENGTH (code);
5440       gcc_assert (op_type == ternary_op);
5441       ops[0] = gimple_assign_rhs1 (stmt);
5442       ops[1] = gimple_assign_rhs2 (stmt);
5443       ops[2] = gimple_assign_rhs3 (stmt);
5444       break;
5445
5446     case GIMPLE_UNARY_RHS:
5447       return false;
5448
5449     default:
5450       gcc_unreachable ();
5451     }
5452   /* The default is that the reduction variable is the last in statement.  */
5453   int reduc_index = op_type - 1;
5454   if (code == MINUS_EXPR)
5455     reduc_index = 0;
5456
5457   if (code == COND_EXPR && slp_node)
5458     return false;
5459
5460   scalar_dest = gimple_assign_lhs (stmt);
5461   scalar_type = TREE_TYPE (scalar_dest);
5462   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5463       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5464     return false;
5465
5466   /* Do not try to vectorize bit-precision reductions.  */
5467   if ((TYPE_PRECISION (scalar_type)
5468        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5469     return false;
5470
5471   /* All uses but the last are expected to be defined in the loop.
5472      The last use is the reduction variable.  In case of nested cycle this
5473      assumption is not true: we use reduc_index to record the index of the
5474      reduction variable.  */
5475   for (i = 0; i < op_type; i++)
5476     {
5477       if (i == reduc_index)
5478         continue;
5479
5480       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5481       if (i == 0 && code == COND_EXPR)
5482         continue;
5483
5484       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5485                                           &def_stmt, &dt, &tem);
5486       if (!vectype_in)
5487         vectype_in = tem;
5488       gcc_assert (is_simple_use);
5489
5490       if (dt != vect_internal_def
5491           && dt != vect_external_def
5492           && dt != vect_constant_def
5493           && dt != vect_induction_def
5494           && !(dt == vect_nested_cycle && nested_cycle))
5495         return false;
5496
5497       if (dt == vect_nested_cycle)
5498         {
5499           found_nested_cycle_def = true;
5500           reduc_def_stmt = def_stmt;
5501           reduc_index = i;
5502         }
5503
5504       if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5505         cond_expr_induction_def_stmt = def_stmt;
5506     }
5507
5508   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5509                                       &def_stmt, &dt, &tem);
5510   if (!vectype_in)
5511     vectype_in = tem;
5512   gcc_assert (is_simple_use);
5513   if (!found_nested_cycle_def)
5514     reduc_def_stmt = def_stmt;
5515
5516   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5517     return false;
5518
5519   if (!(dt == vect_reduction_def
5520         || dt == vect_nested_cycle
5521         || ((dt == vect_internal_def || dt == vect_external_def
5522              || dt == vect_constant_def || dt == vect_induction_def)
5523             && nested_cycle && found_nested_cycle_def)))
5524     {
5525       /* For pattern recognized stmts, orig_stmt might be a reduction,
5526          but some helper statements for the pattern might not, or
5527          might be COND_EXPRs with reduction uses in the condition.  */
5528       gcc_assert (orig_stmt);
5529       return false;
5530     }
5531
5532   enum vect_reduction_type v_reduc_type;
5533   gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5534                                           !nested_cycle, &dummy, false,
5535                                           &v_reduc_type);
5536
5537   /* If we have a condition reduction, see if we can simplify it further.  */
5538   if (v_reduc_type == COND_REDUCTION
5539       && cond_expr_induction_def_stmt != NULL
5540       && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5541     {
5542       if (dump_enabled_p ())
5543         dump_printf_loc (MSG_NOTE, vect_location,
5544                          "condition expression based on integer induction.\n");
5545       STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5546     }
5547   else
5548    STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5549
5550   if (orig_stmt)
5551     gcc_assert (tmp == orig_stmt
5552                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5553   else
5554     /* We changed STMT to be the first stmt in reduction chain, hence we
5555        check that in this case the first element in the chain is STMT.  */
5556     gcc_assert (stmt == tmp
5557                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5558
5559   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5560     return false;
5561
5562   if (slp_node || PURE_SLP_STMT (stmt_info))
5563     ncopies = 1;
5564   else
5565     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5566                / TYPE_VECTOR_SUBPARTS (vectype_in));
5567
5568   gcc_assert (ncopies >= 1);
5569
5570   vec_mode = TYPE_MODE (vectype_in);
5571
5572   if (code == COND_EXPR)
5573     {
5574       /* Only call during the analysis stage, otherwise we'll lose
5575          STMT_VINFO_TYPE.  */
5576       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5577                                                 ops[reduc_index], 0, NULL))
5578         {
5579           if (dump_enabled_p ())
5580             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5581                              "unsupported condition in reduction\n");
5582           return false;
5583         }
5584     }
5585   else
5586     {
5587       /* 4. Supportable by target?  */
5588
5589       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5590           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5591         {
5592           /* Shifts and rotates are only supported by vectorizable_shifts,
5593              not vectorizable_reduction.  */
5594           if (dump_enabled_p ())
5595             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5596                              "unsupported shift or rotation.\n");
5597           return false;
5598         }
5599
5600       /* 4.1. check support for the operation in the loop  */
5601       optab = optab_for_tree_code (code, vectype_in, optab_default);
5602       if (!optab)
5603         {
5604           if (dump_enabled_p ())
5605             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5606                              "no optab.\n");
5607
5608           return false;
5609         }
5610
5611       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5612         {
5613           if (dump_enabled_p ())
5614             dump_printf (MSG_NOTE, "op not supported by target.\n");
5615
5616           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5617               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5618                   < vect_min_worthwhile_factor (code))
5619             return false;
5620
5621           if (dump_enabled_p ())
5622             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5623         }
5624
5625       /* Worthwhile without SIMD support?  */
5626       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5627           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5628              < vect_min_worthwhile_factor (code))
5629         {
5630           if (dump_enabled_p ())
5631             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5632                              "not worthwhile without SIMD support.\n");
5633
5634           return false;
5635         }
5636     }
5637
5638   /* 4.2. Check support for the epilog operation.
5639
5640           If STMT represents a reduction pattern, then the type of the
5641           reduction variable may be different than the type of the rest
5642           of the arguments.  For example, consider the case of accumulation
5643           of shorts into an int accumulator; The original code:
5644                         S1: int_a = (int) short_a;
5645           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5646
5647           was replaced with:
5648                         STMT: int_acc = widen_sum <short_a, int_acc>
5649
5650           This means that:
5651           1. The tree-code that is used to create the vector operation in the
5652              epilog code (that reduces the partial results) is not the
5653              tree-code of STMT, but is rather the tree-code of the original
5654              stmt from the pattern that STMT is replacing.  I.e, in the example
5655              above we want to use 'widen_sum' in the loop, but 'plus' in the
5656              epilog.
5657           2. The type (mode) we use to check available target support
5658              for the vector operation to be created in the *epilog*, is
5659              determined by the type of the reduction variable (in the example
5660              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5661              However the type (mode) we use to check available target support
5662              for the vector operation to be created *inside the loop*, is
5663              determined by the type of the other arguments to STMT (in the
5664              example we'd check this: optab_handler (widen_sum_optab,
5665              vect_short_mode)).
5666
5667           This is contrary to "regular" reductions, in which the types of all
5668           the arguments are the same as the type of the reduction variable.
5669           For "regular" reductions we can therefore use the same vector type
5670           (and also the same tree-code) when generating the epilog code and
5671           when generating the code inside the loop.  */
5672
5673   if (orig_stmt)
5674     {
5675       /* This is a reduction pattern: get the vectype from the type of the
5676          reduction variable, and get the tree-code from orig_stmt.  */
5677       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5678                   == TREE_CODE_REDUCTION);
5679       orig_code = gimple_assign_rhs_code (orig_stmt);
5680       gcc_assert (vectype_out);
5681       vec_mode = TYPE_MODE (vectype_out);
5682     }
5683   else
5684     {
5685       /* Regular reduction: use the same vectype and tree-code as used for
5686          the vector code inside the loop can be used for the epilog code. */
5687       orig_code = code;
5688
5689       if (code == MINUS_EXPR)
5690         orig_code = PLUS_EXPR;
5691
5692       /* For simple condition reductions, replace with the actual expression
5693          we want to base our reduction around.  */
5694       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5695           == INTEGER_INDUC_COND_REDUCTION)
5696         orig_code = MAX_EXPR;
5697     }
5698
5699   if (nested_cycle)
5700     {
5701       def_bb = gimple_bb (reduc_def_stmt);
5702       def_stmt_loop = def_bb->loop_father;
5703       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5704                                        loop_preheader_edge (def_stmt_loop));
5705       if (TREE_CODE (def_arg) == SSA_NAME
5706           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5707           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5708           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5709           && vinfo_for_stmt (def_arg_stmt)
5710           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5711               == vect_double_reduction_def)
5712         double_reduc = true;
5713     }
5714
5715   epilog_reduc_code = ERROR_MARK;
5716
5717   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5718       || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5719                 == INTEGER_INDUC_COND_REDUCTION)
5720     {
5721       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5722         {
5723           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5724                                          optab_default);
5725           if (!reduc_optab)
5726             {
5727               if (dump_enabled_p ())
5728                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5729                                  "no optab for reduction.\n");
5730
5731               epilog_reduc_code = ERROR_MARK;
5732             }
5733           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5734             {
5735               optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5736               if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5737                 {
5738                   if (dump_enabled_p ())
5739                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5740                                      "reduc op not supported by target.\n");
5741
5742                   epilog_reduc_code = ERROR_MARK;
5743                 }
5744             }
5745
5746           /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5747              generated in the epilog using multiple expressions.  This does not
5748              work for condition reductions.  */
5749           if (epilog_reduc_code == ERROR_MARK
5750               && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5751                         == INTEGER_INDUC_COND_REDUCTION)
5752             {
5753               if (dump_enabled_p ())
5754                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5755                                  "no reduc code for scalar code.\n");
5756               return false;
5757             }
5758         }
5759       else
5760         {
5761           if (!nested_cycle || double_reduc)
5762             {
5763               if (dump_enabled_p ())
5764                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765                                  "no reduc code for scalar code.\n");
5766
5767               return false;
5768             }
5769         }
5770     }
5771   else
5772     {
5773       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5774       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5775       cr_index_vector_type = build_vector_type
5776         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5777
5778       epilog_reduc_code = REDUC_MAX_EXPR;
5779       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5780                                    optab_default);
5781       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5782           == CODE_FOR_nothing)
5783         {
5784           if (dump_enabled_p ())
5785             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786                              "reduc max op not supported by target.\n");
5787           return false;
5788         }
5789     }
5790
5791   if ((double_reduc
5792        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5793        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5794                 == INTEGER_INDUC_COND_REDUCTION)
5795       && ncopies > 1)
5796     {
5797       if (dump_enabled_p ())
5798         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5799                          "multiple types in double reduction or condition "
5800                          "reduction.\n");
5801       return false;
5802     }
5803
5804   /* In case of widenning multiplication by a constant, we update the type
5805      of the constant to be the type of the other operand.  We check that the
5806      constant fits the type in the pattern recognition pass.  */
5807   if (code == DOT_PROD_EXPR
5808       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5809     {
5810       if (TREE_CODE (ops[0]) == INTEGER_CST)
5811         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5812       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5813         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5814       else
5815         {
5816           if (dump_enabled_p ())
5817             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5818                              "invalid types in dot-prod\n");
5819
5820           return false;
5821         }
5822     }
5823
5824   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5825     {
5826       widest_int ni;
5827
5828       if (! max_loop_iterations (loop, &ni))
5829         {
5830           if (dump_enabled_p ())
5831             dump_printf_loc (MSG_NOTE, vect_location,
5832                              "loop count not known, cannot create cond "
5833                              "reduction.\n");
5834           return false;
5835         }
5836       /* Convert backedges to iterations.  */
5837       ni += 1;
5838
5839       /* The additional index will be the same type as the condition.  Check
5840          that the loop can fit into this less one (because we'll use up the
5841          zero slot for when there are no matches).  */
5842       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5843       if (wi::geu_p (ni, wi::to_widest (max_index)))
5844         {
5845           if (dump_enabled_p ())
5846             dump_printf_loc (MSG_NOTE, vect_location,
5847                              "loop size is greater than data size.\n");
5848           return false;
5849         }
5850     }
5851
5852   if (!vec_stmt) /* transformation not required.  */
5853     {
5854       if (first_p
5855           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5856                                          reduc_index))
5857         return false;
5858       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5859       return true;
5860     }
5861
5862   /** Transform.  **/
5863
5864   if (dump_enabled_p ())
5865     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5866
5867   /* FORNOW: Multiple types are not supported for condition.  */
5868   if (code == COND_EXPR)
5869     gcc_assert (ncopies == 1);
5870
5871   /* Create the destination vector  */
5872   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5873
5874   /* In case the vectorization factor (VF) is bigger than the number
5875      of elements that we can fit in a vectype (nunits), we have to generate
5876      more than one vector stmt - i.e - we need to "unroll" the
5877      vector stmt by a factor VF/nunits.  For more details see documentation
5878      in vectorizable_operation.  */
5879
5880   /* If the reduction is used in an outer loop we need to generate
5881      VF intermediate results, like so (e.g. for ncopies=2):
5882         r0 = phi (init, r0)
5883         r1 = phi (init, r1)
5884         r0 = x0 + r0;
5885         r1 = x1 + r1;
5886     (i.e. we generate VF results in 2 registers).
5887     In this case we have a separate def-use cycle for each copy, and therefore
5888     for each copy we get the vector def for the reduction variable from the
5889     respective phi node created for this copy.
5890
5891     Otherwise (the reduction is unused in the loop nest), we can combine
5892     together intermediate results, like so (e.g. for ncopies=2):
5893         r = phi (init, r)
5894         r = x0 + r;
5895         r = x1 + r;
5896    (i.e. we generate VF/2 results in a single register).
5897    In this case for each copy we get the vector def for the reduction variable
5898    from the vectorized reduction operation generated in the previous iteration.
5899   */
5900
5901   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5902     {
5903       single_defuse_cycle = true;
5904       epilog_copies = 1;
5905     }
5906   else
5907     epilog_copies = ncopies;
5908
5909   prev_stmt_info = NULL;
5910   prev_phi_info = NULL;
5911   if (slp_node)
5912     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5913   else
5914     {
5915       vec_num = 1;
5916       vec_oprnds0.create (1);
5917       if (op_type == ternary_op)
5918         vec_oprnds1.create (1);
5919     }
5920
5921   phis.create (vec_num);
5922   vect_defs.create (vec_num);
5923   if (!slp_node)
5924     vect_defs.quick_push (NULL_TREE);
5925
5926   for (j = 0; j < ncopies; j++)
5927     {
5928       if (j == 0 || !single_defuse_cycle)
5929         {
5930           for (i = 0; i < vec_num; i++)
5931             {
5932               /* Create the reduction-phi that defines the reduction
5933                  operand.  */
5934               new_phi = create_phi_node (vec_dest, loop->header);
5935               set_vinfo_for_stmt (new_phi,
5936                                   new_stmt_vec_info (new_phi, loop_vinfo));
5937                if (j == 0 || slp_node)
5938                  phis.quick_push (new_phi);
5939             }
5940         }
5941
5942       if (code == COND_EXPR)
5943         {
5944           gcc_assert (!slp_node);
5945           vectorizable_condition (stmt, gsi, vec_stmt,
5946                                   PHI_RESULT (phis[0]),
5947                                   reduc_index, NULL);
5948           /* Multiple types are not supported for condition.  */
5949           break;
5950         }
5951
5952       /* Handle uses.  */
5953       if (j == 0)
5954         {
5955           op0 = ops[!reduc_index];
5956           if (op_type == ternary_op)
5957             {
5958               if (reduc_index == 0)
5959                 op1 = ops[2];
5960               else
5961                 op1 = ops[1];
5962             }
5963
5964           if (slp_node)
5965             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5966                                slp_node, -1);
5967           else
5968             {
5969               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5970                                                             stmt);
5971               vec_oprnds0.quick_push (loop_vec_def0);
5972               if (op_type == ternary_op)
5973                {
5974                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5975                  vec_oprnds1.quick_push (loop_vec_def1);
5976                }
5977             }
5978         }
5979       else
5980         {
5981           if (!slp_node)
5982             {
5983               enum vect_def_type dt;
5984               gimple *dummy_stmt;
5985
5986               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5987                                   &dummy_stmt, &dt);
5988               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5989                                                               loop_vec_def0);
5990               vec_oprnds0[0] = loop_vec_def0;
5991               if (op_type == ternary_op)
5992                 {
5993                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5994                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5995                                                                 loop_vec_def1);
5996                   vec_oprnds1[0] = loop_vec_def1;
5997                 }
5998             }
5999
6000           if (single_defuse_cycle)
6001             reduc_def = gimple_assign_lhs (new_stmt);
6002
6003           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6004         }
6005
6006       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6007         {
6008           if (slp_node)
6009             reduc_def = PHI_RESULT (phis[i]);
6010           else
6011             {
6012               if (!single_defuse_cycle || j == 0)
6013                 reduc_def = PHI_RESULT (new_phi);
6014             }
6015
6016           def1 = ((op_type == ternary_op)
6017                   ? vec_oprnds1[i] : NULL);
6018           if (op_type == binary_op)
6019             {
6020               if (reduc_index == 0)
6021                 expr = build2 (code, vectype_out, reduc_def, def0);
6022               else
6023                 expr = build2 (code, vectype_out, def0, reduc_def);
6024             }
6025           else
6026             {
6027               if (reduc_index == 0)
6028                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6029               else
6030                 {
6031                   if (reduc_index == 1)
6032                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
6033                   else
6034                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
6035                 }
6036             }
6037
6038           new_stmt = gimple_build_assign (vec_dest, expr);
6039           new_temp = make_ssa_name (vec_dest, new_stmt);
6040           gimple_assign_set_lhs (new_stmt, new_temp);
6041           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6042
6043           if (slp_node)
6044             {
6045               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6046               vect_defs.quick_push (new_temp);
6047             }
6048           else
6049             vect_defs[0] = new_temp;
6050         }
6051
6052       if (slp_node)
6053         continue;
6054
6055       if (j == 0)
6056         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6057       else
6058         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6059
6060       prev_stmt_info = vinfo_for_stmt (new_stmt);
6061       prev_phi_info = vinfo_for_stmt (new_phi);
6062     }
6063
6064   tree indx_before_incr, indx_after_incr, cond_name = NULL;
6065
6066   /* Finalize the reduction-phi (set its arguments) and create the
6067      epilog reduction code.  */
6068   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6069     {
6070       new_temp = gimple_assign_lhs (*vec_stmt);
6071       vect_defs[0] = new_temp;
6072
6073       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6074          which is updated with the current index of the loop for every match of
6075          the original loop's cond_expr (VEC_STMT).  This results in a vector
6076          containing the last time the condition passed for that vector lane.
6077          The first match will be a 1 to allow 0 to be used for non-matching
6078          indexes.  If there are no matches at all then the vector will be all
6079          zeroes.  */
6080       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6081         {
6082           int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6083           int k;
6084
6085           gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6086
6087           /* First we create a simple vector induction variable which starts
6088              with the values {1,2,3,...} (SERIES_VECT) and increments by the
6089              vector size (STEP).  */
6090
6091           /* Create a {1,2,3,...} vector.  */
6092           tree *vtemp = XALLOCAVEC (tree, nunits_out);
6093           for (k = 0; k < nunits_out; ++k)
6094             vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6095           tree series_vect = build_vector (cr_index_vector_type, vtemp);
6096
6097           /* Create a vector of the step value.  */
6098           tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6099           tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6100
6101           /* Create an induction variable.  */
6102           gimple_stmt_iterator incr_gsi;
6103           bool insert_after;
6104           standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6105           create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6106                      insert_after, &indx_before_incr, &indx_after_incr);
6107
6108           /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6109              filled with zeros (VEC_ZERO).  */
6110
6111           /* Create a vector of 0s.  */
6112           tree zero = build_zero_cst (cr_index_scalar_type);
6113           tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6114
6115           /* Create a vector phi node.  */
6116           tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6117           new_phi = create_phi_node (new_phi_tree, loop->header);
6118           set_vinfo_for_stmt (new_phi,
6119                               new_stmt_vec_info (new_phi, loop_vinfo));
6120           add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6121                        UNKNOWN_LOCATION);
6122
6123           /* Now take the condition from the loops original cond_expr
6124              (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6125              every match uses values from the induction variable
6126              (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6127              (NEW_PHI_TREE).
6128              Finally, we update the phi (NEW_PHI_TREE) to take the value of
6129              the new cond_expr (INDEX_COND_EXPR).  */
6130
6131           /* Turn the condition from vec_stmt into an ssa name.  */
6132           gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6133           tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6134           tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6135           gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6136                                                        ccompare);
6137           gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6138           gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6139           update_stmt (*vec_stmt);
6140
6141           /* Create a conditional, where the condition is taken from vec_stmt
6142              (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
6143              and else is the phi (NEW_PHI_TREE).  */
6144           tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6145                                          ccompare_name, indx_before_incr,
6146                                          new_phi_tree);
6147           cond_name = make_ssa_name (cr_index_vector_type);
6148           gimple *index_condition = gimple_build_assign (cond_name,
6149                                                          index_cond_expr);
6150           gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6151           stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6152                                                             loop_vinfo);
6153           STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6154           set_vinfo_for_stmt (index_condition, index_vec_info);
6155
6156           /* Update the phi with the vec cond.  */
6157           add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6158                        UNKNOWN_LOCATION);
6159         }
6160     }
6161
6162   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6163                                     epilog_reduc_code, phis, reduc_index,
6164                                     double_reduc, slp_node, cond_name);
6165
6166   return true;
6167 }
6168
6169 /* Function vect_min_worthwhile_factor.
6170
6171    For a loop where we could vectorize the operation indicated by CODE,
6172    return the minimum vectorization factor that makes it worthwhile
6173    to use generic vectors.  */
6174 int
6175 vect_min_worthwhile_factor (enum tree_code code)
6176 {
6177   switch (code)
6178     {
6179     case PLUS_EXPR:
6180     case MINUS_EXPR:
6181     case NEGATE_EXPR:
6182       return 4;
6183
6184     case BIT_AND_EXPR:
6185     case BIT_IOR_EXPR:
6186     case BIT_XOR_EXPR:
6187     case BIT_NOT_EXPR:
6188       return 2;
6189
6190     default:
6191       return INT_MAX;
6192     }
6193 }
6194
6195
6196 /* Function vectorizable_induction
6197
6198    Check if PHI performs an induction computation that can be vectorized.
6199    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6200    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6201    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6202
6203 bool
6204 vectorizable_induction (gimple *phi,
6205                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6206                         gimple **vec_stmt)
6207 {
6208   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6209   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6210   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6211   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6212   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6213   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6214   tree vec_def;
6215
6216   gcc_assert (ncopies >= 1);
6217   /* FORNOW. These restrictions should be relaxed.  */
6218   if (nested_in_vect_loop_p (loop, phi))
6219     {
6220       imm_use_iterator imm_iter;
6221       use_operand_p use_p;
6222       gimple *exit_phi;
6223       edge latch_e;
6224       tree loop_arg;
6225
6226       if (ncopies > 1)
6227         {
6228           if (dump_enabled_p ())
6229             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230                              "multiple types in nested loop.\n");
6231           return false;
6232         }
6233
6234       exit_phi = NULL;
6235       latch_e = loop_latch_edge (loop->inner);
6236       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6237       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6238         {
6239           gimple *use_stmt = USE_STMT (use_p);
6240           if (is_gimple_debug (use_stmt))
6241             continue;
6242
6243           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6244             {
6245               exit_phi = use_stmt;
6246               break;
6247             }
6248         }
6249       if (exit_phi)
6250         {
6251           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6252           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6253                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6254             {
6255               if (dump_enabled_p ())
6256                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6257                                  "inner-loop induction only used outside "
6258                                  "of the outer vectorized loop.\n");
6259               return false;
6260             }
6261         }
6262     }
6263
6264   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6265     return false;
6266
6267   /* FORNOW: SLP not supported.  */
6268   if (STMT_SLP_TYPE (stmt_info))
6269     return false;
6270
6271   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6272
6273   if (gimple_code (phi) != GIMPLE_PHI)
6274     return false;
6275
6276   if (!vec_stmt) /* transformation not required.  */
6277     {
6278       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6279       if (dump_enabled_p ())
6280         dump_printf_loc (MSG_NOTE, vect_location,
6281                          "=== vectorizable_induction ===\n");
6282       vect_model_induction_cost (stmt_info, ncopies);
6283       return true;
6284     }
6285
6286   /** Transform.  **/
6287
6288   if (dump_enabled_p ())
6289     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6290
6291   vec_def = get_initial_def_for_induction (phi);
6292   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6293   return true;
6294 }
6295
6296 /* Function vectorizable_live_operation.
6297
6298    STMT computes a value that is used outside the loop.  Check if
6299    it can be supported.  */
6300
6301 bool
6302 vectorizable_live_operation (gimple *stmt,
6303                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6304                              gimple **vec_stmt)
6305 {
6306   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6307   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6308   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6309   tree op;
6310   gimple *def_stmt;
6311   ssa_op_iter iter;
6312
6313   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6314
6315   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6316     return false;
6317
6318   if (!is_gimple_assign (stmt))
6319     {
6320       if (gimple_call_internal_p (stmt)
6321           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6322           && gimple_call_lhs (stmt)
6323           && loop->simduid
6324           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6325           && loop->simduid
6326              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6327         {
6328           edge e = single_exit (loop);
6329           basic_block merge_bb = e->dest;
6330           imm_use_iterator imm_iter;
6331           use_operand_p use_p;
6332           tree lhs = gimple_call_lhs (stmt);
6333
6334           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6335             {
6336               gimple *use_stmt = USE_STMT (use_p);
6337               if (gimple_code (use_stmt) == GIMPLE_PHI
6338                   && gimple_bb (use_stmt) == merge_bb)
6339                 {
6340                   if (vec_stmt)
6341                     {
6342                       tree vfm1
6343                         = build_int_cst (unsigned_type_node,
6344                                          loop_vinfo->vectorization_factor - 1);
6345                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6346                     }
6347                   return true;
6348                 }
6349             }
6350         }
6351
6352       return false;
6353     }
6354
6355   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6356     return false;
6357
6358   /* FORNOW. CHECKME. */
6359   if (nested_in_vect_loop_p (loop, stmt))
6360     return false;
6361
6362   /* FORNOW: support only if all uses are invariant.  This means
6363      that the scalar operations can remain in place, unvectorized.
6364      The original last scalar value that they compute will be used.  */
6365   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6366     {
6367       enum vect_def_type dt = vect_uninitialized_def;
6368
6369       if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6370         {
6371           if (dump_enabled_p ())
6372             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6373                              "use not simple.\n");
6374           return false;
6375         }
6376
6377       if (dt != vect_external_def && dt != vect_constant_def)
6378         return false;
6379     }
6380
6381   /* No transformation is required for the cases we currently support.  */
6382   return true;
6383 }
6384
6385 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6386
6387 static void
6388 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6389 {
6390   ssa_op_iter op_iter;
6391   imm_use_iterator imm_iter;
6392   def_operand_p def_p;
6393   gimple *ustmt;
6394
6395   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6396     {
6397       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6398         {
6399           basic_block bb;
6400
6401           if (!is_gimple_debug (ustmt))
6402             continue;
6403
6404           bb = gimple_bb (ustmt);
6405
6406           if (!flow_bb_inside_loop_p (loop, bb))
6407             {
6408               if (gimple_debug_bind_p (ustmt))
6409                 {
6410                   if (dump_enabled_p ())
6411                     dump_printf_loc (MSG_NOTE, vect_location,
6412                                      "killing debug use\n");
6413
6414                   gimple_debug_bind_reset_value (ustmt);
6415                   update_stmt (ustmt);
6416                 }
6417               else
6418                 gcc_unreachable ();
6419             }
6420         }
6421     }
6422 }
6423
6424
6425 /* This function builds ni_name = number of iterations.  Statements
6426    are emitted on the loop preheader edge.  */
6427
6428 static tree
6429 vect_build_loop_niters (loop_vec_info loop_vinfo)
6430 {
6431   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6432   if (TREE_CODE (ni) == INTEGER_CST)
6433     return ni;
6434   else
6435     {
6436       tree ni_name, var;
6437       gimple_seq stmts = NULL;
6438       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6439
6440       var = create_tmp_var (TREE_TYPE (ni), "niters");
6441       ni_name = force_gimple_operand (ni, &stmts, false, var);
6442       if (stmts)
6443         gsi_insert_seq_on_edge_immediate (pe, stmts);
6444
6445       return ni_name;
6446     }
6447 }
6448
6449
6450 /* This function generates the following statements:
6451
6452    ni_name = number of iterations loop executes
6453    ratio = ni_name / vf
6454    ratio_mult_vf_name = ratio * vf
6455
6456    and places them on the loop preheader edge.  */
6457
6458 static void
6459 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6460                                  tree ni_name,
6461                                  tree *ratio_mult_vf_name_ptr,
6462                                  tree *ratio_name_ptr)
6463 {
6464   tree ni_minus_gap_name;
6465   tree var;
6466   tree ratio_name;
6467   tree ratio_mult_vf_name;
6468   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6469   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6470   tree log_vf;
6471
6472   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6473
6474   /* If epilogue loop is required because of data accesses with gaps, we
6475      subtract one iteration from the total number of iterations here for
6476      correct calculation of RATIO.  */
6477   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6478     {
6479       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6480                                        ni_name,
6481                                        build_one_cst (TREE_TYPE (ni_name)));
6482       if (!is_gimple_val (ni_minus_gap_name))
6483         {
6484           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6485           gimple *stmts = NULL;
6486           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6487                                                     true, var);
6488           gsi_insert_seq_on_edge_immediate (pe, stmts);
6489         }
6490     }
6491   else
6492     ni_minus_gap_name = ni_name;
6493
6494   /* Create: ratio = ni >> log2(vf) */
6495   /* ???  As we have ni == number of latch executions + 1, ni could
6496      have overflown to zero.  So avoid computing ratio based on ni
6497      but compute it using the fact that we know ratio will be at least
6498      one, thus via (ni - vf) >> log2(vf) + 1.  */
6499   ratio_name
6500     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6501                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6502                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6503                                              ni_minus_gap_name,
6504                                              build_int_cst
6505                                                (TREE_TYPE (ni_name), vf)),
6506                                 log_vf),
6507                    build_int_cst (TREE_TYPE (ni_name), 1));
6508   if (!is_gimple_val (ratio_name))
6509     {
6510       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6511       gimple *stmts = NULL;
6512       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6513       gsi_insert_seq_on_edge_immediate (pe, stmts);
6514     }
6515   *ratio_name_ptr = ratio_name;
6516
6517   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6518
6519   if (ratio_mult_vf_name_ptr)
6520     {
6521       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6522                                         ratio_name, log_vf);
6523       if (!is_gimple_val (ratio_mult_vf_name))
6524         {
6525           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6526           gimple *stmts = NULL;
6527           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6528                                                      true, var);
6529           gsi_insert_seq_on_edge_immediate (pe, stmts);
6530         }
6531       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6532     }
6533
6534   return;
6535 }
6536
6537
6538 /* Function vect_transform_loop.
6539
6540    The analysis phase has determined that the loop is vectorizable.
6541    Vectorize the loop - created vectorized stmts to replace the scalar
6542    stmts in the loop, and update the loop exit condition.  */
6543
6544 void
6545 vect_transform_loop (loop_vec_info loop_vinfo)
6546 {
6547   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6548   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6549   int nbbs = loop->num_nodes;
6550   int i;
6551   tree ratio = NULL;
6552   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6553   bool grouped_store;
6554   bool slp_scheduled = false;
6555   gimple *stmt, *pattern_stmt;
6556   gimple_seq pattern_def_seq = NULL;
6557   gimple_stmt_iterator pattern_def_si = gsi_none ();
6558   bool transform_pattern_stmt = false;
6559   bool check_profitability = false;
6560   int th;
6561   /* Record number of iterations before we started tampering with the profile. */
6562   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6563
6564   if (dump_enabled_p ())
6565     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6566
6567   /* If profile is inprecise, we have chance to fix it up.  */
6568   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6569     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6570
6571   /* Use the more conservative vectorization threshold.  If the number
6572      of iterations is constant assume the cost check has been performed
6573      by our caller.  If the threshold makes all loops profitable that
6574      run at least the vectorization factor number of times checking
6575      is pointless, too.  */
6576   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6577   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6578       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6579     {
6580       if (dump_enabled_p ())
6581         dump_printf_loc (MSG_NOTE, vect_location,
6582                          "Profitability threshold is %d loop iterations.\n",
6583                          th);
6584       check_profitability = true;
6585     }
6586
6587   /* Version the loop first, if required, so the profitability check
6588      comes first.  */
6589
6590   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6591       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6592     {
6593       vect_loop_versioning (loop_vinfo, th, check_profitability);
6594       check_profitability = false;
6595     }
6596
6597   tree ni_name = vect_build_loop_niters (loop_vinfo);
6598   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6599
6600   /* Peel the loop if there are data refs with unknown alignment.
6601      Only one data ref with unknown store is allowed.  */
6602
6603   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6604     {
6605       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6606                                      th, check_profitability);
6607       check_profitability = false;
6608       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6609          be re-computed.  */
6610       ni_name = NULL_TREE;
6611     }
6612
6613   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6614      compile time constant), or it is a constant that doesn't divide by the
6615      vectorization factor, then an epilog loop needs to be created.
6616      We therefore duplicate the loop: the original loop will be vectorized,
6617      and will compute the first (n/VF) iterations.  The second copy of the loop
6618      will remain scalar and will compute the remaining (n%VF) iterations.
6619      (VF is the vectorization factor).  */
6620
6621   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6622       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6623     {
6624       tree ratio_mult_vf;
6625       if (!ni_name)
6626         ni_name = vect_build_loop_niters (loop_vinfo);
6627       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6628                                        &ratio);
6629       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6630                                       th, check_profitability);
6631     }
6632   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6633     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6634                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6635   else
6636     {
6637       if (!ni_name)
6638         ni_name = vect_build_loop_niters (loop_vinfo);
6639       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6640     }
6641
6642   /* 1) Make sure the loop header has exactly two entries
6643      2) Make sure we have a preheader basic block.  */
6644
6645   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6646
6647   split_edge (loop_preheader_edge (loop));
6648
6649   /* FORNOW: the vectorizer supports only loops which body consist
6650      of one basic block (header + empty latch). When the vectorizer will
6651      support more involved loop forms, the order by which the BBs are
6652      traversed need to be reconsidered.  */
6653
6654   for (i = 0; i < nbbs; i++)
6655     {
6656       basic_block bb = bbs[i];
6657       stmt_vec_info stmt_info;
6658
6659       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6660            gsi_next (&si))
6661         {
6662           gphi *phi = si.phi ();
6663           if (dump_enabled_p ())
6664             {
6665               dump_printf_loc (MSG_NOTE, vect_location,
6666                                "------>vectorizing phi: ");
6667               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6668               dump_printf (MSG_NOTE, "\n");
6669             }
6670           stmt_info = vinfo_for_stmt (phi);
6671           if (!stmt_info)
6672             continue;
6673
6674           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6675             vect_loop_kill_debug_uses (loop, phi);
6676
6677           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6678               && !STMT_VINFO_LIVE_P (stmt_info))
6679             continue;
6680
6681           if (STMT_VINFO_VECTYPE (stmt_info)
6682               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6683                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6684               && dump_enabled_p ())
6685             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6686
6687           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6688             {
6689               if (dump_enabled_p ())
6690                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6691               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6692             }
6693         }
6694
6695       pattern_stmt = NULL;
6696       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6697            !gsi_end_p (si) || transform_pattern_stmt;)
6698         {
6699           bool is_store;
6700
6701           if (transform_pattern_stmt)
6702             stmt = pattern_stmt;
6703           else
6704             {
6705               stmt = gsi_stmt (si);
6706               /* During vectorization remove existing clobber stmts.  */
6707               if (gimple_clobber_p (stmt))
6708                 {
6709                   unlink_stmt_vdef (stmt);
6710                   gsi_remove (&si, true);
6711                   release_defs (stmt);
6712                   continue;
6713                 }
6714             }
6715
6716           if (dump_enabled_p ())
6717             {
6718               dump_printf_loc (MSG_NOTE, vect_location,
6719                                "------>vectorizing statement: ");
6720               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6721               dump_printf (MSG_NOTE, "\n");
6722             }
6723
6724           stmt_info = vinfo_for_stmt (stmt);
6725
6726           /* vector stmts created in the outer-loop during vectorization of
6727              stmts in an inner-loop may not have a stmt_info, and do not
6728              need to be vectorized.  */
6729           if (!stmt_info)
6730             {
6731               gsi_next (&si);
6732               continue;
6733             }
6734
6735           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6736             vect_loop_kill_debug_uses (loop, stmt);
6737
6738           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6739               && !STMT_VINFO_LIVE_P (stmt_info))
6740             {
6741               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6742                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6743                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6744                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6745                 {
6746                   stmt = pattern_stmt;
6747                   stmt_info = vinfo_for_stmt (stmt);
6748                 }
6749               else
6750                 {
6751                   gsi_next (&si);
6752                   continue;
6753                 }
6754             }
6755           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6756                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6757                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6758                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6759             transform_pattern_stmt = true;
6760
6761           /* If pattern statement has def stmts, vectorize them too.  */
6762           if (is_pattern_stmt_p (stmt_info))
6763             {
6764               if (pattern_def_seq == NULL)
6765                 {
6766                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6767                   pattern_def_si = gsi_start (pattern_def_seq);
6768                 }
6769               else if (!gsi_end_p (pattern_def_si))
6770                 gsi_next (&pattern_def_si);
6771               if (pattern_def_seq != NULL)
6772                 {
6773                   gimple *pattern_def_stmt = NULL;
6774                   stmt_vec_info pattern_def_stmt_info = NULL;
6775
6776                   while (!gsi_end_p (pattern_def_si))
6777                     {
6778                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6779                       pattern_def_stmt_info
6780                         = vinfo_for_stmt (pattern_def_stmt);
6781                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6782                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6783                         break;
6784                       gsi_next (&pattern_def_si);
6785                     }
6786
6787                   if (!gsi_end_p (pattern_def_si))
6788                     {
6789                       if (dump_enabled_p ())
6790                         {
6791                           dump_printf_loc (MSG_NOTE, vect_location,
6792                                            "==> vectorizing pattern def "
6793                                            "stmt: ");
6794                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6795                                             pattern_def_stmt, 0);
6796                           dump_printf (MSG_NOTE, "\n");
6797                         }
6798
6799                       stmt = pattern_def_stmt;
6800                       stmt_info = pattern_def_stmt_info;
6801                     }
6802                   else
6803                     {
6804                       pattern_def_si = gsi_none ();
6805                       transform_pattern_stmt = false;
6806                     }
6807                 }
6808               else
6809                 transform_pattern_stmt = false;
6810             }
6811
6812           if (STMT_VINFO_VECTYPE (stmt_info))
6813             {
6814               unsigned int nunits
6815                 = (unsigned int)
6816                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6817               if (!STMT_SLP_TYPE (stmt_info)
6818                   && nunits != (unsigned int) vectorization_factor
6819                   && dump_enabled_p ())
6820                   /* For SLP VF is set according to unrolling factor, and not
6821                      to vector size, hence for SLP this print is not valid.  */
6822                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6823             }
6824
6825           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6826              reached.  */
6827           if (STMT_SLP_TYPE (stmt_info))
6828             {
6829               if (!slp_scheduled)
6830                 {
6831                   slp_scheduled = true;
6832
6833                   if (dump_enabled_p ())
6834                     dump_printf_loc (MSG_NOTE, vect_location,
6835                                      "=== scheduling SLP instances ===\n");
6836
6837                   vect_schedule_slp (loop_vinfo);
6838                 }
6839
6840               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6841               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6842                 {
6843                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6844                     {
6845                       pattern_def_seq = NULL;
6846                       gsi_next (&si);
6847                     }
6848                   continue;
6849                 }
6850             }
6851
6852           /* -------- vectorize statement ------------ */
6853           if (dump_enabled_p ())
6854             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6855
6856           grouped_store = false;
6857           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6858           if (is_store)
6859             {
6860               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6861                 {
6862                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6863                      interleaving chain was completed - free all the stores in
6864                      the chain.  */
6865                   gsi_next (&si);
6866                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6867                 }
6868               else
6869                 {
6870                   /* Free the attached stmt_vec_info and remove the stmt.  */
6871                   gimple *store = gsi_stmt (si);
6872                   free_stmt_vec_info (store);
6873                   unlink_stmt_vdef (store);
6874                   gsi_remove (&si, true);
6875                   release_defs (store);
6876                 }
6877
6878               /* Stores can only appear at the end of pattern statements.  */
6879               gcc_assert (!transform_pattern_stmt);
6880               pattern_def_seq = NULL;
6881             }
6882           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6883             {
6884               pattern_def_seq = NULL;
6885               gsi_next (&si);
6886             }
6887         }                       /* stmts in BB */
6888     }                           /* BBs in loop */
6889
6890   slpeel_make_loop_iterate_ntimes (loop, ratio);
6891
6892   /* Reduce loop iterations by the vectorization factor.  */
6893   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6894                       expected_iterations / vectorization_factor);
6895   loop->nb_iterations_upper_bound
6896     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6897   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6898       && loop->nb_iterations_upper_bound != 0)
6899     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6900   if (loop->any_estimate)
6901     {
6902       loop->nb_iterations_estimate
6903         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6904        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6905            && loop->nb_iterations_estimate != 0)
6906          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6907     }
6908
6909   if (dump_enabled_p ())
6910     {
6911       dump_printf_loc (MSG_NOTE, vect_location,
6912                        "LOOP VECTORIZED\n");
6913       if (loop->inner)
6914         dump_printf_loc (MSG_NOTE, vect_location,
6915                          "OUTER LOOP VECTORIZED\n");
6916       dump_printf (MSG_NOTE, "\n");
6917     }
6918 }