gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   vec_info *vinfo = stmt_info->vinfo;
 217   if (dump_enabled_p ())
 218     {
 219       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 220       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 221     }
 222   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 223     return false;
 224
 225   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 226       && STMT_VINFO_RELATED_STMT (stmt_info))
 227     {
 228       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 229       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 230
 231       /* If a pattern statement has def stmts, analyze them too.  */
 232       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 233            !gsi_end_p (si); gsi_next (&si))
 234         {
 235           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 236           if (dump_enabled_p ())
 237             {
 238               dump_printf_loc (MSG_NOTE, vect_location,
 239                                "==> examining pattern def stmt: ");
 240               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 241                                 def_stmt_info->stmt, 0);
 242             }
 243           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 244                                              vf, mask_producers))
 245             return false;
 246         }
 247
 248       if (dump_enabled_p ())
 249         {
 250           dump_printf_loc (MSG_NOTE, vect_location,
 251                            "==> examining pattern statement: ");
 252           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 253         }
 254       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 255         return false;
 256     }
 257
 258   return true;
 259 }
 260
 261 /* Function vect_determine_vectorization_factor
 262
 263    Determine the vectorization factor (VF).  VF is the number of data elements
 264    that are operated upon in parallel in a single iteration of the vectorized
 265    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 266    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 267    elements can fit in a single vector register.
 268
 269    We currently support vectorization of loops in which all types operated upon
 270    are of the same size.  Therefore this function currently sets VF according to
 271    the size of the types operated upon, and fails if there are multiple sizes
 272    in the loop.
 273
 274    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 275    original loop:
 276         for (i=0; i<N; i++){
 277           a[i] = b[i] + c[i];
 278         }
 279
 280    vectorized loop:
 281         for (i=0; i<N; i+=VF){
 282           a[i:VF] = b[i:VF] + c[i:VF];
 283         }
 284 */
 285
 286 static bool
 287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 288 {
 289   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 290   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 291   unsigned nbbs = loop->num_nodes;
 292   poly_uint64 vectorization_factor = 1;
 293   tree scalar_type = NULL_TREE;
 294   gphi *phi;
 295   tree vectype;
 296   stmt_vec_info stmt_info;
 297   unsigned i;
 298   auto_vec<stmt_vec_info> mask_producers;
 299
 300   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 301
 302   for (i = 0; i < nbbs; i++)
 303     {
 304       basic_block bb = bbs[i];
 305
 306       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 307            gsi_next (&si))
 308         {
 309           phi = si.phi ();
 310           stmt_info = loop_vinfo->lookup_stmt (phi);
 311           if (dump_enabled_p ())
 312             {
 313               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 314               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 315             }
 316
 317           gcc_assert (stmt_info);
 318
 319           if (STMT_VINFO_RELEVANT_P (stmt_info)
 320               || STMT_VINFO_LIVE_P (stmt_info))
 321             {
 322               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 323               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 324
 325               if (dump_enabled_p ())
 326                 {
 327                   dump_printf_loc (MSG_NOTE, vect_location,
 328                                    "get vectype for scalar type:  ");
 329                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 330                   dump_printf (MSG_NOTE, "\n");
 331                 }
 332
 333               vectype = get_vectype_for_scalar_type (scalar_type);
 334               if (!vectype)
 335                 {
 336                   if (dump_enabled_p ())
 337                     {
 338                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 339                                        "not vectorized: unsupported "
 340                                        "data-type ");
 341                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 342                                          scalar_type);
 343                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 344                     }
 345                   return false;
 346                 }
 347               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 348
 349               if (dump_enabled_p ())
 350                 {
 351                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 352                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 353                   dump_printf (MSG_NOTE, "\n");
 354                 }
 355
 356               if (dump_enabled_p ())
 357                 {
 358                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 359                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 360                   dump_printf (MSG_NOTE, "\n");
 361                 }
 362
 363               vect_update_max_nunits (&vectorization_factor, vectype);
 364             }
 365         }
 366
 367       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 368            gsi_next (&si))
 369         {
 370           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 371           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 372                                            &mask_producers))
 373             return false;
 374         }
 375     }
 376
 377   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 378   if (dump_enabled_p ())
 379     {
 380       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 381       dump_dec (MSG_NOTE, vectorization_factor);
 382       dump_printf (MSG_NOTE, "\n");
 383     }
 384
 385   if (known_le (vectorization_factor, 1U))
 386     {
 387       if (dump_enabled_p ())
 388         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 389                          "not vectorized: unsupported data-type\n");
 390       return false;
 391     }
 392   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 393
 394   for (i = 0; i < mask_producers.length (); i++)
 395     {
 396       stmt_info = mask_producers[i];
 397       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 398       if (!mask_type)
 399         return false;
 400       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 401     }
 402
 403   return true;
 404 }
 405
 406
 407 /* Function vect_is_simple_iv_evolution.
 408
 409    FORNOW: A simple evolution of an induction variables in the loop is
 410    considered a polynomial evolution.  */
 411
 412 static bool
 413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 414                              tree * step)
 415 {
 416   tree init_expr;
 417   tree step_expr;
 418   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 419   basic_block bb;
 420
 421   /* When there is no evolution in this loop, the evolution function
 422      is not "simple".  */
 423   if (evolution_part == NULL_TREE)
 424     return false;
 425
 426   /* When the evolution is a polynomial of degree >= 2
 427      the evolution function is not "simple".  */
 428   if (tree_is_chrec (evolution_part))
 429     return false;
 430
 431   step_expr = evolution_part;
 432   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 433
 434   if (dump_enabled_p ())
 435     {
 436       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 437       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 438       dump_printf (MSG_NOTE, ",  init: ");
 439       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 440       dump_printf (MSG_NOTE, "\n");
 441     }
 442
 443   *init = init_expr;
 444   *step = step_expr;
 445
 446   if (TREE_CODE (step_expr) != INTEGER_CST
 447       && (TREE_CODE (step_expr) != SSA_NAME
 448           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 449               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 450           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 451               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 452                   || !flag_associative_math)))
 453       && (TREE_CODE (step_expr) != REAL_CST
 454           || !flag_associative_math))
 455     {
 456       if (dump_enabled_p ())
 457         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 458                          "step unknown.\n");
 459       return false;
 460     }
 461
 462   return true;
 463 }
 464
 465 /* Function vect_analyze_scalar_cycles_1.
 466
 467    Examine the cross iteration def-use cycles of scalar variables
 468    in LOOP.  LOOP_VINFO represents the loop that is now being
 469    considered for vectorization (can be LOOP, or an outer-loop
 470    enclosing LOOP).  */
 471
 472 static void
 473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 474 {
 475   basic_block bb = loop->header;
 476   tree init, step;
 477   auto_vec<gimple *, 64> worklist;
 478   gphi_iterator gsi;
 479   bool double_reduc;
 480
 481   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 482
 483   /* First - identify all inductions.  Reduction detection assumes that all the
 484      inductions have been identified, therefore, this order must not be
 485      changed.  */
 486   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 487     {
 488       gphi *phi = gsi.phi ();
 489       tree access_fn = NULL;
 490       tree def = PHI_RESULT (phi);
 491       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 492
 493       if (dump_enabled_p ())
 494         {
 495           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 496           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 497         }
 498
 499       /* Skip virtual phi's.  The data dependences that are associated with
 500          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 501       if (virtual_operand_p (def))
 502         continue;
 503
 504       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 505
 506       /* Analyze the evolution function.  */
 507       access_fn = analyze_scalar_evolution (loop, def);
 508       if (access_fn)
 509         {
 510           STRIP_NOPS (access_fn);
 511           if (dump_enabled_p ())
 512             {
 513               dump_printf_loc (MSG_NOTE, vect_location,
 514                                "Access function of PHI: ");
 515               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 516               dump_printf (MSG_NOTE, "\n");
 517             }
 518           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 519             = initial_condition_in_loop_num (access_fn, loop->num);
 520           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 521             = evolution_part_in_loop_num (access_fn, loop->num);
 522         }
 523
 524       if (!access_fn
 525           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 526           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 527               && TREE_CODE (step) != INTEGER_CST))
 528         {
 529           worklist.safe_push (phi);
 530           continue;
 531         }
 532
 533       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 534                   != NULL_TREE);
 535       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 536
 537       if (dump_enabled_p ())
 538         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 539       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 540     }
 541
 542
 543   /* Second - identify all reductions and nested cycles.  */
 544   while (worklist.length () > 0)
 545     {
 546       gimple *phi = worklist.pop ();
 547       tree def = PHI_RESULT (phi);
 548       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 549
 550       if (dump_enabled_p ())
 551         {
 552           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 553           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 554         }
 555
 556       gcc_assert (!virtual_operand_p (def)
 557                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 558
 559       stmt_vec_info reduc_stmt_info
 560         = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
 561                                        &double_reduc, false);
 562       if (reduc_stmt_info)
 563         {
 564           if (double_reduc)
 565             {
 566               if (dump_enabled_p ())
 567                 dump_printf_loc (MSG_NOTE, vect_location,
 568                                  "Detected double reduction.\n");
 569
 570               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 571               STMT_VINFO_DEF_TYPE (reduc_stmt_info)
 572                 = vect_double_reduction_def;
 573             }
 574           else
 575             {
 576               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 577                 {
 578                   if (dump_enabled_p ())
 579                     dump_printf_loc (MSG_NOTE, vect_location,
 580                                      "Detected vectorizable nested cycle.\n");
 581
 582                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 583                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
 584                 }
 585               else
 586                 {
 587                   if (dump_enabled_p ())
 588                     dump_printf_loc (MSG_NOTE, vect_location,
 589                                      "Detected reduction.\n");
 590
 591                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 592                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 593                   /* Store the reduction cycles for possible vectorization in
 594                      loop-aware SLP if it was not detected as reduction
 595                      chain.  */
 596                   if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
 597                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 598                       (reduc_stmt_info);
 599                 }
 600             }
 601         }
 602       else
 603         if (dump_enabled_p ())
 604           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 605                            "Unknown def-use cycle pattern.\n");
 606     }
 607 }
 608
 609
 610 /* Function vect_analyze_scalar_cycles.
 611
 612    Examine the cross iteration def-use cycles of scalar variables, by
 613    analyzing the loop-header PHIs of scalar variables.  Classify each
 614    cycle as one of the following: invariant, induction, reduction, unknown.
 615    We do that for the loop represented by LOOP_VINFO, and also to its
 616    inner-loop, if exists.
 617    Examples for scalar cycles:
 618
 619    Example1: reduction:
 620
 621               loop1:
 622               for (i=0; i<N; i++)
 623                  sum += a[i];
 624
 625    Example2: induction:
 626
 627               loop2:
 628               for (i=0; i<N; i++)
 629                  a[i] = i;  */
 630
 631 static void
 632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 633 {
 634   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 635
 636   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 637
 638   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 639      Reductions in such inner-loop therefore have different properties than
 640      the reductions in the nest that gets vectorized:
 641      1. When vectorized, they are executed in the same order as in the original
 642         scalar loop, so we can't change the order of computation when
 643         vectorizing them.
 644      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 645         current checks are too strict.  */
 646
 647   if (loop->inner)
 648     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 649 }
 650
 651 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 652
 653 static void
 654 vect_fixup_reduc_chain (gimple *stmt)
 655 {
 656   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 657   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 658   stmt_vec_info stmtp;
 659   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 660               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 661   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 662   do
 663     {
 664       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 665       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 666       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 667       if (stmt_info)
 668         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 669           = STMT_VINFO_RELATED_STMT (stmt_info);
 670     }
 671   while (stmt_info);
 672   STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
 673 }
 674
 675 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 676
 677 static void
 678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 679 {
 680   stmt_vec_info first;
 681   unsigned i;
 682
 683   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 684     if (STMT_VINFO_IN_PATTERN_P (first))
 685       {
 686         stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 687         while (next)
 688           {
 689             if (! STMT_VINFO_IN_PATTERN_P (next))
 690               break;
 691             next = REDUC_GROUP_NEXT_ELEMENT (next);
 692           }
 693         /* If not all stmt in the chain are patterns try to handle
 694            the chain without patterns.  */
 695         if (! next)
 696           {
 697             vect_fixup_reduc_chain (first);
 698             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 699               = STMT_VINFO_RELATED_STMT (first);
 700           }
 701       }
 702 }
 703
 704 /* Function vect_get_loop_niters.
 705
 706    Determine how many iterations the loop is executed and place it
 707    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 708    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 709    niter information holds in ASSUMPTIONS.
 710
 711    Return the loop exit condition.  */
 712
 713
 714 static gcond *
 715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 716                       tree *number_of_iterations, tree *number_of_iterationsm1)
 717 {
 718   edge exit = single_exit (loop);
 719   struct tree_niter_desc niter_desc;
 720   tree niter_assumptions, niter, may_be_zero;
 721   gcond *cond = get_loop_exit_condition (loop);
 722
 723   *assumptions = boolean_true_node;
 724   *number_of_iterationsm1 = chrec_dont_know;
 725   *number_of_iterations = chrec_dont_know;
 726   DUMP_VECT_SCOPE ("get_loop_niters");
 727
 728   if (!exit)
 729     return cond;
 730
 731   niter = chrec_dont_know;
 732   may_be_zero = NULL_TREE;
 733   niter_assumptions = boolean_true_node;
 734   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 735       || chrec_contains_undetermined (niter_desc.niter))
 736     return cond;
 737
 738   niter_assumptions = niter_desc.assumptions;
 739   may_be_zero = niter_desc.may_be_zero;
 740   niter = niter_desc.niter;
 741
 742   if (may_be_zero && integer_zerop (may_be_zero))
 743     may_be_zero = NULL_TREE;
 744
 745   if (may_be_zero)
 746     {
 747       if (COMPARISON_CLASS_P (may_be_zero))
 748         {
 749           /* Try to combine may_be_zero with assumptions, this can simplify
 750              computation of niter expression.  */
 751           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 752             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 753                                              niter_assumptions,
 754                                              fold_build1 (TRUTH_NOT_EXPR,
 755                                                           boolean_type_node,
 756                                                           may_be_zero));
 757           else
 758             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 759                                  build_int_cst (TREE_TYPE (niter), 0),
 760                                  rewrite_to_non_trapping_overflow (niter));
 761
 762           may_be_zero = NULL_TREE;
 763         }
 764       else if (integer_nonzerop (may_be_zero))
 765         {
 766           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 767           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 768           return cond;
 769         }
 770       else
 771         return cond;
 772     }
 773
 774   *assumptions = niter_assumptions;
 775   *number_of_iterationsm1 = niter;
 776
 777   /* We want the number of loop header executions which is the number
 778      of latch executions plus one.
 779      ???  For UINT_MAX latch executions this number overflows to zero
 780      for loops like do { n++; } while (n != 0);  */
 781   if (niter && !chrec_contains_undetermined (niter))
 782     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 783                           build_int_cst (TREE_TYPE (niter), 1));
 784   *number_of_iterations = niter;
 785
 786   return cond;
 787 }
 788
 789 /* Function bb_in_loop_p
 790
 791    Used as predicate for dfs order traversal of the loop bbs.  */
 792
 793 static bool
 794 bb_in_loop_p (const_basic_block bb, const void *data)
 795 {
 796   const struct loop *const loop = (const struct loop *)data;
 797   if (flow_bb_inside_loop_p (loop, bb))
 798     return true;
 799   return false;
 800 }
 801
 802
 803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 804    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 805
 806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
 807   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 808     loop (loop_in),
 809     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 810     num_itersm1 (NULL_TREE),
 811     num_iters (NULL_TREE),
 812     num_iters_unchanged (NULL_TREE),
 813     num_iters_assumptions (NULL_TREE),
 814     th (0),
 815     versioning_threshold (0),
 816     vectorization_factor (0),
 817     max_vectorization_factor (0),
 818     mask_skip_niters (NULL_TREE),
 819     mask_compare_type (NULL_TREE),
 820     unaligned_dr (NULL),
 821     peeling_for_alignment (0),
 822     ptr_mask (0),
 823     ivexpr_map (NULL),
 824     slp_unrolling_factor (1),
 825     single_scalar_iteration_cost (0),
 826     vectorizable (false),
 827     can_fully_mask_p (true),
 828     fully_masked_p (false),
 829     peeling_for_gaps (false),
 830     peeling_for_niter (false),
 831     operands_swapped (false),
 832     no_data_dependencies (false),
 833     has_mask_store (false),
 834     scalar_loop (NULL),
 835     orig_loop_info (NULL)
 836 {
 837   /* Create/Update stmt_info for all stmts in the loop.  */
 838   basic_block *body = get_loop_body (loop);
 839   for (unsigned int i = 0; i < loop->num_nodes; i++)
 840     {
 841       basic_block bb = body[i];
 842       gimple_stmt_iterator si;
 843
 844       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 845         {
 846           gimple *phi = gsi_stmt (si);
 847           gimple_set_uid (phi, 0);
 848           add_stmt (phi);
 849         }
 850
 851       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 852         {
 853           gimple *stmt = gsi_stmt (si);
 854           gimple_set_uid (stmt, 0);
 855           add_stmt (stmt);
 856         }
 857     }
 858   free (body);
 859
 860   /* CHECKME: We want to visit all BBs before their successors (except for
 861      latch blocks, for which this assertion wouldn't hold).  In the simple
 862      case of the loop forms we allow, a dfs order of the BBs would the same
 863      as reversed postorder traversal, so we are safe.  */
 864
 865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 866                                           bbs, loop->num_nodes, loop);
 867   gcc_assert (nbbs == loop->num_nodes);
 868 }
 869
 870 /* Free all levels of MASKS.  */
 871
 872 void
 873 release_vec_loop_masks (vec_loop_masks *masks)
 874 {
 875   rgroup_masks *rgm;
 876   unsigned int i;
 877   FOR_EACH_VEC_ELT (*masks, i, rgm)
 878     rgm->masks.release ();
 879   masks->release ();
 880 }
 881
 882 /* Free all memory used by the _loop_vec_info, as well as all the
 883    stmt_vec_info structs of all the stmts in the loop.  */
 884
 885 _loop_vec_info::~_loop_vec_info ()
 886 {
 887   int nbbs;
 888   gimple_stmt_iterator si;
 889   int j;
 890
 891   /* ???  We're releasing loop_vinfos en-block.  */
 892   set_stmt_vec_info_vec (&stmt_vec_infos);
 893   nbbs = loop->num_nodes;
 894   for (j = 0; j < nbbs; j++)
 895     {
 896       basic_block bb = bbs[j];
 897       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 898         free_stmt_vec_info (gsi_stmt (si));
 899
 900       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 901         {
 902           gimple *stmt = gsi_stmt (si);
 903
 904           /* We may have broken canonical form by moving a constant
 905              into RHS1 of a commutative op.  Fix such occurrences.  */
 906           if (operands_swapped && is_gimple_assign (stmt))
 907             {
 908               enum tree_code code = gimple_assign_rhs_code (stmt);
 909
 910               if ((code == PLUS_EXPR
 911                    || code == POINTER_PLUS_EXPR
 912                    || code == MULT_EXPR)
 913                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 914                 swap_ssa_operands (stmt,
 915                                    gimple_assign_rhs1_ptr (stmt),
 916                                    gimple_assign_rhs2_ptr (stmt));
 917               else if (code == COND_EXPR
 918                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 919                 {
 920                   tree cond_expr = gimple_assign_rhs1 (stmt);
 921                   enum tree_code cond_code = TREE_CODE (cond_expr);
 922
 923                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 924                     {
 925                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 926                                                                   0));
 927                       cond_code = invert_tree_comparison (cond_code,
 928                                                           honor_nans);
 929                       if (cond_code != ERROR_MARK)
 930                         {
 931                           TREE_SET_CODE (cond_expr, cond_code);
 932                           swap_ssa_operands (stmt,
 933                                              gimple_assign_rhs2_ptr (stmt),
 934                                              gimple_assign_rhs3_ptr (stmt));
 935                         }
 936                     }
 937                 }
 938             }
 939
 940           /* Free stmt_vec_info.  */
 941           free_stmt_vec_info (stmt);
 942           gsi_next (&si);
 943         }
 944     }
 945
 946   free (bbs);
 947
 948   release_vec_loop_masks (&masks);
 949   delete ivexpr_map;
 950
 951   loop->aux = NULL;
 952 }
 953
 954 /* Return an invariant or register for EXPR and emit necessary
 955    computations in the LOOP_VINFO loop preheader.  */
 956
 957 tree
 958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 959 {
 960   if (is_gimple_reg (expr)
 961       || is_gimple_min_invariant (expr))
 962     return expr;
 963
 964   if (! loop_vinfo->ivexpr_map)
 965     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 966   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 967   if (! cached)
 968     {
 969       gimple_seq stmts = NULL;
 970       cached = force_gimple_operand (unshare_expr (expr),
 971                                      &stmts, true, NULL_TREE);
 972       if (stmts)
 973         {
 974           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 975           gsi_insert_seq_on_edge_immediate (e, stmts);
 976         }
 977     }
 978   return cached;
 979 }
 980
 981 /* Return true if we can use CMP_TYPE as the comparison type to produce
 982    all masks required to mask LOOP_VINFO.  */
 983
 984 static bool
 985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 986 {
 987   rgroup_masks *rgm;
 988   unsigned int i;
 989   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 990     if (rgm->mask_type != NULL_TREE
 991         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 992                                             cmp_type, rgm->mask_type,
 993                                             OPTIMIZE_FOR_SPEED))
 994       return false;
 995   return true;
 996 }
 997
 998 /* Calculate the maximum number of scalars per iteration for every
 999    rgroup in LOOP_VINFO.  */
1000
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1003 {
1004   unsigned int res = 1;
1005   unsigned int i;
1006   rgroup_masks *rgm;
1007   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008     res = MAX (res, rgm->max_nscalars_per_iter);
1009   return res;
1010 }
1011
1012 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1013    whether we can actually generate the masks required.  Return true if so,
1014    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1015
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1018 {
1019   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020   unsigned int min_ni_width;
1021
1022   /* Use a normal loop if there are no statements that need masking.
1023      This only happens in rare degenerate cases: it means that the loop
1024      has no loads, no stores, and no live-out values.  */
1025   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026     return false;
1027
1028   /* Get the maximum number of iterations that is representable
1029      in the counter type.  */
1030   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1032
1033   /* Get a more refined estimate for the number of iterations.  */
1034   widest_int max_back_edges;
1035   if (max_loop_iterations (loop, &max_back_edges))
1036     max_ni = wi::smin (max_ni, max_back_edges + 1);
1037
1038   /* Account for rgroup masks, in which each bit is replicated N times.  */
1039   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1040
1041   /* Work out how many bits we need to represent the limit.  */
1042   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1043
1044   /* Find a scalar mode for which WHILE_ULT is supported.  */
1045   opt_scalar_int_mode cmp_mode_iter;
1046   tree cmp_type = NULL_TREE;
1047   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1048     {
1049       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050       if (cmp_bits >= min_ni_width
1051           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1052         {
1053           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054           if (this_type
1055               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1056             {
1057               /* Although we could stop as soon as we find a valid mode,
1058                  it's often better to continue until we hit Pmode, since the
1059                  operands to the WHILE are more likely to be reusable in
1060                  address calculations.  */
1061               cmp_type = this_type;
1062               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063                 break;
1064             }
1065         }
1066     }
1067
1068   if (!cmp_type)
1069     return false;
1070
1071   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072   return true;
1073 }
1074
1075 /* Calculate the cost of one scalar iteration of the loop.  */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1078 {
1079   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081   int nbbs = loop->num_nodes, factor;
1082   int innerloop_iters, i;
1083
1084   /* Gather costs for statements in the scalar loop.  */
1085
1086   /* FORNOW.  */
1087   innerloop_iters = 1;
1088   if (loop->inner)
1089     innerloop_iters = 50; /* FIXME */
1090
1091   for (i = 0; i < nbbs; i++)
1092     {
1093       gimple_stmt_iterator si;
1094       basic_block bb = bbs[i];
1095
1096       if (bb->loop_father == loop->inner)
1097         factor = innerloop_iters;
1098       else
1099         factor = 1;
1100
1101       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1102         {
1103           gimple *stmt = gsi_stmt (si);
1104           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1105
1106           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107             continue;
1108
1109           /* Skip stmts that are not vectorized inside the loop.  */
1110           if (stmt_info
1111               && !STMT_VINFO_RELEVANT_P (stmt_info)
1112               && (!STMT_VINFO_LIVE_P (stmt_info)
1113                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115             continue;
1116
1117           vect_cost_for_stmt kind;
1118           if (STMT_VINFO_DATA_REF (stmt_info))
1119             {
1120               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121                kind = scalar_load;
1122              else
1123                kind = scalar_store;
1124             }
1125           else
1126             kind = scalar_stmt;
1127
1128           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129                             factor, kind, stmt_info, 0, vect_prologue);
1130         }
1131     }
1132
1133   /* Now accumulate cost.  */
1134   void *target_cost_data = init_cost (loop);
1135   stmt_info_for_cost *si;
1136   int j;
1137   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138                     j, si)
1139     (void) add_stmt_cost (target_cost_data, si->count,
1140                           si->kind, si->stmt_info, si->misalign,
1141                           vect_body);
1142   unsigned dummy, body_cost = 0;
1143   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1144   destroy_cost_data (target_cost_data);
1145   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1146 }
1147
1148
1149 /* Function vect_analyze_loop_form_1.
1150
1151    Verify that certain CFG restrictions hold, including:
1152    - the loop has a pre-header
1153    - the loop has a single entry and exit
1154    - the loop exit condition is simple enough
1155    - the number of iterations can be analyzed, i.e, a countable loop.  The
1156      niter could be analyzed under some assumptions.  */
1157
1158 bool
1159 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1160                           tree *assumptions, tree *number_of_iterationsm1,
1161                           tree *number_of_iterations, gcond **inner_loop_cond)
1162 {
1163   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1164
1165   /* Different restrictions apply when we are considering an inner-most loop,
1166      vs. an outer (nested) loop.
1167      (FORNOW. May want to relax some of these restrictions in the future).  */
1168
1169   if (!loop->inner)
1170     {
1171       /* Inner-most loop.  We currently require that the number of BBs is
1172          exactly 2 (the header and latch).  Vectorizable inner-most loops
1173          look like this:
1174
1175                         (pre-header)
1176                            |
1177                           header <--------+
1178                            | |            |
1179                            | +--> latch --+
1180                            |
1181                         (exit-bb)  */
1182
1183       if (loop->num_nodes != 2)
1184         {
1185           if (dump_enabled_p ())
1186             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187                              "not vectorized: control flow in loop.\n");
1188           return false;
1189         }
1190
1191       if (empty_block_p (loop->header))
1192         {
1193           if (dump_enabled_p ())
1194             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195                              "not vectorized: empty loop.\n");
1196           return false;
1197         }
1198     }
1199   else
1200     {
1201       struct loop *innerloop = loop->inner;
1202       edge entryedge;
1203
1204       /* Nested loop. We currently require that the loop is doubly-nested,
1205          contains a single inner loop, and the number of BBs is exactly 5.
1206          Vectorizable outer-loops look like this:
1207
1208                         (pre-header)
1209                            |
1210                           header <---+
1211                            |         |
1212                           inner-loop |
1213                            |         |
1214                           tail ------+
1215                            |
1216                         (exit-bb)
1217
1218          The inner-loop has the properties expected of inner-most loops
1219          as described above.  */
1220
1221       if ((loop->inner)->inner || (loop->inner)->next)
1222         {
1223           if (dump_enabled_p ())
1224             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1225                              "not vectorized: multiple nested loops.\n");
1226           return false;
1227         }
1228
1229       if (loop->num_nodes != 5)
1230         {
1231           if (dump_enabled_p ())
1232             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233                              "not vectorized: control flow in loop.\n");
1234           return false;
1235         }
1236
1237       entryedge = loop_preheader_edge (innerloop);
1238       if (entryedge->src != loop->header
1239           || !single_exit (innerloop)
1240           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1241         {
1242           if (dump_enabled_p ())
1243             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244                              "not vectorized: unsupported outerloop form.\n");
1245           return false;
1246         }
1247
1248       /* Analyze the inner-loop.  */
1249       tree inner_niterm1, inner_niter, inner_assumptions;
1250       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1251                                       &inner_assumptions, &inner_niterm1,
1252                                       &inner_niter, NULL)
1253           /* Don't support analyzing niter under assumptions for inner
1254              loop.  */
1255           || !integer_onep (inner_assumptions))
1256         {
1257           if (dump_enabled_p ())
1258             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259                              "not vectorized: Bad inner loop.\n");
1260           return false;
1261         }
1262
1263       if (!expr_invariant_in_loop_p (loop, inner_niter))
1264         {
1265           if (dump_enabled_p ())
1266             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267                              "not vectorized: inner-loop count not"
1268                              " invariant.\n");
1269           return false;
1270         }
1271
1272       if (dump_enabled_p ())
1273         dump_printf_loc (MSG_NOTE, vect_location,
1274                          "Considering outer-loop vectorization.\n");
1275     }
1276
1277   if (!single_exit (loop)
1278       || EDGE_COUNT (loop->header->preds) != 2)
1279     {
1280       if (dump_enabled_p ())
1281         {
1282           if (!single_exit (loop))
1283             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284                              "not vectorized: multiple exits.\n");
1285           else if (EDGE_COUNT (loop->header->preds) != 2)
1286             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1287                              "not vectorized: too many incoming edges.\n");
1288         }
1289       return false;
1290     }
1291
1292   /* We assume that the loop exit condition is at the end of the loop. i.e,
1293      that the loop is represented as a do-while (with a proper if-guard
1294      before the loop if needed), where the loop header contains all the
1295      executable statements, and the latch is empty.  */
1296   if (!empty_block_p (loop->latch)
1297       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1298     {
1299       if (dump_enabled_p ())
1300         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301                          "not vectorized: latch block not empty.\n");
1302       return false;
1303     }
1304
1305   /* Make sure the exit is not abnormal.  */
1306   edge e = single_exit (loop);
1307   if (e->flags & EDGE_ABNORMAL)
1308     {
1309       if (dump_enabled_p ())
1310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311                          "not vectorized: abnormal loop exit edge.\n");
1312       return false;
1313     }
1314
1315   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1316                                      number_of_iterationsm1);
1317   if (!*loop_cond)
1318     {
1319       if (dump_enabled_p ())
1320         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321                          "not vectorized: complicated exit condition.\n");
1322       return false;
1323     }
1324
1325   if (integer_zerop (*assumptions)
1326       || !*number_of_iterations
1327       || chrec_contains_undetermined (*number_of_iterations))
1328     {
1329       if (dump_enabled_p ())
1330         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331                          "not vectorized: number of iterations cannot be "
1332                          "computed.\n");
1333       return false;
1334     }
1335
1336   if (integer_zerop (*number_of_iterations))
1337     {
1338       if (dump_enabled_p ())
1339         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340                          "not vectorized: number of iterations = 0.\n");
1341       return false;
1342     }
1343
1344   return true;
1345 }
1346
1347 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1348
1349 loop_vec_info
1350 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1351 {
1352   tree assumptions, number_of_iterations, number_of_iterationsm1;
1353   gcond *loop_cond, *inner_loop_cond = NULL;
1354
1355   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1356                                   &assumptions, &number_of_iterationsm1,
1357                                   &number_of_iterations, &inner_loop_cond))
1358     return NULL;
1359
1360   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1361   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1362   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1363   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1364   if (!integer_onep (assumptions))
1365     {
1366       /* We consider to vectorize this loop by versioning it under
1367          some assumptions.  In order to do this, we need to clear
1368          existing information computed by scev and niter analyzer.  */
1369       scev_reset_htab ();
1370       free_numbers_of_iterations_estimates (loop);
1371       /* Also set flag for this loop so that following scev and niter
1372          analysis are done under the assumptions.  */
1373       loop_constraint_set (loop, LOOP_C_FINITE);
1374       /* Also record the assumptions for versioning.  */
1375       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1376     }
1377
1378   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1379     {
1380       if (dump_enabled_p ())
1381         {
1382           dump_printf_loc (MSG_NOTE, vect_location,
1383                            "Symbolic number of iterations is ");
1384           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1385           dump_printf (MSG_NOTE, "\n");
1386         }
1387     }
1388
1389   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1390   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1391   if (inner_loop_cond)
1392     {
1393       stmt_vec_info inner_loop_cond_info
1394         = loop_vinfo->lookup_stmt (inner_loop_cond);
1395       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1396     }
1397
1398   gcc_assert (!loop->aux);
1399   loop->aux = loop_vinfo;
1400   return loop_vinfo;
1401 }
1402
1403
1404
1405 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1406    statements update the vectorization factor.  */
1407
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1410 {
1411   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413   int nbbs = loop->num_nodes;
1414   poly_uint64 vectorization_factor;
1415   int i;
1416
1417   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1418
1419   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420   gcc_assert (known_ne (vectorization_factor, 0U));
1421
1422   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423      vectorization factor of the loop is the unrolling factor required by
1424      the SLP instances.  If that unrolling factor is 1, we say, that we
1425      perform pure SLP on loop - cross iteration parallelism is not
1426      exploited.  */
1427   bool only_slp_in_loop = true;
1428   for (i = 0; i < nbbs; i++)
1429     {
1430       basic_block bb = bbs[i];
1431       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432            gsi_next (&si))
1433         {
1434           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1435           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1436               && STMT_VINFO_RELATED_STMT (stmt_info))
1437             stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1438           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1439                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1440               && !PURE_SLP_STMT (stmt_info))
1441             /* STMT needs both SLP and loop-based vectorization.  */
1442             only_slp_in_loop = false;
1443         }
1444     }
1445
1446   if (only_slp_in_loop)
1447     {
1448       dump_printf_loc (MSG_NOTE, vect_location,
1449                        "Loop contains only SLP stmts\n");
1450       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1451     }
1452   else
1453     {
1454       dump_printf_loc (MSG_NOTE, vect_location,
1455                        "Loop contains SLP and non-SLP stmts\n");
1456       /* Both the vectorization factor and unroll factor have the form
1457          current_vector_size * X for some rational X, so they must have
1458          a common multiple.  */
1459       vectorization_factor
1460         = force_common_multiple (vectorization_factor,
1461                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1462     }
1463
1464   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1465   if (dump_enabled_p ())
1466     {
1467       dump_printf_loc (MSG_NOTE, vect_location,
1468                        "Updating vectorization factor to ");
1469       dump_dec (MSG_NOTE, vectorization_factor);
1470       dump_printf (MSG_NOTE, ".\n");
1471     }
1472 }
1473
1474 /* Return true if STMT_INFO describes a double reduction phi and if
1475    the other phi in the reduction is also relevant for vectorization.
1476    This rejects cases such as:
1477
1478       outer1:
1479         x_1 = PHI <x_3(outer2), ...>;
1480         ...
1481
1482       inner:
1483         x_2 = ...;
1484         ...
1485
1486       outer2:
1487         x_3 = PHI <x_2(inner)>;
1488
1489    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1490
1491 static bool
1492 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1493 {
1494   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1495     return false;
1496
1497   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1498 }
1499
1500 /* Function vect_analyze_loop_operations.
1501
1502    Scan the loop stmts and make sure they are all vectorizable.  */
1503
1504 static bool
1505 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1506 {
1507   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1508   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1509   int nbbs = loop->num_nodes;
1510   int i;
1511   stmt_vec_info stmt_info;
1512   bool need_to_vectorize = false;
1513   bool ok;
1514
1515   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1516
1517   stmt_vector_for_cost cost_vec;
1518   cost_vec.create (2);
1519
1520   for (i = 0; i < nbbs; i++)
1521     {
1522       basic_block bb = bbs[i];
1523
1524       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1525            gsi_next (&si))
1526         {
1527           gphi *phi = si.phi ();
1528           ok = true;
1529
1530           stmt_info = loop_vinfo->lookup_stmt (phi);
1531           if (dump_enabled_p ())
1532             {
1533               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1534               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1535             }
1536           if (virtual_operand_p (gimple_phi_result (phi)))
1537             continue;
1538
1539           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1540              (i.e., a phi in the tail of the outer-loop).  */
1541           if (! is_loop_header_bb_p (bb))
1542             {
1543               /* FORNOW: we currently don't support the case that these phis
1544                  are not used in the outerloop (unless it is double reduction,
1545                  i.e., this phi is vect_reduction_def), cause this case
1546                  requires to actually do something here.  */
1547               if (STMT_VINFO_LIVE_P (stmt_info)
1548                   && !vect_active_double_reduction_p (stmt_info))
1549                 {
1550                   if (dump_enabled_p ())
1551                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552                                      "Unsupported loop-closed phi in "
1553                                      "outer-loop.\n");
1554                   return false;
1555                 }
1556
1557               /* If PHI is used in the outer loop, we check that its operand
1558                  is defined in the inner loop.  */
1559               if (STMT_VINFO_RELEVANT_P (stmt_info))
1560                 {
1561                   tree phi_op;
1562
1563                   if (gimple_phi_num_args (phi) != 1)
1564                     return false;
1565
1566                   phi_op = PHI_ARG_DEF (phi, 0);
1567                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1568                   if (!op_def_info)
1569                     return false;
1570
1571                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1572                       && (STMT_VINFO_RELEVANT (op_def_info)
1573                           != vect_used_in_outer_by_reduction))
1574                     return false;
1575                 }
1576
1577               continue;
1578             }
1579
1580           gcc_assert (stmt_info);
1581
1582           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1583                || STMT_VINFO_LIVE_P (stmt_info))
1584               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1585             {
1586               /* A scalar-dependence cycle that we don't support.  */
1587               if (dump_enabled_p ())
1588                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589                                  "not vectorized: scalar dependence cycle.\n");
1590               return false;
1591             }
1592
1593           if (STMT_VINFO_RELEVANT_P (stmt_info))
1594             {
1595               need_to_vectorize = true;
1596               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1597                   && ! PURE_SLP_STMT (stmt_info))
1598                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1599               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1600                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1601                        && ! PURE_SLP_STMT (stmt_info))
1602                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1603                                              &cost_vec);
1604             }
1605
1606           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1607           if (ok
1608               && STMT_VINFO_LIVE_P (stmt_info)
1609               && !PURE_SLP_STMT (stmt_info))
1610             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1611                                               &cost_vec);
1612
1613           if (!ok)
1614             {
1615               if (dump_enabled_p ())
1616                 {
1617                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1618                                    "not vectorized: relevant phi not "
1619                                    "supported: ");
1620                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1621                 }
1622               return false;
1623             }
1624         }
1625
1626       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1627            gsi_next (&si))
1628         {
1629           gimple *stmt = gsi_stmt (si);
1630           if (!gimple_clobber_p (stmt)
1631               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1632                                      &cost_vec))
1633             return false;
1634         }
1635     } /* bbs */
1636
1637   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1638   cost_vec.release ();
1639
1640   /* All operations in the loop are either irrelevant (deal with loop
1641      control, or dead), or only used outside the loop and can be moved
1642      out of the loop (e.g. invariants, inductions).  The loop can be
1643      optimized away by scalar optimizations.  We're better off not
1644      touching this loop.  */
1645   if (!need_to_vectorize)
1646     {
1647       if (dump_enabled_p ())
1648         dump_printf_loc (MSG_NOTE, vect_location,
1649                          "All the computation can be taken out of the loop.\n");
1650       if (dump_enabled_p ())
1651         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652                          "not vectorized: redundant loop. no profit to "
1653                          "vectorize.\n");
1654       return false;
1655     }
1656
1657   return true;
1658 }
1659
1660 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1661    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1662    definitely no, or -1 if it's worth retrying.  */
1663
1664 static int
1665 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1666 {
1667   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1669
1670   /* Only fully-masked loops can have iteration counts less than the
1671      vectorization factor.  */
1672   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1673     {
1674       HOST_WIDE_INT max_niter;
1675
1676       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1677         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1678       else
1679         max_niter = max_stmt_executions_int (loop);
1680
1681       if (max_niter != -1
1682           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1683         {
1684           if (dump_enabled_p ())
1685             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                              "not vectorized: iteration count smaller than "
1687                              "vectorization factor.\n");
1688           return 0;
1689         }
1690     }
1691
1692   int min_profitable_iters, min_profitable_estimate;
1693   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1694                                       &min_profitable_estimate);
1695
1696   if (min_profitable_iters < 0)
1697     {
1698       if (dump_enabled_p ())
1699         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700                          "not vectorized: vectorization not profitable.\n");
1701       if (dump_enabled_p ())
1702         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1703                          "not vectorized: vector version will never be "
1704                          "profitable.\n");
1705       return -1;
1706     }
1707
1708   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1709                                * assumed_vf);
1710
1711   /* Use the cost model only if it is more conservative than user specified
1712      threshold.  */
1713   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1714                                     min_profitable_iters);
1715
1716   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1717
1718   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1719       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1720     {
1721       if (dump_enabled_p ())
1722         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723                          "not vectorized: vectorization not profitable.\n");
1724       if (dump_enabled_p ())
1725         dump_printf_loc (MSG_NOTE, vect_location,
1726                          "not vectorized: iteration count smaller than user "
1727                          "specified loop bound parameter or minimum profitable "
1728                          "iterations (whichever is more conservative).\n");
1729       return 0;
1730     }
1731
1732   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1733   if (estimated_niter == -1)
1734     estimated_niter = likely_max_stmt_executions_int (loop);
1735   if (estimated_niter != -1
1736       && ((unsigned HOST_WIDE_INT) estimated_niter
1737           < MAX (th, (unsigned) min_profitable_estimate)))
1738     {
1739       if (dump_enabled_p ())
1740         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741                          "not vectorized: estimated iteration count too "
1742                          "small.\n");
1743       if (dump_enabled_p ())
1744         dump_printf_loc (MSG_NOTE, vect_location,
1745                          "not vectorized: estimated iteration count smaller "
1746                          "than specified loop bound parameter or minimum "
1747                          "profitable iterations (whichever is more "
1748                          "conservative).\n");
1749       return -1;
1750     }
1751
1752   return 1;
1753 }
1754
1755 static bool
1756 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1757                            vec<data_reference_p> *datarefs,
1758                            unsigned int *n_stmts)
1759 {
1760   *n_stmts = 0;
1761   for (unsigned i = 0; i < loop->num_nodes; i++)
1762     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1763          !gsi_end_p (gsi); gsi_next (&gsi))
1764       {
1765         gimple *stmt = gsi_stmt (gsi);
1766         if (is_gimple_debug (stmt))
1767           continue;
1768         ++(*n_stmts);
1769         if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1770           {
1771             if (is_gimple_call (stmt) && loop->safelen)
1772               {
1773                 tree fndecl = gimple_call_fndecl (stmt), op;
1774                 if (fndecl != NULL_TREE)
1775                   {
1776                     cgraph_node *node = cgraph_node::get (fndecl);
1777                     if (node != NULL && node->simd_clones != NULL)
1778                       {
1779                         unsigned int j, n = gimple_call_num_args (stmt);
1780                         for (j = 0; j < n; j++)
1781                           {
1782                             op = gimple_call_arg (stmt, j);
1783                             if (DECL_P (op)
1784                                 || (REFERENCE_CLASS_P (op)
1785                                     && get_base_address (op)))
1786                               break;
1787                           }
1788                         op = gimple_call_lhs (stmt);
1789                         /* Ignore #pragma omp declare simd functions
1790                            if they don't have data references in the
1791                            call stmt itself.  */
1792                         if (j == n
1793                             && !(op
1794                                  && (DECL_P (op)
1795                                      || (REFERENCE_CLASS_P (op)
1796                                          && get_base_address (op)))))
1797                           continue;
1798                       }
1799                   }
1800               }
1801             return false;
1802           }
1803         /* If dependence analysis will give up due to the limit on the
1804            number of datarefs stop here and fail fatally.  */
1805         if (datarefs->length ()
1806             > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1807           return false;
1808       }
1809   return true;
1810 }
1811
1812 /* Function vect_analyze_loop_2.
1813
1814    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1815    for it.  The different analyses will record information in the
1816    loop_vec_info struct.  */
1817 static bool
1818 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1819 {
1820   bool ok;
1821   int res;
1822   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1823   poly_uint64 min_vf = 2;
1824
1825   /* The first group of checks is independent of the vector size.  */
1826   fatal = true;
1827
1828   /* Find all data references in the loop (which correspond to vdefs/vuses)
1829      and analyze their evolution in the loop.  */
1830
1831   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1832
1833   /* Gather the data references and count stmts in the loop.  */
1834   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1835     {
1836       if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1837                                       &LOOP_VINFO_DATAREFS (loop_vinfo),
1838                                       n_stmts))
1839         {
1840           if (dump_enabled_p ())
1841             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1842                              "not vectorized: loop contains function "
1843                              "calls or data references that cannot "
1844                              "be analyzed\n");
1845           return false;
1846         }
1847       loop_vinfo->shared->save_datarefs ();
1848     }
1849   else
1850     loop_vinfo->shared->check_datarefs ();
1851
1852   /* Analyze the data references and also adjust the minimal
1853      vectorization factor according to the loads and stores.  */
1854
1855   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1856   if (!ok)
1857     {
1858       if (dump_enabled_p ())
1859         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860                          "bad data references.\n");
1861       return false;
1862     }
1863
1864   /* Classify all cross-iteration scalar data-flow cycles.
1865      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1866   vect_analyze_scalar_cycles (loop_vinfo);
1867
1868   vect_pattern_recog (loop_vinfo);
1869
1870   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1871
1872   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1873      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1874
1875   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1876   if (!ok)
1877     {
1878       if (dump_enabled_p ())
1879         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1880                          "bad data access.\n");
1881       return false;
1882     }
1883
1884   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1885
1886   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1887   if (!ok)
1888     {
1889       if (dump_enabled_p ())
1890         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891                          "unexpected pattern.\n");
1892       return false;
1893     }
1894
1895   /* While the rest of the analysis below depends on it in some way.  */
1896   fatal = false;
1897
1898   /* Analyze data dependences between the data-refs in the loop
1899      and adjust the maximum vectorization factor according to
1900      the dependences.
1901      FORNOW: fail at the first data dependence that we encounter.  */
1902
1903   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1904   if (!ok
1905       || (max_vf != MAX_VECTORIZATION_FACTOR
1906           && maybe_lt (max_vf, min_vf)))
1907     {
1908       if (dump_enabled_p ())
1909             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910                              "bad data dependence.\n");
1911       return false;
1912     }
1913   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1914
1915   ok = vect_determine_vectorization_factor (loop_vinfo);
1916   if (!ok)
1917     {
1918       if (dump_enabled_p ())
1919         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920                          "can't determine vectorization factor.\n");
1921       return false;
1922     }
1923   if (max_vf != MAX_VECTORIZATION_FACTOR
1924       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1925     {
1926       if (dump_enabled_p ())
1927         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928                          "bad data dependence.\n");
1929       return false;
1930     }
1931
1932   /* Compute the scalar iteration cost.  */
1933   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1934
1935   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1936   unsigned th;
1937
1938   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1939   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1940   if (!ok)
1941     return false;
1942
1943   /* If there are any SLP instances mark them as pure_slp.  */
1944   bool slp = vect_make_slp_decision (loop_vinfo);
1945   if (slp)
1946     {
1947       /* Find stmts that need to be both vectorized and SLPed.  */
1948       vect_detect_hybrid_slp (loop_vinfo);
1949
1950       /* Update the vectorization factor based on the SLP decision.  */
1951       vect_update_vf_for_slp (loop_vinfo);
1952     }
1953
1954   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1955
1956   /* We don't expect to have to roll back to anything other than an empty
1957      set of rgroups.  */
1958   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1959
1960   /* This is the point where we can re-start analysis with SLP forced off.  */
1961 start_over:
1962
1963   /* Now the vectorization factor is final.  */
1964   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1965   gcc_assert (known_ne (vectorization_factor, 0U));
1966
1967   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1968     {
1969       dump_printf_loc (MSG_NOTE, vect_location,
1970                        "vectorization_factor = ");
1971       dump_dec (MSG_NOTE, vectorization_factor);
1972       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1973                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1974     }
1975
1976   HOST_WIDE_INT max_niter
1977     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1978
1979   /* Analyze the alignment of the data-refs in the loop.
1980      Fail if a data reference is found that cannot be vectorized.  */
1981
1982   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1983   if (!ok)
1984     {
1985       if (dump_enabled_p ())
1986         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1987                          "bad data alignment.\n");
1988       return false;
1989     }
1990
1991   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1992      It is important to call pruning after vect_analyze_data_ref_accesses,
1993      since we use grouping information gathered by interleaving analysis.  */
1994   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1995   if (!ok)
1996     return false;
1997
1998   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
1999      vectorization.  */
2000   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2001     {
2002     /* This pass will decide on using loop versioning and/or loop peeling in
2003        order to enhance the alignment of data references in the loop.  */
2004     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2005     if (!ok)
2006       {
2007         if (dump_enabled_p ())
2008           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009                            "bad data alignment.\n");
2010         return false;
2011       }
2012     }
2013
2014   if (slp)
2015     {
2016       /* Analyze operations in the SLP instances.  Note this may
2017          remove unsupported SLP instances which makes the above
2018          SLP kind detection invalid.  */
2019       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2020       vect_slp_analyze_operations (loop_vinfo);
2021       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2022         goto again;
2023     }
2024
2025   /* Scan all the remaining operations in the loop that are not subject
2026      to SLP and make sure they are vectorizable.  */
2027   ok = vect_analyze_loop_operations (loop_vinfo);
2028   if (!ok)
2029     {
2030       if (dump_enabled_p ())
2031         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032                          "bad operation or unsupported loop bound.\n");
2033       return false;
2034     }
2035
2036   /* Decide whether to use a fully-masked loop for this vectorization
2037      factor.  */
2038   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2039     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2040        && vect_verify_full_masking (loop_vinfo));
2041   if (dump_enabled_p ())
2042     {
2043       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2044         dump_printf_loc (MSG_NOTE, vect_location,
2045                          "using a fully-masked loop.\n");
2046       else
2047         dump_printf_loc (MSG_NOTE, vect_location,
2048                          "not using a fully-masked loop.\n");
2049     }
2050
2051   /* If epilog loop is required because of data accesses with gaps,
2052      one additional iteration needs to be peeled.  Check if there is
2053      enough iterations for vectorization.  */
2054   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2055       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2056       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2057     {
2058       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2060
2061       if (known_lt (wi::to_widest (scalar_niters), vf))
2062         {
2063           if (dump_enabled_p ())
2064             dump_printf_loc (MSG_NOTE, vect_location,
2065                              "loop has no enough iterations to support"
2066                              " peeling for gaps.\n");
2067           return false;
2068         }
2069     }
2070
2071   /* Check the costings of the loop make vectorizing worthwhile.  */
2072   res = vect_analyze_loop_costing (loop_vinfo);
2073   if (res < 0)
2074     goto again;
2075   if (!res)
2076     {
2077       if (dump_enabled_p ())
2078         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079                          "Loop costings not worthwhile.\n");
2080       return false;
2081     }
2082
2083   /* Decide whether we need to create an epilogue loop to handle
2084      remaining scalar iterations.  */
2085   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2086
2087   unsigned HOST_WIDE_INT const_vf;
2088   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2089     /* The main loop handles all iterations.  */
2090     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2091   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2092            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2093     {
2094       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2095                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2096                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2097         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2098     }
2099   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2100            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2101            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2102                 < (unsigned) exact_log2 (const_vf))
2103                /* In case of versioning, check if the maximum number of
2104                   iterations is greater than th.  If they are identical,
2105                   the epilogue is unnecessary.  */
2106                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2107                    || ((unsigned HOST_WIDE_INT) max_niter
2108                        > (th / const_vf) * const_vf))))
2109     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2110
2111   /* If an epilogue loop is required make sure we can create one.  */
2112   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2113       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2114     {
2115       if (dump_enabled_p ())
2116         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2117       if (!vect_can_advance_ivs_p (loop_vinfo)
2118           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2119                                            single_exit (LOOP_VINFO_LOOP
2120                                                          (loop_vinfo))))
2121         {
2122           if (dump_enabled_p ())
2123             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124                              "not vectorized: can't create required "
2125                              "epilog loop\n");
2126           goto again;
2127         }
2128     }
2129
2130   /* During peeling, we need to check if number of loop iterations is
2131      enough for both peeled prolog loop and vector loop.  This check
2132      can be merged along with threshold check of loop versioning, so
2133      increase threshold for this case if necessary.  */
2134   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2135     {
2136       poly_uint64 niters_th = 0;
2137
2138       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2139         {
2140           /* Niters for peeled prolog loop.  */
2141           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2142             {
2143               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2144               tree vectype = STMT_VINFO_VECTYPE (vect_dr_stmt (dr));
2145               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2146             }
2147           else
2148             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2149         }
2150
2151       /* Niters for at least one iteration of vectorized loop.  */
2152       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2153         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2154       /* One additional iteration because of peeling for gap.  */
2155       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2156         niters_th += 1;
2157       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2158     }
2159
2160   gcc_assert (known_eq (vectorization_factor,
2161                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2162
2163   /* Ok to vectorize!  */
2164   return true;
2165
2166 again:
2167   /* Try again with SLP forced off but if we didn't do any SLP there is
2168      no point in re-trying.  */
2169   if (!slp)
2170     return false;
2171
2172   /* If there are reduction chains re-trying will fail anyway.  */
2173   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2174     return false;
2175
2176   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2177      via interleaving or lane instructions.  */
2178   slp_instance instance;
2179   slp_tree node;
2180   unsigned i, j;
2181   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2182     {
2183       stmt_vec_info vinfo;
2184       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2185       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2186         continue;
2187       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2188       unsigned int size = DR_GROUP_SIZE (vinfo);
2189       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2190       if (! vect_store_lanes_supported (vectype, size, false)
2191          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2192          && ! vect_grouped_store_supported (vectype, size))
2193        return false;
2194       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2195         {
2196           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2197           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2198           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2199           size = DR_GROUP_SIZE (vinfo);
2200           vectype = STMT_VINFO_VECTYPE (vinfo);
2201           if (! vect_load_lanes_supported (vectype, size, false)
2202               && ! vect_grouped_load_supported (vectype, single_element_p,
2203                                                 size))
2204             return false;
2205         }
2206     }
2207
2208   if (dump_enabled_p ())
2209     dump_printf_loc (MSG_NOTE, vect_location,
2210                      "re-trying with SLP disabled\n");
2211
2212   /* Roll back state appropriately.  No SLP this time.  */
2213   slp = false;
2214   /* Restore vectorization factor as it were without SLP.  */
2215   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2216   /* Free the SLP instances.  */
2217   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2218     vect_free_slp_instance (instance, false);
2219   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2220   /* Reset SLP type to loop_vect on all stmts.  */
2221   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2222     {
2223       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2224       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2225            !gsi_end_p (si); gsi_next (&si))
2226         {
2227           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2228           STMT_SLP_TYPE (stmt_info) = loop_vect;
2229         }
2230       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2231            !gsi_end_p (si); gsi_next (&si))
2232         {
2233           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2234           STMT_SLP_TYPE (stmt_info) = loop_vect;
2235           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2236             {
2237               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2238               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2239               STMT_SLP_TYPE (stmt_info) = loop_vect;
2240               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2241                    !gsi_end_p (pi); gsi_next (&pi))
2242                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2243                   = loop_vect;
2244             }
2245         }
2246     }
2247   /* Free optimized alias test DDRS.  */
2248   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2249   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2250   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2251   /* Reset target cost data.  */
2252   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2253   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2254     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2255   /* Reset accumulated rgroup information.  */
2256   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2257   /* Reset assorted flags.  */
2258   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2259   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2260   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2261   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2262   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2263
2264   goto start_over;
2265 }
2266
2267 /* Function vect_analyze_loop.
2268
2269    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2270    for it.  The different analyses will record information in the
2271    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2272    be vectorized.  */
2273 loop_vec_info
2274 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2275                    vec_info_shared *shared)
2276 {
2277   loop_vec_info loop_vinfo;
2278   auto_vector_sizes vector_sizes;
2279
2280   /* Autodetect first vector size we try.  */
2281   current_vector_size = 0;
2282   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2283   unsigned int next_size = 0;
2284
2285   DUMP_VECT_SCOPE ("analyze_loop_nest");
2286
2287   if (loop_outer (loop)
2288       && loop_vec_info_for_loop (loop_outer (loop))
2289       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2290     {
2291       if (dump_enabled_p ())
2292         dump_printf_loc (MSG_NOTE, vect_location,
2293                          "outer-loop already vectorized.\n");
2294       return NULL;
2295     }
2296
2297   if (!find_loop_nest (loop, &shared->loop_nest))
2298     {
2299       if (dump_enabled_p ())
2300         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301                          "not vectorized: loop nest containing two "
2302                          "or more consecutive inner loops cannot be "
2303                          "vectorized\n");
2304       return NULL;
2305     }
2306
2307   unsigned n_stmts = 0;
2308   poly_uint64 autodetected_vector_size = 0;
2309   while (1)
2310     {
2311       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2312       loop_vinfo = vect_analyze_loop_form (loop, shared);
2313       if (!loop_vinfo)
2314         {
2315           if (dump_enabled_p ())
2316             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                              "bad loop form.\n");
2318           return NULL;
2319         }
2320
2321       bool fatal = false;
2322
2323       if (orig_loop_vinfo)
2324         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2325
2326       if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2327         {
2328           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2329
2330           return loop_vinfo;
2331         }
2332
2333       delete loop_vinfo;
2334
2335       if (next_size == 0)
2336         autodetected_vector_size = current_vector_size;
2337
2338       if (next_size < vector_sizes.length ()
2339           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2340         next_size += 1;
2341
2342       if (fatal
2343           || next_size == vector_sizes.length ()
2344           || known_eq (current_vector_size, 0U))
2345         return NULL;
2346
2347       /* Try the next biggest vector size.  */
2348       current_vector_size = vector_sizes[next_size++];
2349       if (dump_enabled_p ())
2350         {
2351           dump_printf_loc (MSG_NOTE, vect_location,
2352                            "***** Re-trying analysis with "
2353                            "vector size ");
2354           dump_dec (MSG_NOTE, current_vector_size);
2355           dump_printf (MSG_NOTE, "\n");
2356         }
2357     }
2358 }
2359
2360 /* Return true if there is an in-order reduction function for CODE, storing
2361    it in *REDUC_FN if so.  */
2362
2363 static bool
2364 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2365 {
2366   switch (code)
2367     {
2368     case PLUS_EXPR:
2369       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2370       return true;
2371
2372     default:
2373       return false;
2374     }
2375 }
2376
2377 /* Function reduction_fn_for_scalar_code
2378
2379    Input:
2380    CODE - tree_code of a reduction operations.
2381
2382    Output:
2383    REDUC_FN - the corresponding internal function to be used to reduce the
2384       vector of partial results into a single scalar result, or IFN_LAST
2385       if the operation is a supported reduction operation, but does not have
2386       such an internal function.
2387
2388    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2389
2390 static bool
2391 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2392 {
2393   switch (code)
2394     {
2395       case MAX_EXPR:
2396         *reduc_fn = IFN_REDUC_MAX;
2397         return true;
2398
2399       case MIN_EXPR:
2400         *reduc_fn = IFN_REDUC_MIN;
2401         return true;
2402
2403       case PLUS_EXPR:
2404         *reduc_fn = IFN_REDUC_PLUS;
2405         return true;
2406
2407       case BIT_AND_EXPR:
2408         *reduc_fn = IFN_REDUC_AND;
2409         return true;
2410
2411       case BIT_IOR_EXPR:
2412         *reduc_fn = IFN_REDUC_IOR;
2413         return true;
2414
2415       case BIT_XOR_EXPR:
2416         *reduc_fn = IFN_REDUC_XOR;
2417         return true;
2418
2419       case MULT_EXPR:
2420       case MINUS_EXPR:
2421         *reduc_fn = IFN_LAST;
2422         return true;
2423
2424       default:
2425        return false;
2426     }
2427 }
2428
2429 /* If there is a neutral value X such that SLP reduction NODE would not
2430    be affected by the introduction of additional X elements, return that X,
2431    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2432    is true if the SLP statements perform a single reduction, false if each
2433    statement performs an independent reduction.  */
2434
2435 static tree
2436 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2437                               bool reduc_chain)
2438 {
2439   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2440   stmt_vec_info stmt_vinfo = stmts[0];
2441   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2442   tree scalar_type = TREE_TYPE (vector_type);
2443   struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2444   gcc_assert (loop);
2445
2446   switch (code)
2447     {
2448     case WIDEN_SUM_EXPR:
2449     case DOT_PROD_EXPR:
2450     case SAD_EXPR:
2451     case PLUS_EXPR:
2452     case MINUS_EXPR:
2453     case BIT_IOR_EXPR:
2454     case BIT_XOR_EXPR:
2455       return build_zero_cst (scalar_type);
2456
2457     case MULT_EXPR:
2458       return build_one_cst (scalar_type);
2459
2460     case BIT_AND_EXPR:
2461       return build_all_ones_cst (scalar_type);
2462
2463     case MAX_EXPR:
2464     case MIN_EXPR:
2465       /* For MIN/MAX the initial values are neutral.  A reduction chain
2466          has only a single initial value, so that value is neutral for
2467          all statements.  */
2468       if (reduc_chain)
2469         return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2470                                       loop_preheader_edge (loop));
2471       return NULL_TREE;
2472
2473     default:
2474       return NULL_TREE;
2475     }
2476 }
2477
2478 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2479    STMT is printed with a message MSG. */
2480
2481 static void
2482 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2483 {
2484   dump_printf_loc (msg_type, vect_location, "%s", msg);
2485   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2486 }
2487
2488 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2489    operation.  Return true if the results of DEF_STMT_INFO are something
2490    that can be accumulated by such a reduction.  */
2491
2492 static bool
2493 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2494 {
2495   return (is_gimple_assign (def_stmt_info->stmt)
2496           || is_gimple_call (def_stmt_info->stmt)
2497           || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2498           || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2499               && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2500               && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2501 }
2502
2503 /* Detect SLP reduction of the form:
2504
2505    #a1 = phi <a5, a0>
2506    a2 = operation (a1)
2507    a3 = operation (a2)
2508    a4 = operation (a3)
2509    a5 = operation (a4)
2510
2511    #a = phi <a5>
2512
2513    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2514    FIRST_STMT is the first reduction stmt in the chain
2515    (a2 = operation (a1)).
2516
2517    Return TRUE if a reduction chain was detected.  */
2518
2519 static bool
2520 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2521                        gimple *first_stmt)
2522 {
2523   struct loop *loop = (gimple_bb (phi))->loop_father;
2524   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2525   enum tree_code code;
2526   gimple *loop_use_stmt = NULL;
2527   stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2528   tree lhs;
2529   imm_use_iterator imm_iter;
2530   use_operand_p use_p;
2531   int nloop_uses, size = 0, n_out_of_loop_uses;
2532   bool found = false;
2533
2534   if (loop != vect_loop)
2535     return false;
2536
2537   lhs = PHI_RESULT (phi);
2538   code = gimple_assign_rhs_code (first_stmt);
2539   while (1)
2540     {
2541       nloop_uses = 0;
2542       n_out_of_loop_uses = 0;
2543       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2544         {
2545           gimple *use_stmt = USE_STMT (use_p);
2546           if (is_gimple_debug (use_stmt))
2547             continue;
2548
2549           /* Check if we got back to the reduction phi.  */
2550           if (use_stmt == phi)
2551             {
2552               loop_use_stmt = use_stmt;
2553               found = true;
2554               break;
2555             }
2556
2557           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2558             {
2559               loop_use_stmt = use_stmt;
2560               nloop_uses++;
2561             }
2562            else
2563              n_out_of_loop_uses++;
2564
2565            /* There are can be either a single use in the loop or two uses in
2566               phi nodes.  */
2567            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2568              return false;
2569         }
2570
2571       if (found)
2572         break;
2573
2574       /* We reached a statement with no loop uses.  */
2575       if (nloop_uses == 0)
2576         return false;
2577
2578       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2579       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2580         return false;
2581
2582       if (!is_gimple_assign (loop_use_stmt)
2583           || code != gimple_assign_rhs_code (loop_use_stmt)
2584           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2585         return false;
2586
2587       /* Insert USE_STMT into reduction chain.  */
2588       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2589       if (current_stmt_info)
2590         {
2591           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2592           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2593             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2594         }
2595       else
2596         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2597
2598       lhs = gimple_assign_lhs (loop_use_stmt);
2599       current_stmt_info = use_stmt_info;
2600       size++;
2601    }
2602
2603   if (!found || loop_use_stmt != phi || size < 2)
2604     return false;
2605
2606   /* Swap the operands, if needed, to make the reduction operand be the second
2607      operand.  */
2608   lhs = PHI_RESULT (phi);
2609   stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2610   while (next_stmt_info)
2611     {
2612       gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2613       if (gimple_assign_rhs2 (next_stmt) == lhs)
2614         {
2615           tree op = gimple_assign_rhs1 (next_stmt);
2616           stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2617
2618           /* Check that the other def is either defined in the loop
2619              ("vect_internal_def"), or it's an induction (defined by a
2620              loop-header phi-node).  */
2621           if (def_stmt_info
2622               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2623               && vect_valid_reduction_input_p (def_stmt_info))
2624             {
2625               lhs = gimple_assign_lhs (next_stmt);
2626               next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2627               continue;
2628             }
2629
2630           return false;
2631         }
2632       else
2633         {
2634           tree op = gimple_assign_rhs2 (next_stmt);
2635           stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2636
2637           /* Check that the other def is either defined in the loop
2638             ("vect_internal_def"), or it's an induction (defined by a
2639             loop-header phi-node).  */
2640           if (def_stmt_info
2641               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2642               && vect_valid_reduction_input_p (def_stmt_info))
2643             {
2644               if (dump_enabled_p ())
2645                 {
2646                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2647                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2648                 }
2649
2650               swap_ssa_operands (next_stmt,
2651                                  gimple_assign_rhs1_ptr (next_stmt),
2652                                  gimple_assign_rhs2_ptr (next_stmt));
2653               update_stmt (next_stmt);
2654
2655               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2656                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2657             }
2658           else
2659             return false;
2660         }
2661
2662       lhs = gimple_assign_lhs (next_stmt);
2663       next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2664     }
2665
2666   /* Save the chain for further analysis in SLP detection.  */
2667   stmt_vec_info first_stmt_info
2668     = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2669   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2670   REDUC_GROUP_SIZE (first_stmt_info) = size;
2671
2672   return true;
2673 }
2674
2675 /* Return true if we need an in-order reduction for operation CODE
2676    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2677    overflow must wrap.  */
2678
2679 static bool
2680 needs_fold_left_reduction_p (tree type, tree_code code,
2681                              bool need_wrapping_integral_overflow)
2682 {
2683   /* CHECKME: check for !flag_finite_math_only too?  */
2684   if (SCALAR_FLOAT_TYPE_P (type))
2685     switch (code)
2686       {
2687       case MIN_EXPR:
2688       case MAX_EXPR:
2689         return false;
2690
2691       default:
2692         return !flag_associative_math;
2693       }
2694
2695   if (INTEGRAL_TYPE_P (type))
2696     {
2697       if (!operation_no_trapping_overflow (type, code))
2698         return true;
2699       if (need_wrapping_integral_overflow
2700           && !TYPE_OVERFLOW_WRAPS (type)
2701           && operation_can_overflow (code))
2702         return true;
2703       return false;
2704     }
2705
2706   if (SAT_FIXED_POINT_TYPE_P (type))
2707     return true;
2708
2709   return false;
2710 }
2711
2712 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2713    reduction operation CODE has a handled computation expression.  */
2714
2715 bool
2716 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2717                       tree loop_arg, enum tree_code code)
2718 {
2719   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2720   auto_bitmap visited;
2721   tree lookfor = PHI_RESULT (phi);
2722   ssa_op_iter curri;
2723   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2724   while (USE_FROM_PTR (curr) != loop_arg)
2725     curr = op_iter_next_use (&curri);
2726   curri.i = curri.numops;
2727   do
2728     {
2729       path.safe_push (std::make_pair (curri, curr));
2730       tree use = USE_FROM_PTR (curr);
2731       if (use == lookfor)
2732         break;
2733       gimple *def = SSA_NAME_DEF_STMT (use);
2734       if (gimple_nop_p (def)
2735           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2736         {
2737 pop:
2738           do
2739             {
2740               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2741               curri = x.first;
2742               curr = x.second;
2743               do
2744                 curr = op_iter_next_use (&curri);
2745               /* Skip already visited or non-SSA operands (from iterating
2746                  over PHI args).  */
2747               while (curr != NULL_USE_OPERAND_P
2748                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2749                          || ! bitmap_set_bit (visited,
2750                                               SSA_NAME_VERSION
2751                                                 (USE_FROM_PTR (curr)))));
2752             }
2753           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2754           if (curr == NULL_USE_OPERAND_P)
2755             break;
2756         }
2757       else
2758         {
2759           if (gimple_code (def) == GIMPLE_PHI)
2760             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2761           else
2762             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2763           while (curr != NULL_USE_OPERAND_P
2764                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2765                      || ! bitmap_set_bit (visited,
2766                                           SSA_NAME_VERSION
2767                                             (USE_FROM_PTR (curr)))))
2768             curr = op_iter_next_use (&curri);
2769           if (curr == NULL_USE_OPERAND_P)
2770             goto pop;
2771         }
2772     }
2773   while (1);
2774   if (dump_file && (dump_flags & TDF_DETAILS))
2775     {
2776       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2777       unsigned i;
2778       std::pair<ssa_op_iter, use_operand_p> *x;
2779       FOR_EACH_VEC_ELT (path, i, x)
2780         {
2781           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2782           dump_printf (MSG_NOTE, " ");
2783         }
2784       dump_printf (MSG_NOTE, "\n");
2785     }
2786
2787   /* Check whether the reduction path detected is valid.  */
2788   bool fail = path.length () == 0;
2789   bool neg = false;
2790   for (unsigned i = 1; i < path.length (); ++i)
2791     {
2792       gimple *use_stmt = USE_STMT (path[i].second);
2793       tree op = USE_FROM_PTR (path[i].second);
2794       if (! has_single_use (op)
2795           || ! is_gimple_assign (use_stmt))
2796         {
2797           fail = true;
2798           break;
2799         }
2800       if (gimple_assign_rhs_code (use_stmt) != code)
2801         {
2802           if (code == PLUS_EXPR
2803               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2804             {
2805               /* Track whether we negate the reduction value each iteration.  */
2806               if (gimple_assign_rhs2 (use_stmt) == op)
2807                 neg = ! neg;
2808             }
2809           else
2810             {
2811               fail = true;
2812               break;
2813             }
2814         }
2815     }
2816   return ! fail && ! neg;
2817 }
2818
2819
2820 /* Function vect_is_simple_reduction
2821
2822    (1) Detect a cross-iteration def-use cycle that represents a simple
2823    reduction computation.  We look for the following pattern:
2824
2825    loop_header:
2826      a1 = phi < a0, a2 >
2827      a3 = ...
2828      a2 = operation (a3, a1)
2829
2830    or
2831
2832    a3 = ...
2833    loop_header:
2834      a1 = phi < a0, a2 >
2835      a2 = operation (a3, a1)
2836
2837    such that:
2838    1. operation is commutative and associative and it is safe to
2839       change the order of the computation
2840    2. no uses for a2 in the loop (a2 is used out of the loop)
2841    3. no uses of a1 in the loop besides the reduction operation
2842    4. no uses of a1 outside the loop.
2843
2844    Conditions 1,4 are tested here.
2845    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2846
2847    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2848    nested cycles.
2849
2850    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2851    reductions:
2852
2853      a1 = phi < a0, a2 >
2854      inner loop (def of a3)
2855      a2 = phi < a3 >
2856
2857    (4) Detect condition expressions, ie:
2858      for (int i = 0; i < N; i++)
2859        if (a[i] < val)
2860         ret_val = a[i];
2861
2862 */
2863
2864 static stmt_vec_info
2865 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2866                           bool *double_reduc,
2867                           bool need_wrapping_integral_overflow,
2868                           enum vect_reduction_type *v_reduc_type)
2869 {
2870   gphi *phi = as_a <gphi *> (phi_info->stmt);
2871   struct loop *loop = (gimple_bb (phi))->loop_father;
2872   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2873   gimple *phi_use_stmt = NULL;
2874   enum tree_code orig_code, code;
2875   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2876   tree type;
2877   int nloop_uses;
2878   tree name;
2879   imm_use_iterator imm_iter;
2880   use_operand_p use_p;
2881   bool phi_def;
2882
2883   *double_reduc = false;
2884   *v_reduc_type = TREE_CODE_REDUCTION;
2885
2886   tree phi_name = PHI_RESULT (phi);
2887   /* ???  If there are no uses of the PHI result the inner loop reduction
2888      won't be detected as possibly double-reduction by vectorizable_reduction
2889      because that tries to walk the PHI arg from the preheader edge which
2890      can be constant.  See PR60382.  */
2891   if (has_zero_uses (phi_name))
2892     return NULL;
2893   nloop_uses = 0;
2894   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2895     {
2896       gimple *use_stmt = USE_STMT (use_p);
2897       if (is_gimple_debug (use_stmt))
2898         continue;
2899
2900       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2901         {
2902           if (dump_enabled_p ())
2903             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904                              "intermediate value used outside loop.\n");
2905
2906           return NULL;
2907         }
2908
2909       nloop_uses++;
2910       if (nloop_uses > 1)
2911         {
2912           if (dump_enabled_p ())
2913             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914                              "reduction value used in loop.\n");
2915           return NULL;
2916         }
2917
2918       phi_use_stmt = use_stmt;
2919     }
2920
2921   edge latch_e = loop_latch_edge (loop);
2922   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2923   if (TREE_CODE (loop_arg) != SSA_NAME)
2924     {
2925       if (dump_enabled_p ())
2926         {
2927           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928                            "reduction: not ssa_name: ");
2929           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2930           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2931         }
2932       return NULL;
2933     }
2934
2935   stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2936   if (!def_stmt_info)
2937     return NULL;
2938
2939   if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2940     {
2941       name = gimple_assign_lhs (def_stmt);
2942       phi_def = false;
2943     }
2944   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2945     {
2946       name = PHI_RESULT (def_stmt);
2947       phi_def = true;
2948     }
2949   else
2950     {
2951       if (dump_enabled_p ())
2952         {
2953           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954                            "reduction: unhandled reduction operation: ");
2955           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2956                             def_stmt_info->stmt, 0);
2957         }
2958       return NULL;
2959     }
2960
2961   nloop_uses = 0;
2962   auto_vec<gphi *, 3> lcphis;
2963   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2964     {
2965       gimple *use_stmt = USE_STMT (use_p);
2966       if (is_gimple_debug (use_stmt))
2967         continue;
2968       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2969         nloop_uses++;
2970       else
2971         /* We can have more than one loop-closed PHI.  */
2972         lcphis.safe_push (as_a <gphi *> (use_stmt));
2973       if (nloop_uses > 1)
2974         {
2975           if (dump_enabled_p ())
2976             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2977                              "reduction used in loop.\n");
2978           return NULL;
2979         }
2980     }
2981
2982   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2983      defined in the inner loop.  */
2984   if (phi_def)
2985     {
2986       gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2987       op1 = PHI_ARG_DEF (def_stmt, 0);
2988
2989       if (gimple_phi_num_args (def_stmt) != 1
2990           || TREE_CODE (op1) != SSA_NAME)
2991         {
2992           if (dump_enabled_p ())
2993             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2994                              "unsupported phi node definition.\n");
2995
2996           return NULL;
2997         }
2998
2999       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3000       if (gimple_bb (def1)
3001           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3002           && loop->inner
3003           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3004           && is_gimple_assign (def1)
3005           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3006         {
3007           if (dump_enabled_p ())
3008             report_vect_op (MSG_NOTE, def_stmt,
3009                             "detected double reduction: ");
3010
3011           *double_reduc = true;
3012           return def_stmt_info;
3013         }
3014
3015       return NULL;
3016     }
3017
3018   /* If we are vectorizing an inner reduction we are executing that
3019      in the original order only in case we are not dealing with a
3020      double reduction.  */
3021   bool check_reduction = true;
3022   if (flow_loop_nested_p (vect_loop, loop))
3023     {
3024       gphi *lcphi;
3025       unsigned i;
3026       check_reduction = false;
3027       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3028         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3029           {
3030             gimple *use_stmt = USE_STMT (use_p);
3031             if (is_gimple_debug (use_stmt))
3032               continue;
3033             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3034               check_reduction = true;
3035           }
3036     }
3037
3038   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3039   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3040   code = orig_code = gimple_assign_rhs_code (def_stmt);
3041
3042   /* We can handle "res -= x[i]", which is non-associative by
3043      simply rewriting this into "res += -x[i]".  Avoid changing
3044      gimple instruction for the first simple tests and only do this
3045      if we're allowed to change code at all.  */
3046   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3047     code = PLUS_EXPR;
3048
3049   if (code == COND_EXPR)
3050     {
3051       if (! nested_in_vect_loop)
3052         *v_reduc_type = COND_REDUCTION;
3053
3054       op3 = gimple_assign_rhs1 (def_stmt);
3055       if (COMPARISON_CLASS_P (op3))
3056         {
3057           op4 = TREE_OPERAND (op3, 1);
3058           op3 = TREE_OPERAND (op3, 0);
3059         }
3060       if (op3 == phi_name || op4 == phi_name)
3061         {
3062           if (dump_enabled_p ())
3063             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3064                             "reduction: condition depends on previous"
3065                             " iteration: ");
3066           return NULL;
3067         }
3068
3069       op1 = gimple_assign_rhs2 (def_stmt);
3070       op2 = gimple_assign_rhs3 (def_stmt);
3071     }
3072   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3073     {
3074       if (dump_enabled_p ())
3075         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3076                         "reduction: not commutative/associative: ");
3077       return NULL;
3078     }
3079   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3080     {
3081       op1 = gimple_assign_rhs1 (def_stmt);
3082       op2 = gimple_assign_rhs2 (def_stmt);
3083     }
3084   else
3085     {
3086       if (dump_enabled_p ())
3087         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3088                         "reduction: not handled operation: ");
3089       return NULL;
3090     }
3091
3092   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3093     {
3094       if (dump_enabled_p ())
3095         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3096                         "reduction: both uses not ssa_names: ");
3097
3098       return NULL;
3099     }
3100
3101   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3102   if ((TREE_CODE (op1) == SSA_NAME
3103        && !types_compatible_p (type,TREE_TYPE (op1)))
3104       || (TREE_CODE (op2) == SSA_NAME
3105           && !types_compatible_p (type, TREE_TYPE (op2)))
3106       || (op3 && TREE_CODE (op3) == SSA_NAME
3107           && !types_compatible_p (type, TREE_TYPE (op3)))
3108       || (op4 && TREE_CODE (op4) == SSA_NAME
3109           && !types_compatible_p (type, TREE_TYPE (op4))))
3110     {
3111       if (dump_enabled_p ())
3112         {
3113           dump_printf_loc (MSG_NOTE, vect_location,
3114                            "reduction: multiple types: operation type: ");
3115           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3116           dump_printf (MSG_NOTE, ", operands types: ");
3117           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3118                              TREE_TYPE (op1));
3119           dump_printf (MSG_NOTE, ",");
3120           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3121                              TREE_TYPE (op2));
3122           if (op3)
3123             {
3124               dump_printf (MSG_NOTE, ",");
3125               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126                                  TREE_TYPE (op3));
3127             }
3128
3129           if (op4)
3130             {
3131               dump_printf (MSG_NOTE, ",");
3132               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3133                                  TREE_TYPE (op4));
3134             }
3135           dump_printf (MSG_NOTE, "\n");
3136         }
3137
3138       return NULL;
3139     }
3140
3141   /* Check whether it's ok to change the order of the computation.
3142      Generally, when vectorizing a reduction we change the order of the
3143      computation.  This may change the behavior of the program in some
3144      cases, so we need to check that this is ok.  One exception is when
3145      vectorizing an outer-loop: the inner-loop is executed sequentially,
3146      and therefore vectorizing reductions in the inner-loop during
3147      outer-loop vectorization is safe.  */
3148   if (check_reduction
3149       && *v_reduc_type == TREE_CODE_REDUCTION
3150       && needs_fold_left_reduction_p (type, code,
3151                                       need_wrapping_integral_overflow))
3152     *v_reduc_type = FOLD_LEFT_REDUCTION;
3153
3154   /* Reduction is safe. We're dealing with one of the following:
3155      1) integer arithmetic and no trapv
3156      2) floating point arithmetic, and special flags permit this optimization
3157      3) nested cycle (i.e., outer loop vectorization).  */
3158   stmt_vec_info def1_info = loop_info->lookup_def (op1);
3159   stmt_vec_info def2_info = loop_info->lookup_def (op2);
3160   if (code != COND_EXPR && !def1_info && !def2_info)
3161     {
3162       if (dump_enabled_p ())
3163         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3164       return NULL;
3165     }
3166
3167   /* Check that one def is the reduction def, defined by PHI,
3168      the other def is either defined in the loop ("vect_internal_def"),
3169      or it's an induction (defined by a loop-header phi-node).  */
3170
3171   if (def2_info
3172       && def2_info->stmt == phi
3173       && (code == COND_EXPR
3174           || !def1_info
3175           || vect_valid_reduction_input_p (def1_info)))
3176     {
3177       if (dump_enabled_p ())
3178         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3179       return def_stmt_info;
3180     }
3181
3182   if (def1_info
3183       && def1_info->stmt == phi
3184       && (code == COND_EXPR
3185           || !def2_info
3186           || vect_valid_reduction_input_p (def2_info)))
3187     {
3188       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3189         {
3190           /* Check if we can swap operands (just for simplicity - so that
3191              the rest of the code can assume that the reduction variable
3192              is always the last (second) argument).  */
3193           if (code == COND_EXPR)
3194             {
3195               /* Swap cond_expr by inverting the condition.  */
3196               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3197               enum tree_code invert_code = ERROR_MARK;
3198               enum tree_code cond_code = TREE_CODE (cond_expr);
3199
3200               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3201                 {
3202                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3203                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3204                 }
3205               if (invert_code != ERROR_MARK)
3206                 {
3207                   TREE_SET_CODE (cond_expr, invert_code);
3208                   swap_ssa_operands (def_stmt,
3209                                      gimple_assign_rhs2_ptr (def_stmt),
3210                                      gimple_assign_rhs3_ptr (def_stmt));
3211                 }
3212               else
3213                 {
3214                   if (dump_enabled_p ())
3215                     report_vect_op (MSG_NOTE, def_stmt,
3216                                     "detected reduction: cannot swap operands "
3217                                     "for cond_expr");
3218                   return NULL;
3219                 }
3220             }
3221           else
3222             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3223                                gimple_assign_rhs2_ptr (def_stmt));
3224
3225           if (dump_enabled_p ())
3226             report_vect_op (MSG_NOTE, def_stmt,
3227                             "detected reduction: need to swap operands: ");
3228
3229           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3230             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3231         }
3232       else
3233         {
3234           if (dump_enabled_p ())
3235             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3236         }
3237
3238       return def_stmt_info;
3239     }
3240
3241   /* Try to find SLP reduction chain.  */
3242   if (! nested_in_vect_loop
3243       && code != COND_EXPR
3244       && orig_code != MINUS_EXPR
3245       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3246     {
3247       if (dump_enabled_p ())
3248         report_vect_op (MSG_NOTE, def_stmt,
3249                         "reduction: detected reduction chain: ");
3250
3251       return def_stmt_info;
3252     }
3253
3254   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3255   stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3256   while (first)
3257     {
3258       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3259       REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3260       REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3261       first = next;
3262     }
3263
3264   /* Look for the expression computing loop_arg from loop PHI result.  */
3265   if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3266     return def_stmt_info;
3267
3268   if (dump_enabled_p ())
3269     {
3270       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3271                       "reduction: unknown pattern: ");
3272     }
3273
3274   return NULL;
3275 }
3276
3277 /* Wrapper around vect_is_simple_reduction, which will modify code
3278    in-place if it enables detection of more reductions.  Arguments
3279    as there.  */
3280
3281 stmt_vec_info
3282 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3283                              bool *double_reduc,
3284                              bool need_wrapping_integral_overflow)
3285 {
3286   enum vect_reduction_type v_reduc_type;
3287   stmt_vec_info def_info
3288     = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3289                                 need_wrapping_integral_overflow,
3290                                 &v_reduc_type);
3291   if (def_info)
3292     {
3293       STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3294       STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3295       STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3296       STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3297     }
3298   return def_info;
3299 }
3300
3301 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3302 int
3303 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3304                              int *peel_iters_epilogue,
3305                              stmt_vector_for_cost *scalar_cost_vec,
3306                              stmt_vector_for_cost *prologue_cost_vec,
3307                              stmt_vector_for_cost *epilogue_cost_vec)
3308 {
3309   int retval = 0;
3310   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3311
3312   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3313     {
3314       *peel_iters_epilogue = assumed_vf / 2;
3315       if (dump_enabled_p ())
3316         dump_printf_loc (MSG_NOTE, vect_location,
3317                          "cost model: epilogue peel iters set to vf/2 "
3318                          "because loop iterations are unknown .\n");
3319
3320       /* If peeled iterations are known but number of scalar loop
3321          iterations are unknown, count a taken branch per peeled loop.  */
3322       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3323                                  NULL, 0, vect_prologue);
3324       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3325                                  NULL, 0, vect_epilogue);
3326     }
3327   else
3328     {
3329       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3330       peel_iters_prologue = niters < peel_iters_prologue ?
3331                             niters : peel_iters_prologue;
3332       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3333       /* If we need to peel for gaps, but no peeling is required, we have to
3334          peel VF iterations.  */
3335       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3336         *peel_iters_epilogue = assumed_vf;
3337     }
3338
3339   stmt_info_for_cost *si;
3340   int j;
3341   if (peel_iters_prologue)
3342     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343       retval += record_stmt_cost (prologue_cost_vec,
3344                                   si->count * peel_iters_prologue,
3345                                   si->kind, si->stmt_info, si->misalign,
3346                                   vect_prologue);
3347   if (*peel_iters_epilogue)
3348     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3349       retval += record_stmt_cost (epilogue_cost_vec,
3350                                   si->count * *peel_iters_epilogue,
3351                                   si->kind, si->stmt_info, si->misalign,
3352                                   vect_epilogue);
3353
3354   return retval;
3355 }
3356
3357 /* Function vect_estimate_min_profitable_iters
3358
3359    Return the number of iterations required for the vector version of the
3360    loop to be profitable relative to the cost of the scalar version of the
3361    loop.
3362
3363    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3364    of iterations for vectorization.  -1 value means loop vectorization
3365    is not profitable.  This returned value may be used for dynamic
3366    profitability check.
3367
3368    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3369    for static check against estimated number of iterations.  */
3370
3371 static void
3372 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3373                                     int *ret_min_profitable_niters,
3374                                     int *ret_min_profitable_estimate)
3375 {
3376   int min_profitable_iters;
3377   int min_profitable_estimate;
3378   int peel_iters_prologue;
3379   int peel_iters_epilogue;
3380   unsigned vec_inside_cost = 0;
3381   int vec_outside_cost = 0;
3382   unsigned vec_prologue_cost = 0;
3383   unsigned vec_epilogue_cost = 0;
3384   int scalar_single_iter_cost = 0;
3385   int scalar_outside_cost = 0;
3386   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3387   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3388   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3389
3390   /* Cost model disabled.  */
3391   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3392     {
3393       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3394       *ret_min_profitable_niters = 0;
3395       *ret_min_profitable_estimate = 0;
3396       return;
3397     }
3398
3399   /* Requires loop versioning tests to handle misalignment.  */
3400   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3401     {
3402       /*  FIXME: Make cost depend on complexity of individual check.  */
3403       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3404       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3405                             vect_prologue);
3406       dump_printf (MSG_NOTE,
3407                    "cost model: Adding cost of checks for loop "
3408                    "versioning to treat misalignment.\n");
3409     }
3410
3411   /* Requires loop versioning with alias checks.  */
3412   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3413     {
3414       /*  FIXME: Make cost depend on complexity of individual check.  */
3415       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3416       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3417                             vect_prologue);
3418       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3419       if (len)
3420         /* Count LEN - 1 ANDs and LEN comparisons.  */
3421         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3422                               NULL, 0, vect_prologue);
3423       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3424       if (len)
3425         {
3426           /* Count LEN - 1 ANDs and LEN comparisons.  */
3427           unsigned int nstmts = len * 2 - 1;
3428           /* +1 for each bias that needs adding.  */
3429           for (unsigned int i = 0; i < len; ++i)
3430             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3431               nstmts += 1;
3432           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3433                                 NULL, 0, vect_prologue);
3434         }
3435       dump_printf (MSG_NOTE,
3436                    "cost model: Adding cost of checks for loop "
3437                    "versioning aliasing.\n");
3438     }
3439
3440   /* Requires loop versioning with niter checks.  */
3441   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3442     {
3443       /*  FIXME: Make cost depend on complexity of individual check.  */
3444       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3445                             vect_prologue);
3446       dump_printf (MSG_NOTE,
3447                    "cost model: Adding cost of checks for loop "
3448                    "versioning niters.\n");
3449     }
3450
3451   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3452     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3453                           vect_prologue);
3454
3455   /* Count statements in scalar loop.  Using this as scalar cost for a single
3456      iteration for now.
3457
3458      TODO: Add outer loop support.
3459
3460      TODO: Consider assigning different costs to different scalar
3461      statements.  */
3462
3463   scalar_single_iter_cost
3464     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3465
3466   /* Add additional cost for the peeled instructions in prologue and epilogue
3467      loop.  (For fully-masked loops there will be no peeling.)
3468
3469      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3470      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3471
3472      TODO: Build an expression that represents peel_iters for prologue and
3473      epilogue to be used in a run-time test.  */
3474
3475   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3476     {
3477       peel_iters_prologue = 0;
3478       peel_iters_epilogue = 0;
3479
3480       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3481         {
3482           /* We need to peel exactly one iteration.  */
3483           peel_iters_epilogue += 1;
3484           stmt_info_for_cost *si;
3485           int j;
3486           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3487                             j, si)
3488             (void) add_stmt_cost (target_cost_data, si->count,
3489                                   si->kind, si->stmt_info, si->misalign,
3490                                   vect_epilogue);
3491         }
3492     }
3493   else if (npeel < 0)
3494     {
3495       peel_iters_prologue = assumed_vf / 2;
3496       dump_printf (MSG_NOTE, "cost model: "
3497                    "prologue peel iters set to vf/2.\n");
3498
3499       /* If peeling for alignment is unknown, loop bound of main loop becomes
3500          unknown.  */
3501       peel_iters_epilogue = assumed_vf / 2;
3502       dump_printf (MSG_NOTE, "cost model: "
3503                    "epilogue peel iters set to vf/2 because "
3504                    "peeling for alignment is unknown.\n");
3505
3506       /* If peeled iterations are unknown, count a taken branch and a not taken
3507          branch per peeled loop. Even if scalar loop iterations are known,
3508          vector iterations are not known since peeled prologue iterations are
3509          not known. Hence guards remain the same.  */
3510       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3511                             NULL, 0, vect_prologue);
3512       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3513                             NULL, 0, vect_prologue);
3514       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3515                             NULL, 0, vect_epilogue);
3516       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3517                             NULL, 0, vect_epilogue);
3518       stmt_info_for_cost *si;
3519       int j;
3520       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3521         {
3522           (void) add_stmt_cost (target_cost_data,
3523                                 si->count * peel_iters_prologue,
3524                                 si->kind, si->stmt_info, si->misalign,
3525                                 vect_prologue);
3526           (void) add_stmt_cost (target_cost_data,
3527                                 si->count * peel_iters_epilogue,
3528                                 si->kind, si->stmt_info, si->misalign,
3529                                 vect_epilogue);
3530         }
3531     }
3532   else
3533     {
3534       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3535       stmt_info_for_cost *si;
3536       int j;
3537       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3538
3539       prologue_cost_vec.create (2);
3540       epilogue_cost_vec.create (2);
3541       peel_iters_prologue = npeel;
3542
3543       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3544                                           &peel_iters_epilogue,
3545                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3546                                             (loop_vinfo),
3547                                           &prologue_cost_vec,
3548                                           &epilogue_cost_vec);
3549
3550       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3551         (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3552                               si->misalign, vect_prologue);
3553
3554       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3555         (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3556                               si->misalign, vect_epilogue);
3557
3558       prologue_cost_vec.release ();
3559       epilogue_cost_vec.release ();
3560     }
3561
3562   /* FORNOW: The scalar outside cost is incremented in one of the
3563      following ways:
3564
3565      1. The vectorizer checks for alignment and aliasing and generates
3566      a condition that allows dynamic vectorization.  A cost model
3567      check is ANDED with the versioning condition.  Hence scalar code
3568      path now has the added cost of the versioning check.
3569
3570        if (cost > th & versioning_check)
3571          jmp to vector code
3572
3573      Hence run-time scalar is incremented by not-taken branch cost.
3574
3575      2. The vectorizer then checks if a prologue is required.  If the
3576      cost model check was not done before during versioning, it has to
3577      be done before the prologue check.
3578
3579        if (cost <= th)
3580          prologue = scalar_iters
3581        if (prologue == 0)
3582          jmp to vector code
3583        else
3584          execute prologue
3585        if (prologue == num_iters)
3586          go to exit
3587
3588      Hence the run-time scalar cost is incremented by a taken branch,
3589      plus a not-taken branch, plus a taken branch cost.
3590
3591      3. The vectorizer then checks if an epilogue is required.  If the
3592      cost model check was not done before during prologue check, it
3593      has to be done with the epilogue check.
3594
3595        if (prologue == 0)
3596          jmp to vector code
3597        else
3598          execute prologue
3599        if (prologue == num_iters)
3600          go to exit
3601        vector code:
3602          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3603            jmp to epilogue
3604
3605      Hence the run-time scalar cost should be incremented by 2 taken
3606      branches.
3607
3608      TODO: The back end may reorder the BBS's differently and reverse
3609      conditions/branch directions.  Change the estimates below to
3610      something more reasonable.  */
3611
3612   /* If the number of iterations is known and we do not do versioning, we can
3613      decide whether to vectorize at compile time.  Hence the scalar version
3614      do not carry cost model guard costs.  */
3615   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3616       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3617     {
3618       /* Cost model check occurs at versioning.  */
3619       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3620         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3621       else
3622         {
3623           /* Cost model check occurs at prologue generation.  */
3624           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3625             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3626               + vect_get_stmt_cost (cond_branch_not_taken);
3627           /* Cost model check occurs at epilogue generation.  */
3628           else
3629             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3630         }
3631     }
3632
3633   /* Complete the target-specific cost calculations.  */
3634   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3635                &vec_inside_cost, &vec_epilogue_cost);
3636
3637   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3638
3639   if (dump_enabled_p ())
3640     {
3641       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3642       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3643                    vec_inside_cost);
3644       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3645                    vec_prologue_cost);
3646       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3647                    vec_epilogue_cost);
3648       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3649                    scalar_single_iter_cost);
3650       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3651                    scalar_outside_cost);
3652       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3653                    vec_outside_cost);
3654       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3655                    peel_iters_prologue);
3656       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3657                    peel_iters_epilogue);
3658     }
3659
3660   /* Calculate number of iterations required to make the vector version
3661      profitable, relative to the loop bodies only.  The following condition
3662      must hold true:
3663      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3664      where
3665      SIC = scalar iteration cost, VIC = vector iteration cost,
3666      VOC = vector outside cost, VF = vectorization factor,
3667      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3668      SOC = scalar outside cost for run time cost model check.  */
3669
3670   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3671     {
3672       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3673                               * assumed_vf
3674                               - vec_inside_cost * peel_iters_prologue
3675                               - vec_inside_cost * peel_iters_epilogue);
3676       if (min_profitable_iters <= 0)
3677         min_profitable_iters = 0;
3678       else
3679         {
3680           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3681                                    - vec_inside_cost);
3682
3683           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3684               <= (((int) vec_inside_cost * min_profitable_iters)
3685                   + (((int) vec_outside_cost - scalar_outside_cost)
3686                      * assumed_vf)))
3687             min_profitable_iters++;
3688         }
3689     }
3690   /* vector version will never be profitable.  */
3691   else
3692     {
3693       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3694         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3695                     "vectorization did not happen for a simd loop");
3696
3697       if (dump_enabled_p ())
3698         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699                          "cost model: the vector iteration cost = %d "
3700                          "divided by the scalar iteration cost = %d "
3701                          "is greater or equal to the vectorization factor = %d"
3702                          ".\n",
3703                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3704       *ret_min_profitable_niters = -1;
3705       *ret_min_profitable_estimate = -1;
3706       return;
3707     }
3708
3709   dump_printf (MSG_NOTE,
3710                "  Calculated minimum iters for profitability: %d\n",
3711                min_profitable_iters);
3712
3713   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3714       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3715     /* We want the vectorized loop to execute at least once.  */
3716     min_profitable_iters = assumed_vf + peel_iters_prologue;
3717
3718   if (dump_enabled_p ())
3719     dump_printf_loc (MSG_NOTE, vect_location,
3720                      "  Runtime profitability threshold = %d\n",
3721                      min_profitable_iters);
3722
3723   *ret_min_profitable_niters = min_profitable_iters;
3724
3725   /* Calculate number of iterations required to make the vector version
3726      profitable, relative to the loop bodies only.
3727
3728      Non-vectorized variant is SIC * niters and it must win over vector
3729      variant on the expected loop trip count.  The following condition must hold true:
3730      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3731
3732   if (vec_outside_cost <= 0)
3733     min_profitable_estimate = 0;
3734   else
3735     {
3736       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3737                                  * assumed_vf
3738                                  - vec_inside_cost * peel_iters_prologue
3739                                  - vec_inside_cost * peel_iters_epilogue)
3740                                  / ((scalar_single_iter_cost * assumed_vf)
3741                                    - vec_inside_cost);
3742     }
3743   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3744   if (dump_enabled_p ())
3745     dump_printf_loc (MSG_NOTE, vect_location,
3746                      "  Static estimate profitability threshold = %d\n",
3747                      min_profitable_estimate);
3748
3749   *ret_min_profitable_estimate = min_profitable_estimate;
3750 }
3751
3752 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3753    vector elements (not bits) for a vector with NELT elements.  */
3754 static void
3755 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3756                               vec_perm_builder *sel)
3757 {
3758   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3759      by vec_perm_indices.  */
3760   sel->new_vector (nelt, 1, 3);
3761   for (unsigned int i = 0; i < 3; i++)
3762     sel->quick_push (i + offset);
3763 }
3764
3765 /* Checks whether the target supports whole-vector shifts for vectors of mode
3766    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3767    it supports vec_perm_const with masks for all necessary shift amounts.  */
3768 static bool
3769 have_whole_vector_shift (machine_mode mode)
3770 {
3771   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3772     return true;
3773
3774   /* Variable-length vectors should be handled via the optab.  */
3775   unsigned int nelt;
3776   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3777     return false;
3778
3779   vec_perm_builder sel;
3780   vec_perm_indices indices;
3781   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3782     {
3783       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3784       indices.new_vector (sel, 2, nelt);
3785       if (!can_vec_perm_const_p (mode, indices, false))
3786         return false;
3787     }
3788   return true;
3789 }
3790
3791 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3792    functions. Design better to avoid maintenance issues.  */
3793
3794 /* Function vect_model_reduction_cost.
3795
3796    Models cost for a reduction operation, including the vector ops
3797    generated within the strip-mine loop, the initial definition before
3798    the loop, and the epilogue code that must be generated.  */
3799
3800 static void
3801 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3802                            int ncopies, stmt_vector_for_cost *cost_vec)
3803 {
3804   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3805   enum tree_code code;
3806   optab optab;
3807   tree vectype;
3808   machine_mode mode;
3809   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3810   struct loop *loop = NULL;
3811
3812   if (loop_vinfo)
3813     loop = LOOP_VINFO_LOOP (loop_vinfo);
3814
3815   /* Condition reductions generate two reductions in the loop.  */
3816   vect_reduction_type reduction_type
3817     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3818   if (reduction_type == COND_REDUCTION)
3819     ncopies *= 2;
3820
3821   vectype = STMT_VINFO_VECTYPE (stmt_info);
3822   mode = TYPE_MODE (vectype);
3823   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3824
3825   if (!orig_stmt_info)
3826     orig_stmt_info = stmt_info;
3827
3828   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3829
3830   if (reduction_type == EXTRACT_LAST_REDUCTION
3831       || reduction_type == FOLD_LEFT_REDUCTION)
3832     {
3833       /* No extra instructions needed in the prologue.  */
3834       prologue_cost = 0;
3835
3836       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3837         /* Count one reduction-like operation per vector.  */
3838         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3839                                         stmt_info, 0, vect_body);
3840       else
3841         {
3842           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3843           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3844           inside_cost = record_stmt_cost (cost_vec, nelements,
3845                                           vec_to_scalar, stmt_info, 0,
3846                                           vect_body);
3847           inside_cost += record_stmt_cost (cost_vec, nelements,
3848                                            scalar_stmt, stmt_info, 0,
3849                                            vect_body);
3850         }
3851     }
3852   else
3853     {
3854       /* Add in cost for initial definition.
3855          For cond reduction we have four vectors: initial index, step,
3856          initial result of the data reduction, initial value of the index
3857          reduction.  */
3858       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3859       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3860                                          scalar_to_vec, stmt_info, 0,
3861                                          vect_prologue);
3862
3863       /* Cost of reduction op inside loop.  */
3864       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3865                                       stmt_info, 0, vect_body);
3866     }
3867
3868   /* Determine cost of epilogue code.
3869
3870      We have a reduction operator that will reduce the vector in one statement.
3871      Also requires scalar extract.  */
3872
3873   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3874     {
3875       if (reduc_fn != IFN_LAST)
3876         {
3877           if (reduction_type == COND_REDUCTION)
3878             {
3879               /* An EQ stmt and an COND_EXPR stmt.  */
3880               epilogue_cost += record_stmt_cost (cost_vec, 2,
3881                                                  vector_stmt, stmt_info, 0,
3882                                                  vect_epilogue);
3883               /* Reduction of the max index and a reduction of the found
3884                  values.  */
3885               epilogue_cost += record_stmt_cost (cost_vec, 2,
3886                                                  vec_to_scalar, stmt_info, 0,
3887                                                  vect_epilogue);
3888               /* A broadcast of the max value.  */
3889               epilogue_cost += record_stmt_cost (cost_vec, 1,
3890                                                  scalar_to_vec, stmt_info, 0,
3891                                                  vect_epilogue);
3892             }
3893           else
3894             {
3895               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3896                                                  stmt_info, 0, vect_epilogue);
3897               epilogue_cost += record_stmt_cost (cost_vec, 1,
3898                                                  vec_to_scalar, stmt_info, 0,
3899                                                  vect_epilogue);
3900             }
3901         }
3902       else if (reduction_type == COND_REDUCTION)
3903         {
3904           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3905           /* Extraction of scalar elements.  */
3906           epilogue_cost += record_stmt_cost (cost_vec,
3907                                              2 * estimated_nunits,
3908                                              vec_to_scalar, stmt_info, 0,
3909                                              vect_epilogue);
3910           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3911           epilogue_cost += record_stmt_cost (cost_vec,
3912                                              2 * estimated_nunits - 3,
3913                                              scalar_stmt, stmt_info, 0,
3914                                              vect_epilogue);
3915         }
3916       else if (reduction_type == EXTRACT_LAST_REDUCTION
3917                || reduction_type == FOLD_LEFT_REDUCTION)
3918         /* No extra instructions need in the epilogue.  */
3919         ;
3920       else
3921         {
3922           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3923           tree bitsize =
3924             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3925           int element_bitsize = tree_to_uhwi (bitsize);
3926           int nelements = vec_size_in_bits / element_bitsize;
3927
3928           if (code == COND_EXPR)
3929             code = MAX_EXPR;
3930
3931           optab = optab_for_tree_code (code, vectype, optab_default);
3932
3933           /* We have a whole vector shift available.  */
3934           if (optab != unknown_optab
3935               && VECTOR_MODE_P (mode)
3936               && optab_handler (optab, mode) != CODE_FOR_nothing
3937               && have_whole_vector_shift (mode))
3938             {
3939               /* Final reduction via vector shifts and the reduction operator.
3940                  Also requires scalar extract.  */
3941               epilogue_cost += record_stmt_cost (cost_vec,
3942                                                  exact_log2 (nelements) * 2,
3943                                                  vector_stmt, stmt_info, 0,
3944                                                  vect_epilogue);
3945               epilogue_cost += record_stmt_cost (cost_vec, 1,
3946                                                  vec_to_scalar, stmt_info, 0,
3947                                                  vect_epilogue);
3948             }
3949           else
3950             /* Use extracts and reduction op for final reduction.  For N
3951                elements, we have N extracts and N-1 reduction ops.  */
3952             epilogue_cost += record_stmt_cost (cost_vec,
3953                                                nelements + nelements - 1,
3954                                                vector_stmt, stmt_info, 0,
3955                                                vect_epilogue);
3956         }
3957     }
3958
3959   if (dump_enabled_p ())
3960     dump_printf (MSG_NOTE,
3961                  "vect_model_reduction_cost: inside_cost = %d, "
3962                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3963                  prologue_cost, epilogue_cost);
3964 }
3965
3966
3967 /* Function vect_model_induction_cost.
3968
3969    Models cost for induction operations.  */
3970
3971 static void
3972 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3973                            stmt_vector_for_cost *cost_vec)
3974 {
3975   unsigned inside_cost, prologue_cost;
3976
3977   if (PURE_SLP_STMT (stmt_info))
3978     return;
3979
3980   /* loop cost for vec_loop.  */
3981   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3982                                   stmt_info, 0, vect_body);
3983
3984   /* prologue cost for vec_init and vec_step.  */
3985   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3986                                     stmt_info, 0, vect_prologue);
3987
3988   if (dump_enabled_p ())
3989     dump_printf_loc (MSG_NOTE, vect_location,
3990                      "vect_model_induction_cost: inside_cost = %d, "
3991                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3992 }
3993
3994
3995
3996 /* Function get_initial_def_for_reduction
3997
3998    Input:
3999    STMT - a stmt that performs a reduction operation in the loop.
4000    INIT_VAL - the initial value of the reduction variable
4001
4002    Output:
4003    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4004         of the reduction (used for adjusting the epilog - see below).
4005    Return a vector variable, initialized according to the operation that STMT
4006         performs. This vector will be used as the initial value of the
4007         vector of partial results.
4008
4009    Option1 (adjust in epilog): Initialize the vector as follows:
4010      add/bit or/xor:    [0,0,...,0,0]
4011      mult/bit and:      [1,1,...,1,1]
4012      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4013    and when necessary (e.g. add/mult case) let the caller know
4014    that it needs to adjust the result by init_val.
4015
4016    Option2: Initialize the vector as follows:
4017      add/bit or/xor:    [init_val,0,0,...,0]
4018      mult/bit and:      [init_val,1,1,...,1]
4019      min/max/cond_expr: [init_val,init_val,...,init_val]
4020    and no adjustments are needed.
4021
4022    For example, for the following code:
4023
4024    s = init_val;
4025    for (i=0;i<n;i++)
4026      s = s + a[i];
4027
4028    STMT is 's = s + a[i]', and the reduction variable is 's'.
4029    For a vector of 4 units, we want to return either [0,0,0,init_val],
4030    or [0,0,0,0] and let the caller know that it needs to adjust
4031    the result at the end by 'init_val'.
4032
4033    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4034    initialization vector is simpler (same element in all entries), if
4035    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4036
4037    A cost model should help decide between these two schemes.  */
4038
4039 tree
4040 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4041                                tree *adjustment_def)
4042 {
4043   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4044   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4045   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4046   tree scalar_type = TREE_TYPE (init_val);
4047   tree vectype = get_vectype_for_scalar_type (scalar_type);
4048   enum tree_code code = gimple_assign_rhs_code (stmt);
4049   tree def_for_init;
4050   tree init_def;
4051   REAL_VALUE_TYPE real_init_val = dconst0;
4052   int int_init_val = 0;
4053   gimple_seq stmts = NULL;
4054
4055   gcc_assert (vectype);
4056
4057   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4058               || SCALAR_FLOAT_TYPE_P (scalar_type));
4059
4060   gcc_assert (nested_in_vect_loop_p (loop, stmt)
4061               || loop == (gimple_bb (stmt))->loop_father);
4062
4063   vect_reduction_type reduction_type
4064     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4065
4066   switch (code)
4067     {
4068     case WIDEN_SUM_EXPR:
4069     case DOT_PROD_EXPR:
4070     case SAD_EXPR:
4071     case PLUS_EXPR:
4072     case MINUS_EXPR:
4073     case BIT_IOR_EXPR:
4074     case BIT_XOR_EXPR:
4075     case MULT_EXPR:
4076     case BIT_AND_EXPR:
4077       {
4078         /* ADJUSTMENT_DEF is NULL when called from
4079            vect_create_epilog_for_reduction to vectorize double reduction.  */
4080         if (adjustment_def)
4081           *adjustment_def = init_val;
4082
4083         if (code == MULT_EXPR)
4084           {
4085             real_init_val = dconst1;
4086             int_init_val = 1;
4087           }
4088
4089         if (code == BIT_AND_EXPR)
4090           int_init_val = -1;
4091
4092         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4093           def_for_init = build_real (scalar_type, real_init_val);
4094         else
4095           def_for_init = build_int_cst (scalar_type, int_init_val);
4096
4097         if (adjustment_def)
4098           /* Option1: the first element is '0' or '1' as well.  */
4099           init_def = gimple_build_vector_from_val (&stmts, vectype,
4100                                                    def_for_init);
4101         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4102           {
4103             /* Option2 (variable length): the first element is INIT_VAL.  */
4104             init_def = gimple_build_vector_from_val (&stmts, vectype,
4105                                                      def_for_init);
4106             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4107                                      vectype, init_def, init_val);
4108           }
4109         else
4110           {
4111             /* Option2: the first element is INIT_VAL.  */
4112             tree_vector_builder elts (vectype, 1, 2);
4113             elts.quick_push (init_val);
4114             elts.quick_push (def_for_init);
4115             init_def = gimple_build_vector (&stmts, &elts);
4116           }
4117       }
4118       break;
4119
4120     case MIN_EXPR:
4121     case MAX_EXPR:
4122     case COND_EXPR:
4123       {
4124         if (adjustment_def)
4125           {
4126             *adjustment_def = NULL_TREE;
4127             if (reduction_type != COND_REDUCTION
4128                 && reduction_type != EXTRACT_LAST_REDUCTION)
4129               {
4130                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4131                 break;
4132               }
4133           }
4134         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4135         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4136       }
4137       break;
4138
4139     default:
4140       gcc_unreachable ();
4141     }
4142
4143   if (stmts)
4144     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4145   return init_def;
4146 }
4147
4148 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4149    NUMBER_OF_VECTORS is the number of vector defs to create.
4150    If NEUTRAL_OP is nonnull, introducing extra elements of that
4151    value will not change the result.  */
4152
4153 static void
4154 get_initial_defs_for_reduction (slp_tree slp_node,
4155                                 vec<tree> *vec_oprnds,
4156                                 unsigned int number_of_vectors,
4157                                 bool reduc_chain, tree neutral_op)
4158 {
4159   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4160   stmt_vec_info stmt_vinfo = stmts[0];
4161   unsigned HOST_WIDE_INT nunits;
4162   unsigned j, number_of_places_left_in_vector;
4163   tree vector_type;
4164   tree vop;
4165   int group_size = stmts.length ();
4166   unsigned int vec_num, i;
4167   unsigned number_of_copies = 1;
4168   vec<tree> voprnds;
4169   voprnds.create (number_of_vectors);
4170   struct loop *loop;
4171   auto_vec<tree, 16> permute_results;
4172
4173   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4174
4175   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4176
4177   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4178   gcc_assert (loop);
4179   edge pe = loop_preheader_edge (loop);
4180
4181   gcc_assert (!reduc_chain || neutral_op);
4182
4183   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4184      created vectors. It is greater than 1 if unrolling is performed.
4185
4186      For example, we have two scalar operands, s1 and s2 (e.g., group of
4187      strided accesses of size two), while NUNITS is four (i.e., four scalars
4188      of this type can be packed in a vector).  The output vector will contain
4189      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4190      will be 2).
4191
4192      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4193      vectors containing the operands.
4194
4195      For example, NUNITS is four as before, and the group size is 8
4196      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4197      {s5, s6, s7, s8}.  */
4198
4199   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4200     nunits = group_size;
4201
4202   number_of_copies = nunits * number_of_vectors / group_size;
4203
4204   number_of_places_left_in_vector = nunits;
4205   bool constant_p = true;
4206   tree_vector_builder elts (vector_type, nunits, 1);
4207   elts.quick_grow (nunits);
4208   for (j = 0; j < number_of_copies; j++)
4209     {
4210       for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4211         {
4212           tree op;
4213           /* Get the def before the loop.  In reduction chain we have only
4214              one initial value.  */
4215           if ((j != (number_of_copies - 1)
4216                || (reduc_chain && i != 0))
4217               && neutral_op)
4218             op = neutral_op;
4219           else
4220             op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4221
4222           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4223           number_of_places_left_in_vector--;
4224           elts[number_of_places_left_in_vector] = op;
4225           if (!CONSTANT_CLASS_P (op))
4226             constant_p = false;
4227
4228           if (number_of_places_left_in_vector == 0)
4229             {
4230               gimple_seq ctor_seq = NULL;
4231               tree init;
4232               if (constant_p && !neutral_op
4233                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4234                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4235                 /* Build the vector directly from ELTS.  */
4236                 init = gimple_build_vector (&ctor_seq, &elts);
4237               else if (neutral_op)
4238                 {
4239                   /* Build a vector of the neutral value and shift the
4240                      other elements into place.  */
4241                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4242                                                        neutral_op);
4243                   int k = nunits;
4244                   while (k > 0 && elts[k - 1] == neutral_op)
4245                     k -= 1;
4246                   while (k > 0)
4247                     {
4248                       k -= 1;
4249                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4250                                            vector_type, init, elts[k]);
4251                     }
4252                 }
4253               else
4254                 {
4255                   /* First time round, duplicate ELTS to fill the
4256                      required number of vectors, then cherry pick the
4257                      appropriate result for each iteration.  */
4258                   if (vec_oprnds->is_empty ())
4259                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4260                                               number_of_vectors,
4261                                               permute_results);
4262                   init = permute_results[number_of_vectors - j - 1];
4263                 }
4264               if (ctor_seq != NULL)
4265                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4266               voprnds.quick_push (init);
4267
4268               number_of_places_left_in_vector = nunits;
4269               elts.new_vector (vector_type, nunits, 1);
4270               elts.quick_grow (nunits);
4271               constant_p = true;
4272             }
4273         }
4274     }
4275
4276   /* Since the vectors are created in the reverse order, we should invert
4277      them.  */
4278   vec_num = voprnds.length ();
4279   for (j = vec_num; j != 0; j--)
4280     {
4281       vop = voprnds[j - 1];
4282       vec_oprnds->quick_push (vop);
4283     }
4284
4285   voprnds.release ();
4286
4287   /* In case that VF is greater than the unrolling factor needed for the SLP
4288      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4289      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4290      to replicate the vectors.  */
4291   tree neutral_vec = NULL;
4292   while (number_of_vectors > vec_oprnds->length ())
4293     {
4294       if (neutral_op)
4295         {
4296           if (!neutral_vec)
4297             {
4298               gimple_seq ctor_seq = NULL;
4299               neutral_vec = gimple_build_vector_from_val
4300                 (&ctor_seq, vector_type, neutral_op);
4301               if (ctor_seq != NULL)
4302                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4303             }
4304           vec_oprnds->quick_push (neutral_vec);
4305         }
4306       else
4307         {
4308           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4309             vec_oprnds->quick_push (vop);
4310         }
4311     }
4312 }
4313
4314
4315 /* Function vect_create_epilog_for_reduction
4316
4317    Create code at the loop-epilog to finalize the result of a reduction
4318    computation.
4319
4320    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4321      reduction statements.
4322    STMT is the scalar reduction stmt that is being vectorized.
4323    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4324      number of elements that we can fit in a vectype (nunits).  In this case
4325      we have to generate more than one vector stmt - i.e - we need to "unroll"
4326      the vector stmt by a factor VF/nunits.  For more details see documentation
4327      in vectorizable_operation.
4328    REDUC_FN is the internal function for the epilog reduction.
4329    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4330      computation.
4331    REDUC_INDEX is the index of the operand in the right hand side of the
4332      statement that is defined by REDUCTION_PHI.
4333    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4334    SLP_NODE is an SLP node containing a group of reduction statements. The
4335      first one in this group is STMT.
4336    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4337      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4338      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4339      any value of the IV in the loop.
4340    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4341    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4342      null if this is not an SLP reduction
4343
4344    This function:
4345    1. Creates the reduction def-use cycles: sets the arguments for
4346       REDUCTION_PHIS:
4347       The loop-entry argument is the vectorized initial-value of the reduction.
4348       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4349       sums.
4350    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4351       by calling the function specified by REDUC_FN if available, or by
4352       other means (whole-vector shifts or a scalar loop).
4353       The function also creates a new phi node at the loop exit to preserve
4354       loop-closed form, as illustrated below.
4355
4356      The flow at the entry to this function:
4357
4358         loop:
4359           vec_def = phi <null, null>            # REDUCTION_PHI
4360           VECT_DEF = vector_stmt                # vectorized form of STMT
4361           s_loop = scalar_stmt                  # (scalar) STMT
4362         loop_exit:
4363           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4364           use <s_out0>
4365           use <s_out0>
4366
4367      The above is transformed by this function into:
4368
4369         loop:
4370           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4371           VECT_DEF = vector_stmt                # vectorized form of STMT
4372           s_loop = scalar_stmt                  # (scalar) STMT
4373         loop_exit:
4374           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4375           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4376           v_out2 = reduce <v_out1>
4377           s_out3 = extract_field <v_out2, 0>
4378           s_out4 = adjust_result <s_out3>
4379           use <s_out4>
4380           use <s_out4>
4381 */
4382
4383 static void
4384 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4385                                   gimple *reduc_def_stmt,
4386                                   int ncopies, internal_fn reduc_fn,
4387                                   vec<stmt_vec_info> reduction_phis,
4388                                   bool double_reduc,
4389                                   slp_tree slp_node,
4390                                   slp_instance slp_node_instance,
4391                                   tree induc_val, enum tree_code induc_code,
4392                                   tree neutral_op)
4393 {
4394   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4395   stmt_vec_info prev_phi_info;
4396   tree vectype;
4397   machine_mode mode;
4398   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4399   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4400   basic_block exit_bb;
4401   tree scalar_dest;
4402   tree scalar_type;
4403   gimple *new_phi = NULL, *phi;
4404   stmt_vec_info phi_info;
4405   gimple_stmt_iterator exit_gsi;
4406   tree vec_dest;
4407   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4408   gimple *epilog_stmt = NULL;
4409   enum tree_code code = gimple_assign_rhs_code (stmt);
4410   gimple *exit_phi;
4411   tree bitsize;
4412   tree adjustment_def = NULL;
4413   tree vec_initial_def = NULL;
4414   tree expr, def, initial_def = NULL;
4415   tree orig_name, scalar_result;
4416   imm_use_iterator imm_iter, phi_imm_iter;
4417   use_operand_p use_p, phi_use_p;
4418   gimple *use_stmt;
4419   stmt_vec_info reduction_phi_info = NULL;
4420   bool nested_in_vect_loop = false;
4421   auto_vec<gimple *> new_phis;
4422   auto_vec<stmt_vec_info> inner_phis;
4423   enum vect_def_type dt = vect_unknown_def_type;
4424   int j, i;
4425   auto_vec<tree> scalar_results;
4426   unsigned int group_size = 1, k, ratio;
4427   auto_vec<tree> vec_initial_defs;
4428   auto_vec<gimple *> phis;
4429   bool slp_reduc = false;
4430   bool direct_slp_reduc;
4431   tree new_phi_result;
4432   stmt_vec_info inner_phi = NULL;
4433   tree induction_index = NULL_TREE;
4434
4435   if (slp_node)
4436     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4437
4438   if (nested_in_vect_loop_p (loop, stmt))
4439     {
4440       outer_loop = loop;
4441       loop = loop->inner;
4442       nested_in_vect_loop = true;
4443       gcc_assert (!slp_node);
4444     }
4445
4446   vectype = STMT_VINFO_VECTYPE (stmt_info);
4447   gcc_assert (vectype);
4448   mode = TYPE_MODE (vectype);
4449
4450   /* 1. Create the reduction def-use cycle:
4451      Set the arguments of REDUCTION_PHIS, i.e., transform
4452
4453         loop:
4454           vec_def = phi <null, null>            # REDUCTION_PHI
4455           VECT_DEF = vector_stmt                # vectorized form of STMT
4456           ...
4457
4458      into:
4459
4460         loop:
4461           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4462           VECT_DEF = vector_stmt                # vectorized form of STMT
4463           ...
4464
4465      (in case of SLP, do it for all the phis). */
4466
4467   /* Get the loop-entry arguments.  */
4468   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4469   if (slp_node)
4470     {
4471       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4472       vec_initial_defs.reserve (vec_num);
4473       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4474                                       &vec_initial_defs, vec_num,
4475                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4476                                       neutral_op);
4477     }
4478   else
4479     {
4480       /* Get at the scalar def before the loop, that defines the initial value
4481          of the reduction variable.  */
4482       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4483                                            loop_preheader_edge (loop));
4484       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4485          and we can't use zero for induc_val, use initial_def.  Similarly
4486          for REDUC_MIN and initial_def larger than the base.  */
4487       if (TREE_CODE (initial_def) == INTEGER_CST
4488           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4489               == INTEGER_INDUC_COND_REDUCTION)
4490           && !integer_zerop (induc_val)
4491           && ((induc_code == MAX_EXPR
4492                && tree_int_cst_lt (initial_def, induc_val))
4493               || (induc_code == MIN_EXPR
4494                   && tree_int_cst_lt (induc_val, initial_def))))
4495         induc_val = initial_def;
4496
4497       if (double_reduc)
4498         /* In case of double reduction we only create a vector variable
4499            to be put in the reduction phi node.  The actual statement
4500            creation is done later in this function.  */
4501         vec_initial_def = vect_create_destination_var (initial_def, vectype);
4502       else if (nested_in_vect_loop)
4503         {
4504           /* Do not use an adjustment def as that case is not supported
4505              correctly if ncopies is not one.  */
4506           vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4507           vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4508         }
4509       else
4510         vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4511                                                          &adjustment_def);
4512       vec_initial_defs.create (1);
4513       vec_initial_defs.quick_push (vec_initial_def);
4514     }
4515
4516   /* Set phi nodes arguments.  */
4517   FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4518     {
4519       tree vec_init_def = vec_initial_defs[i];
4520       tree def = vect_defs[i];
4521       for (j = 0; j < ncopies; j++)
4522         {
4523           if (j != 0)
4524             {
4525               phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4526               if (nested_in_vect_loop)
4527                 vec_init_def
4528                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4529                                                     vec_init_def);
4530             }
4531
4532           /* Set the loop-entry arg of the reduction-phi.  */
4533
4534           gphi *phi = as_a <gphi *> (phi_info->stmt);
4535           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4536               == INTEGER_INDUC_COND_REDUCTION)
4537             {
4538               /* Initialise the reduction phi to zero.  This prevents initial
4539                  values of non-zero interferring with the reduction op.  */
4540               gcc_assert (ncopies == 1);
4541               gcc_assert (i == 0);
4542
4543               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4544               tree induc_val_vec
4545                 = build_vector_from_val (vec_init_def_type, induc_val);
4546
4547               add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4548                            UNKNOWN_LOCATION);
4549             }
4550           else
4551             add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4552                          UNKNOWN_LOCATION);
4553
4554           /* Set the loop-latch arg for the reduction-phi.  */
4555           if (j > 0)
4556             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4557
4558           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4559
4560           if (dump_enabled_p ())
4561             {
4562               dump_printf_loc (MSG_NOTE, vect_location,
4563                                "transform reduction: created def-use cycle: ");
4564               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4565               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4566             }
4567         }
4568     }
4569
4570   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4571      which is updated with the current index of the loop for every match of
4572      the original loop's cond_expr (VEC_STMT).  This results in a vector
4573      containing the last time the condition passed for that vector lane.
4574      The first match will be a 1 to allow 0 to be used for non-matching
4575      indexes.  If there are no matches at all then the vector will be all
4576      zeroes.  */
4577   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4578     {
4579       tree indx_before_incr, indx_after_incr;
4580       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4581
4582       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4583       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4584
4585       int scalar_precision
4586         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4587       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4588       tree cr_index_vector_type = build_vector_type
4589         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4590
4591       /* First we create a simple vector induction variable which starts
4592          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4593          vector size (STEP).  */
4594
4595       /* Create a {1,2,3,...} vector.  */
4596       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4597
4598       /* Create a vector of the step value.  */
4599       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4600       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4601
4602       /* Create an induction variable.  */
4603       gimple_stmt_iterator incr_gsi;
4604       bool insert_after;
4605       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4606       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4607                  insert_after, &indx_before_incr, &indx_after_incr);
4608
4609       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4610          filled with zeros (VEC_ZERO).  */
4611
4612       /* Create a vector of 0s.  */
4613       tree zero = build_zero_cst (cr_index_scalar_type);
4614       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4615
4616       /* Create a vector phi node.  */
4617       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4618       new_phi = create_phi_node (new_phi_tree, loop->header);
4619       loop_vinfo->add_stmt (new_phi);
4620       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4621                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4622
4623       /* Now take the condition from the loops original cond_expr
4624          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4625          every match uses values from the induction variable
4626          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4627          (NEW_PHI_TREE).
4628          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4629          the new cond_expr (INDEX_COND_EXPR).  */
4630
4631       /* Duplicate the condition from vec_stmt.  */
4632       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4633
4634       /* Create a conditional, where the condition is taken from vec_stmt
4635          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4636          else is the phi (NEW_PHI_TREE).  */
4637       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4638                                      ccompare, indx_before_incr,
4639                                      new_phi_tree);
4640       induction_index = make_ssa_name (cr_index_vector_type);
4641       gimple *index_condition = gimple_build_assign (induction_index,
4642                                                      index_cond_expr);
4643       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4644       stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4645       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4646
4647       /* Update the phi with the vec cond.  */
4648       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4649                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4650     }
4651
4652   /* 2. Create epilog code.
4653         The reduction epilog code operates across the elements of the vector
4654         of partial results computed by the vectorized loop.
4655         The reduction epilog code consists of:
4656
4657         step 1: compute the scalar result in a vector (v_out2)
4658         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4659         step 3: adjust the scalar result (s_out3) if needed.
4660
4661         Step 1 can be accomplished using one the following three schemes:
4662           (scheme 1) using reduc_fn, if available.
4663           (scheme 2) using whole-vector shifts, if available.
4664           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4665                      combined.
4666
4667           The overall epilog code looks like this:
4668
4669           s_out0 = phi <s_loop>         # original EXIT_PHI
4670           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4671           v_out2 = reduce <v_out1>              # step 1
4672           s_out3 = extract_field <v_out2, 0>    # step 2
4673           s_out4 = adjust_result <s_out3>       # step 3
4674
4675           (step 3 is optional, and steps 1 and 2 may be combined).
4676           Lastly, the uses of s_out0 are replaced by s_out4.  */
4677
4678
4679   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4680          v_out1 = phi <VECT_DEF>
4681          Store them in NEW_PHIS.  */
4682
4683   exit_bb = single_exit (loop)->dest;
4684   prev_phi_info = NULL;
4685   new_phis.create (vect_defs.length ());
4686   FOR_EACH_VEC_ELT (vect_defs, i, def)
4687     {
4688       for (j = 0; j < ncopies; j++)
4689         {
4690           tree new_def = copy_ssa_name (def);
4691           phi = create_phi_node (new_def, exit_bb);
4692           stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4693           if (j == 0)
4694             new_phis.quick_push (phi);
4695           else
4696             {
4697               def = vect_get_vec_def_for_stmt_copy (dt, def);
4698               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4699             }
4700
4701           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4702           prev_phi_info = phi_info;
4703         }
4704     }
4705
4706   /* The epilogue is created for the outer-loop, i.e., for the loop being
4707      vectorized.  Create exit phis for the outer loop.  */
4708   if (double_reduc)
4709     {
4710       loop = outer_loop;
4711       exit_bb = single_exit (loop)->dest;
4712       inner_phis.create (vect_defs.length ());
4713       FOR_EACH_VEC_ELT (new_phis, i, phi)
4714         {
4715           stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4716           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4717           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4718           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4719                            PHI_RESULT (phi));
4720           prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4721           inner_phis.quick_push (phi_info);
4722           new_phis[i] = outer_phi;
4723           while (STMT_VINFO_RELATED_STMT (phi_info))
4724             {
4725               phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4726               new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4727               outer_phi = create_phi_node (new_result, exit_bb);
4728               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4729                                PHI_RESULT (phi_info->stmt));
4730               stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4731               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4732               prev_phi_info = outer_phi_info;
4733             }
4734         }
4735     }
4736
4737   exit_gsi = gsi_after_labels (exit_bb);
4738
4739   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4740          (i.e. when reduc_fn is not available) and in the final adjustment
4741          code (if needed).  Also get the original scalar reduction variable as
4742          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4743          represents a reduction pattern), the tree-code and scalar-def are
4744          taken from the original stmt that the pattern-stmt (STMT) replaces.
4745          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4746          are taken from STMT.  */
4747
4748   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4749   if (!orig_stmt_info)
4750     {
4751       /* Regular reduction  */
4752       orig_stmt_info = stmt_info;
4753     }
4754   else
4755     {
4756       /* Reduction pattern  */
4757       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4758       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4759     }
4760
4761   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4762   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4763      partial results are added and not subtracted.  */
4764   if (code == MINUS_EXPR)
4765     code = PLUS_EXPR;
4766
4767   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4768   scalar_type = TREE_TYPE (scalar_dest);
4769   scalar_results.create (group_size);
4770   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4771   bitsize = TYPE_SIZE (scalar_type);
4772
4773   /* In case this is a reduction in an inner-loop while vectorizing an outer
4774      loop - we don't need to extract a single scalar result at the end of the
4775      inner-loop (unless it is double reduction, i.e., the use of reduction is
4776      outside the outer-loop).  The final vector of partial results will be used
4777      in the vectorized outer-loop, or reduced to a scalar result at the end of
4778      the outer-loop.  */
4779   if (nested_in_vect_loop && !double_reduc)
4780     goto vect_finalize_reduction;
4781
4782   /* SLP reduction without reduction chain, e.g.,
4783      # a1 = phi <a2, a0>
4784      # b1 = phi <b2, b0>
4785      a2 = operation (a1)
4786      b2 = operation (b1)  */
4787   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4788
4789   /* True if we should implement SLP_REDUC using native reduction operations
4790      instead of scalar operations.  */
4791   direct_slp_reduc = (reduc_fn != IFN_LAST
4792                       && slp_reduc
4793                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4794
4795   /* In case of reduction chain, e.g.,
4796      # a1 = phi <a3, a0>
4797      a2 = operation (a1)
4798      a3 = operation (a2),
4799
4800      we may end up with more than one vector result.  Here we reduce them to
4801      one vector.  */
4802   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4803     {
4804       tree first_vect = PHI_RESULT (new_phis[0]);
4805       gassign *new_vec_stmt = NULL;
4806       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4807       for (k = 1; k < new_phis.length (); k++)
4808         {
4809           gimple *next_phi = new_phis[k];
4810           tree second_vect = PHI_RESULT (next_phi);
4811           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4812           new_vec_stmt = gimple_build_assign (tem, code,
4813                                               first_vect, second_vect);
4814           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4815           first_vect = tem;
4816         }
4817
4818       new_phi_result = first_vect;
4819       if (new_vec_stmt)
4820         {
4821           new_phis.truncate (0);
4822           new_phis.safe_push (new_vec_stmt);
4823         }
4824     }
4825   /* Likewise if we couldn't use a single defuse cycle.  */
4826   else if (ncopies > 1)
4827     {
4828       gcc_assert (new_phis.length () == 1);
4829       tree first_vect = PHI_RESULT (new_phis[0]);
4830       gassign *new_vec_stmt = NULL;
4831       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4832       gimple *next_phi = new_phis[0];
4833       for (int k = 1; k < ncopies; ++k)
4834         {
4835           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4836           tree second_vect = PHI_RESULT (next_phi);
4837           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4838           new_vec_stmt = gimple_build_assign (tem, code,
4839                                               first_vect, second_vect);
4840           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4841           first_vect = tem;
4842         }
4843       new_phi_result = first_vect;
4844       new_phis.truncate (0);
4845       new_phis.safe_push (new_vec_stmt);
4846     }
4847   else
4848     new_phi_result = PHI_RESULT (new_phis[0]);
4849
4850   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4851       && reduc_fn != IFN_LAST)
4852     {
4853       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4854          various data values where the condition matched and another vector
4855          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4856          need to extract the last matching index (which will be the index with
4857          highest value) and use this to index into the data vector.
4858          For the case where there were no matches, the data vector will contain
4859          all default values and the index vector will be all zeros.  */
4860
4861       /* Get various versions of the type of the vector of indexes.  */
4862       tree index_vec_type = TREE_TYPE (induction_index);
4863       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4864       tree index_scalar_type = TREE_TYPE (index_vec_type);
4865       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4866         (index_vec_type);
4867
4868       /* Get an unsigned integer version of the type of the data vector.  */
4869       int scalar_precision
4870         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4871       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4872       tree vectype_unsigned = build_vector_type
4873         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4874
4875       /* First we need to create a vector (ZERO_VEC) of zeros and another
4876          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4877          can create using a MAX reduction and then expanding.
4878          In the case where the loop never made any matches, the max index will
4879          be zero.  */
4880
4881       /* Vector of {0, 0, 0,...}.  */
4882       tree zero_vec = make_ssa_name (vectype);
4883       tree zero_vec_rhs = build_zero_cst (vectype);
4884       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4885       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4886
4887       /* Find maximum value from the vector of found indexes.  */
4888       tree max_index = make_ssa_name (index_scalar_type);
4889       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4890                                                           1, induction_index);
4891       gimple_call_set_lhs (max_index_stmt, max_index);
4892       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4893
4894       /* Vector of {max_index, max_index, max_index,...}.  */
4895       tree max_index_vec = make_ssa_name (index_vec_type);
4896       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4897                                                       max_index);
4898       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4899                                                         max_index_vec_rhs);
4900       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4901
4902       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4903          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4904          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4905          otherwise.  Only one value should match, resulting in a vector
4906          (VEC_COND) with one data value and the rest zeros.
4907          In the case where the loop never made any matches, every index will
4908          match, resulting in a vector with all data values (which will all be
4909          the default value).  */
4910
4911       /* Compare the max index vector to the vector of found indexes to find
4912          the position of the max value.  */
4913       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4914       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4915                                                       induction_index,
4916                                                       max_index_vec);
4917       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4918
4919       /* Use the compare to choose either values from the data vector or
4920          zero.  */
4921       tree vec_cond = make_ssa_name (vectype);
4922       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4923                                                    vec_compare, new_phi_result,
4924                                                    zero_vec);
4925       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4926
4927       /* Finally we need to extract the data value from the vector (VEC_COND)
4928          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4929          reduction, but because this doesn't exist, we can use a MAX reduction
4930          instead.  The data value might be signed or a float so we need to cast
4931          it first.
4932          In the case where the loop never made any matches, the data values are
4933          all identical, and so will reduce down correctly.  */
4934
4935       /* Make the matched data values unsigned.  */
4936       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4937       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4938                                        vec_cond);
4939       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4940                                                         VIEW_CONVERT_EXPR,
4941                                                         vec_cond_cast_rhs);
4942       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4943
4944       /* Reduce down to a scalar value.  */
4945       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4946       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4947                                                            1, vec_cond_cast);
4948       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4949       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4950
4951       /* Convert the reduced value back to the result type and set as the
4952          result.  */
4953       gimple_seq stmts = NULL;
4954       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4955                                data_reduc);
4956       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4957       scalar_results.safe_push (new_temp);
4958     }
4959   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4960            && reduc_fn == IFN_LAST)
4961     {
4962       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4963          idx = 0;
4964          idx_val = induction_index[0];
4965          val = data_reduc[0];
4966          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4967            if (induction_index[i] > idx_val)
4968              val = data_reduc[i], idx_val = induction_index[i];
4969          return val;  */
4970
4971       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4972       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4973       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4974       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4975       /* Enforced by vectorizable_reduction, which ensures we have target
4976          support before allowing a conditional reduction on variable-length
4977          vectors.  */
4978       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4979       tree idx_val = NULL_TREE, val = NULL_TREE;
4980       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4981         {
4982           tree old_idx_val = idx_val;
4983           tree old_val = val;
4984           idx_val = make_ssa_name (idx_eltype);
4985           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4986                                              build3 (BIT_FIELD_REF, idx_eltype,
4987                                                      induction_index,
4988                                                      bitsize_int (el_size),
4989                                                      bitsize_int (off)));
4990           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991           val = make_ssa_name (data_eltype);
4992           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4993                                              build3 (BIT_FIELD_REF,
4994                                                      data_eltype,
4995                                                      new_phi_result,
4996                                                      bitsize_int (el_size),
4997                                                      bitsize_int (off)));
4998           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4999           if (off != 0)
5000             {
5001               tree new_idx_val = idx_val;
5002               tree new_val = val;
5003               if (off != v_size - el_size)
5004                 {
5005                   new_idx_val = make_ssa_name (idx_eltype);
5006                   epilog_stmt = gimple_build_assign (new_idx_val,
5007                                                      MAX_EXPR, idx_val,
5008                                                      old_idx_val);
5009                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5010                 }
5011               new_val = make_ssa_name (data_eltype);
5012               epilog_stmt = gimple_build_assign (new_val,
5013                                                  COND_EXPR,
5014                                                  build2 (GT_EXPR,
5015                                                          boolean_type_node,
5016                                                          idx_val,
5017                                                          old_idx_val),
5018                                                  val, old_val);
5019               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5020               idx_val = new_idx_val;
5021               val = new_val;
5022             }
5023         }
5024       /* Convert the reduced value back to the result type and set as the
5025          result.  */
5026       gimple_seq stmts = NULL;
5027       val = gimple_convert (&stmts, scalar_type, val);
5028       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5029       scalar_results.safe_push (val);
5030     }
5031
5032   /* 2.3 Create the reduction code, using one of the three schemes described
5033          above. In SLP we simply need to extract all the elements from the
5034          vector (without reducing them), so we use scalar shifts.  */
5035   else if (reduc_fn != IFN_LAST && !slp_reduc)
5036     {
5037       tree tmp;
5038       tree vec_elem_type;
5039
5040       /* Case 1:  Create:
5041          v_out2 = reduc_expr <v_out1>  */
5042
5043       if (dump_enabled_p ())
5044         dump_printf_loc (MSG_NOTE, vect_location,
5045                          "Reduce using direct vector reduction.\n");
5046
5047       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5048       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5049         {
5050           tree tmp_dest
5051             = vect_create_destination_var (scalar_dest, vec_elem_type);
5052           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5053                                                     new_phi_result);
5054           gimple_set_lhs (epilog_stmt, tmp_dest);
5055           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5056           gimple_set_lhs (epilog_stmt, new_temp);
5057           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5058
5059           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5060                                              new_temp);
5061         }
5062       else
5063         {
5064           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5065                                                     new_phi_result);
5066           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5067         }
5068
5069       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5070       gimple_set_lhs (epilog_stmt, new_temp);
5071       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5072
5073       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5074            == INTEGER_INDUC_COND_REDUCTION)
5075           && !operand_equal_p (initial_def, induc_val, 0))
5076         {
5077           /* Earlier we set the initial value to be a vector if induc_val
5078              values.  Check the result and if it is induc_val then replace
5079              with the original initial value, unless induc_val is
5080              the same as initial_def already.  */
5081           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5082                                   induc_val);
5083
5084           tmp = make_ssa_name (new_scalar_dest);
5085           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5086                                              initial_def, new_temp);
5087           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5088           new_temp = tmp;
5089         }
5090
5091       scalar_results.safe_push (new_temp);
5092     }
5093   else if (direct_slp_reduc)
5094     {
5095       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5096          with the elements for other SLP statements replaced with the
5097          neutral value.  We can then do a normal reduction on each vector.  */
5098
5099       /* Enforced by vectorizable_reduction.  */
5100       gcc_assert (new_phis.length () == 1);
5101       gcc_assert (pow2p_hwi (group_size));
5102
5103       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5104       vec<stmt_vec_info> orig_phis
5105         = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5106       gimple_seq seq = NULL;
5107
5108       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5109          and the same element size as VECTYPE.  */
5110       tree index = build_index_vector (vectype, 0, 1);
5111       tree index_type = TREE_TYPE (index);
5112       tree index_elt_type = TREE_TYPE (index_type);
5113       tree mask_type = build_same_sized_truth_vector_type (index_type);
5114
5115       /* Create a vector that, for each element, identifies which of
5116          the REDUC_GROUP_SIZE results should use it.  */
5117       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5118       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5119                             build_vector_from_val (index_type, index_mask));
5120
5121       /* Get a neutral vector value.  This is simply a splat of the neutral
5122          scalar value if we have one, otherwise the initial scalar value
5123          is itself a neutral value.  */
5124       tree vector_identity = NULL_TREE;
5125       if (neutral_op)
5126         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5127                                                         neutral_op);
5128       for (unsigned int i = 0; i < group_size; ++i)
5129         {
5130           /* If there's no univeral neutral value, we can use the
5131              initial scalar value from the original PHI.  This is used
5132              for MIN and MAX reduction, for example.  */
5133           if (!neutral_op)
5134             {
5135               tree scalar_value
5136                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5137                                          loop_preheader_edge (loop));
5138               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5139                                                               scalar_value);
5140             }
5141
5142           /* Calculate the equivalent of:
5143
5144              sel[j] = (index[j] == i);
5145
5146              which selects the elements of NEW_PHI_RESULT that should
5147              be included in the result.  */
5148           tree compare_val = build_int_cst (index_elt_type, i);
5149           compare_val = build_vector_from_val (index_type, compare_val);
5150           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5151                                    index, compare_val);
5152
5153           /* Calculate the equivalent of:
5154
5155              vec = seq ? new_phi_result : vector_identity;
5156
5157              VEC is now suitable for a full vector reduction.  */
5158           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5159                                    sel, new_phi_result, vector_identity);
5160
5161           /* Do the reduction and convert it to the appropriate type.  */
5162           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5163                                       TREE_TYPE (vectype), vec);
5164           scalar = gimple_convert (&seq, scalar_type, scalar);
5165           scalar_results.safe_push (scalar);
5166         }
5167       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5168     }
5169   else
5170     {
5171       bool reduce_with_shift;
5172       tree vec_temp;
5173
5174       /* COND reductions all do the final reduction with MAX_EXPR
5175          or MIN_EXPR.  */
5176       if (code == COND_EXPR)
5177         {
5178           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5179               == INTEGER_INDUC_COND_REDUCTION)
5180             code = induc_code;
5181           else
5182             code = MAX_EXPR;
5183         }
5184
5185       /* See if the target wants to do the final (shift) reduction
5186          in a vector mode of smaller size and first reduce upper/lower
5187          halves against each other.  */
5188       enum machine_mode mode1 = mode;
5189       tree vectype1 = vectype;
5190       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5191       unsigned sz1 = sz;
5192       if (!slp_reduc
5193           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5194         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5195
5196       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5197       reduce_with_shift = have_whole_vector_shift (mode1);
5198       if (!VECTOR_MODE_P (mode1))
5199         reduce_with_shift = false;
5200       else
5201         {
5202           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5203           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5204             reduce_with_shift = false;
5205         }
5206
5207       /* First reduce the vector to the desired vector size we should
5208          do shift reduction on by combining upper and lower halves.  */
5209       new_temp = new_phi_result;
5210       while (sz > sz1)
5211         {
5212           gcc_assert (!slp_reduc);
5213           sz /= 2;
5214           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5215
5216           /* The target has to make sure we support lowpart/highpart
5217              extraction, either via direct vector extract or through
5218              an integer mode punning.  */
5219           tree dst1, dst2;
5220           if (convert_optab_handler (vec_extract_optab,
5221                                      TYPE_MODE (TREE_TYPE (new_temp)),
5222                                      TYPE_MODE (vectype1))
5223               != CODE_FOR_nothing)
5224             {
5225               /* Extract sub-vectors directly once vec_extract becomes
5226                  a conversion optab.  */
5227               dst1 = make_ssa_name (vectype1);
5228               epilog_stmt
5229                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5230                                          build3 (BIT_FIELD_REF, vectype1,
5231                                                  new_temp, TYPE_SIZE (vectype1),
5232                                                  bitsize_int (0)));
5233               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234               dst2 =  make_ssa_name (vectype1);
5235               epilog_stmt
5236                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5237                                          build3 (BIT_FIELD_REF, vectype1,
5238                                                  new_temp, TYPE_SIZE (vectype1),
5239                                                  bitsize_int (sz * BITS_PER_UNIT)));
5240               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5241             }
5242           else
5243             {
5244               /* Extract via punning to appropriately sized integer mode
5245                  vector.  */
5246               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5247                                                             1);
5248               tree etype = build_vector_type (eltype, 2);
5249               gcc_assert (convert_optab_handler (vec_extract_optab,
5250                                                  TYPE_MODE (etype),
5251                                                  TYPE_MODE (eltype))
5252                           != CODE_FOR_nothing);
5253               tree tem = make_ssa_name (etype);
5254               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5255                                                  build1 (VIEW_CONVERT_EXPR,
5256                                                          etype, new_temp));
5257               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5258               new_temp = tem;
5259               tem = make_ssa_name (eltype);
5260               epilog_stmt
5261                   = gimple_build_assign (tem, BIT_FIELD_REF,
5262                                          build3 (BIT_FIELD_REF, eltype,
5263                                                  new_temp, TYPE_SIZE (eltype),
5264                                                  bitsize_int (0)));
5265               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266               dst1 = make_ssa_name (vectype1);
5267               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5268                                                  build1 (VIEW_CONVERT_EXPR,
5269                                                          vectype1, tem));
5270               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271               tem = make_ssa_name (eltype);
5272               epilog_stmt
5273                   = gimple_build_assign (tem, BIT_FIELD_REF,
5274                                          build3 (BIT_FIELD_REF, eltype,
5275                                                  new_temp, TYPE_SIZE (eltype),
5276                                                  bitsize_int (sz * BITS_PER_UNIT)));
5277               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278               dst2 =  make_ssa_name (vectype1);
5279               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5280                                                  build1 (VIEW_CONVERT_EXPR,
5281                                                          vectype1, tem));
5282               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283             }
5284
5285           new_temp = make_ssa_name (vectype1);
5286           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5287           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5288         }
5289
5290       if (reduce_with_shift && !slp_reduc)
5291         {
5292           int element_bitsize = tree_to_uhwi (bitsize);
5293           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5294              for variable-length vectors and also requires direct target support
5295              for loop reductions.  */
5296           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5297           int nelements = vec_size_in_bits / element_bitsize;
5298           vec_perm_builder sel;
5299           vec_perm_indices indices;
5300
5301           int elt_offset;
5302
5303           tree zero_vec = build_zero_cst (vectype1);
5304           /* Case 2: Create:
5305              for (offset = nelements/2; offset >= 1; offset/=2)
5306                 {
5307                   Create:  va' = vec_shift <va, offset>
5308                   Create:  va = vop <va, va'>
5309                 }  */
5310
5311           tree rhs;
5312
5313           if (dump_enabled_p ())
5314             dump_printf_loc (MSG_NOTE, vect_location,
5315                              "Reduce using vector shifts\n");
5316
5317           mode1 = TYPE_MODE (vectype1);
5318           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5319           for (elt_offset = nelements / 2;
5320                elt_offset >= 1;
5321                elt_offset /= 2)
5322             {
5323               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5324               indices.new_vector (sel, 2, nelements);
5325               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5326               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5327                                                  new_temp, zero_vec, mask);
5328               new_name = make_ssa_name (vec_dest, epilog_stmt);
5329               gimple_assign_set_lhs (epilog_stmt, new_name);
5330               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5331
5332               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5333                                                  new_temp);
5334               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5335               gimple_assign_set_lhs (epilog_stmt, new_temp);
5336               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5337             }
5338
5339           /* 2.4  Extract the final scalar result.  Create:
5340              s_out3 = extract_field <v_out2, bitpos>  */
5341
5342           if (dump_enabled_p ())
5343             dump_printf_loc (MSG_NOTE, vect_location,
5344                              "extract scalar result\n");
5345
5346           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5347                         bitsize, bitsize_zero_node);
5348           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5349           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5350           gimple_assign_set_lhs (epilog_stmt, new_temp);
5351           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352           scalar_results.safe_push (new_temp);
5353         }
5354       else
5355         {
5356           /* Case 3: Create:
5357              s = extract_field <v_out2, 0>
5358              for (offset = element_size;
5359                   offset < vector_size;
5360                   offset += element_size;)
5361                {
5362                  Create:  s' = extract_field <v_out2, offset>
5363                  Create:  s = op <s, s'>  // For non SLP cases
5364                }  */
5365
5366           if (dump_enabled_p ())
5367             dump_printf_loc (MSG_NOTE, vect_location,
5368                              "Reduce using scalar code.\n");
5369
5370           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5371           int element_bitsize = tree_to_uhwi (bitsize);
5372           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5373             {
5374               int bit_offset;
5375               if (gimple_code (new_phi) == GIMPLE_PHI)
5376                 vec_temp = PHI_RESULT (new_phi);
5377               else
5378                 vec_temp = gimple_assign_lhs (new_phi);
5379               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5380                                  bitsize_zero_node);
5381               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5382               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5383               gimple_assign_set_lhs (epilog_stmt, new_temp);
5384               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385
5386               /* In SLP we don't need to apply reduction operation, so we just
5387                  collect s' values in SCALAR_RESULTS.  */
5388               if (slp_reduc)
5389                 scalar_results.safe_push (new_temp);
5390
5391               for (bit_offset = element_bitsize;
5392                    bit_offset < vec_size_in_bits;
5393                    bit_offset += element_bitsize)
5394                 {
5395                   tree bitpos = bitsize_int (bit_offset);
5396                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5397                                      bitsize, bitpos);
5398
5399                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5400                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5401                   gimple_assign_set_lhs (epilog_stmt, new_name);
5402                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5403
5404                   if (slp_reduc)
5405                     {
5406                       /* In SLP we don't need to apply reduction operation, so
5407                          we just collect s' values in SCALAR_RESULTS.  */
5408                       new_temp = new_name;
5409                       scalar_results.safe_push (new_name);
5410                     }
5411                   else
5412                     {
5413                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5414                                                          new_name, new_temp);
5415                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5416                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5417                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5418                     }
5419                 }
5420             }
5421
5422           /* The only case where we need to reduce scalar results in SLP, is
5423              unrolling.  If the size of SCALAR_RESULTS is greater than
5424              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5425              REDUC_GROUP_SIZE.  */
5426           if (slp_reduc)
5427             {
5428               tree res, first_res, new_res;
5429               gimple *new_stmt;
5430
5431               /* Reduce multiple scalar results in case of SLP unrolling.  */
5432               for (j = group_size; scalar_results.iterate (j, &res);
5433                    j++)
5434                 {
5435                   first_res = scalar_results[j % group_size];
5436                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5437                                                   first_res, res);
5438                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5439                   gimple_assign_set_lhs (new_stmt, new_res);
5440                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5441                   scalar_results[j % group_size] = new_res;
5442                 }
5443             }
5444           else
5445             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5446             scalar_results.safe_push (new_temp);
5447         }
5448
5449       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5450            == INTEGER_INDUC_COND_REDUCTION)
5451           && !operand_equal_p (initial_def, induc_val, 0))
5452         {
5453           /* Earlier we set the initial value to be a vector if induc_val
5454              values.  Check the result and if it is induc_val then replace
5455              with the original initial value, unless induc_val is
5456              the same as initial_def already.  */
5457           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5458                                   induc_val);
5459
5460           tree tmp = make_ssa_name (new_scalar_dest);
5461           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5462                                              initial_def, new_temp);
5463           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464           scalar_results[0] = tmp;
5465         }
5466     }
5467
5468 vect_finalize_reduction:
5469
5470   if (double_reduc)
5471     loop = loop->inner;
5472
5473   /* 2.5 Adjust the final result by the initial value of the reduction
5474          variable. (When such adjustment is not needed, then
5475          'adjustment_def' is zero).  For example, if code is PLUS we create:
5476          new_temp = loop_exit_def + adjustment_def  */
5477
5478   if (adjustment_def)
5479     {
5480       gcc_assert (!slp_reduc);
5481       if (nested_in_vect_loop)
5482         {
5483           new_phi = new_phis[0];
5484           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5485           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5486           new_dest = vect_create_destination_var (scalar_dest, vectype);
5487         }
5488       else
5489         {
5490           new_temp = scalar_results[0];
5491           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5492           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5493           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5494         }
5495
5496       epilog_stmt = gimple_build_assign (new_dest, expr);
5497       new_temp = make_ssa_name (new_dest, epilog_stmt);
5498       gimple_assign_set_lhs (epilog_stmt, new_temp);
5499       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5500       if (nested_in_vect_loop)
5501         {
5502           stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5503           STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5504             = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5505
5506           if (!double_reduc)
5507             scalar_results.quick_push (new_temp);
5508           else
5509             scalar_results[0] = new_temp;
5510         }
5511       else
5512         scalar_results[0] = new_temp;
5513
5514       new_phis[0] = epilog_stmt;
5515     }
5516
5517   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5518           phis with new adjusted scalar results, i.e., replace use <s_out0>
5519           with use <s_out4>.
5520
5521      Transform:
5522         loop_exit:
5523           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5524           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5525           v_out2 = reduce <v_out1>
5526           s_out3 = extract_field <v_out2, 0>
5527           s_out4 = adjust_result <s_out3>
5528           use <s_out0>
5529           use <s_out0>
5530
5531      into:
5532
5533         loop_exit:
5534           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5535           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5536           v_out2 = reduce <v_out1>
5537           s_out3 = extract_field <v_out2, 0>
5538           s_out4 = adjust_result <s_out3>
5539           use <s_out4>
5540           use <s_out4> */
5541
5542
5543   /* In SLP reduction chain we reduce vector results into one vector if
5544      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5545      LHS of the last stmt in the reduction chain, since we are looking for
5546      the loop exit phi node.  */
5547   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5548     {
5549       stmt_vec_info dest_stmt_info
5550         = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5551       /* Handle reduction patterns.  */
5552       if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5553         dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5554
5555       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5556       group_size = 1;
5557     }
5558
5559   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5560      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5561      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5562      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5563      correspond to the first vector stmt, etc.
5564      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5565   if (group_size > new_phis.length ())
5566     {
5567       ratio = group_size / new_phis.length ();
5568       gcc_assert (!(group_size % new_phis.length ()));
5569     }
5570   else
5571     ratio = 1;
5572
5573   for (k = 0; k < group_size; k++)
5574     {
5575       if (k % ratio == 0)
5576         {
5577           epilog_stmt = new_phis[k / ratio];
5578           reduction_phi_info = reduction_phis[k / ratio];
5579           if (double_reduc)
5580             inner_phi = inner_phis[k / ratio];
5581         }
5582
5583       if (slp_reduc)
5584         {
5585           stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5586
5587           orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5588           /* SLP statements can't participate in patterns.  */
5589           gcc_assert (!orig_stmt_info);
5590           scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5591         }
5592
5593       phis.create (3);
5594       /* Find the loop-closed-use at the loop exit of the original scalar
5595          result.  (The reduction result is expected to have two immediate uses -
5596          one at the latch block, and one at the loop exit).  */
5597       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5598         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5599             && !is_gimple_debug (USE_STMT (use_p)))
5600           phis.safe_push (USE_STMT (use_p));
5601
5602       /* While we expect to have found an exit_phi because of loop-closed-ssa
5603          form we can end up without one if the scalar cycle is dead.  */
5604
5605       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5606         {
5607           if (outer_loop)
5608             {
5609               stmt_vec_info exit_phi_vinfo
5610                 = loop_vinfo->lookup_stmt (exit_phi);
5611               gphi *vect_phi;
5612
5613               /* FORNOW. Currently not supporting the case that an inner-loop
5614                  reduction is not used in the outer-loop (but only outside the
5615                  outer-loop), unless it is double reduction.  */
5616               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5617                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5618                           || double_reduc);
5619
5620               if (double_reduc)
5621                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5622               else
5623                 STMT_VINFO_VEC_STMT (exit_phi_vinfo)
5624                   = vinfo_for_stmt (epilog_stmt);
5625               if (!double_reduc
5626                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5627                       != vect_double_reduction_def)
5628                 continue;
5629
5630               /* Handle double reduction:
5631
5632                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5633                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5634                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5635                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5636
5637                  At that point the regular reduction (stmt2 and stmt3) is
5638                  already vectorized, as well as the exit phi node, stmt4.
5639                  Here we vectorize the phi node of double reduction, stmt1, and
5640                  update all relevant statements.  */
5641
5642               /* Go through all the uses of s2 to find double reduction phi
5643                  node, i.e., stmt1 above.  */
5644               orig_name = PHI_RESULT (exit_phi);
5645               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5646                 {
5647                   stmt_vec_info use_stmt_vinfo;
5648                   tree vect_phi_init, preheader_arg, vect_phi_res;
5649                   basic_block bb = gimple_bb (use_stmt);
5650
5651                   /* Check that USE_STMT is really double reduction phi
5652                      node.  */
5653                   if (gimple_code (use_stmt) != GIMPLE_PHI
5654                       || gimple_phi_num_args (use_stmt) != 2
5655                       || bb->loop_father != outer_loop)
5656                     continue;
5657                   use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5658                   if (!use_stmt_vinfo
5659                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5660                           != vect_double_reduction_def)
5661                     continue;
5662
5663                   /* Create vector phi node for double reduction:
5664                      vs1 = phi <vs0, vs2>
5665                      vs1 was created previously in this function by a call to
5666                        vect_get_vec_def_for_operand and is stored in
5667                        vec_initial_def;
5668                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5669                      vs0 is created here.  */
5670
5671                   /* Create vector phi node.  */
5672                   vect_phi = create_phi_node (vec_initial_def, bb);
5673                   loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5674
5675                   /* Create vs0 - initial def of the double reduction phi.  */
5676                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5677                                              loop_preheader_edge (outer_loop));
5678                   vect_phi_init = get_initial_def_for_reduction
5679                     (stmt, preheader_arg, NULL);
5680
5681                   /* Update phi node arguments with vs0 and vs2.  */
5682                   add_phi_arg (vect_phi, vect_phi_init,
5683                                loop_preheader_edge (outer_loop),
5684                                UNKNOWN_LOCATION);
5685                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5686                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5687                   if (dump_enabled_p ())
5688                     {
5689                       dump_printf_loc (MSG_NOTE, vect_location,
5690                                        "created double reduction phi node: ");
5691                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5692                     }
5693
5694                   vect_phi_res = PHI_RESULT (vect_phi);
5695
5696                   /* Replace the use, i.e., set the correct vs1 in the regular
5697                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5698                      loop is redundant.  */
5699                   stmt_vec_info use_info = reduction_phi_info;
5700                   for (j = 0; j < ncopies; j++)
5701                     {
5702                       edge pr_edge = loop_preheader_edge (loop);
5703                       SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5704                                        pr_edge->dest_idx, vect_phi_res);
5705                       use_info = STMT_VINFO_RELATED_STMT (use_info);
5706                     }
5707                 }
5708             }
5709         }
5710
5711       phis.release ();
5712       if (nested_in_vect_loop)
5713         {
5714           if (double_reduc)
5715             loop = outer_loop;
5716           else
5717             continue;
5718         }
5719
5720       phis.create (3);
5721       /* Find the loop-closed-use at the loop exit of the original scalar
5722          result.  (The reduction result is expected to have two immediate uses,
5723          one at the latch block, and one at the loop exit).  For double
5724          reductions we are looking for exit phis of the outer loop.  */
5725       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5726         {
5727           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5728             {
5729               if (!is_gimple_debug (USE_STMT (use_p)))
5730                 phis.safe_push (USE_STMT (use_p));
5731             }
5732           else
5733             {
5734               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5735                 {
5736                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5737
5738                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5739                     {
5740                       if (!flow_bb_inside_loop_p (loop,
5741                                              gimple_bb (USE_STMT (phi_use_p)))
5742                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5743                         phis.safe_push (USE_STMT (phi_use_p));
5744                     }
5745                 }
5746             }
5747         }
5748
5749       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5750         {
5751           /* Replace the uses:  */
5752           orig_name = PHI_RESULT (exit_phi);
5753           scalar_result = scalar_results[k];
5754           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5755             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5756               SET_USE (use_p, scalar_result);
5757         }
5758
5759       phis.release ();
5760     }
5761 }
5762
5763 /* Return a vector of type VECTYPE that is equal to the vector select
5764    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5765    before GSI.  */
5766
5767 static tree
5768 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5769                      tree vec, tree identity)
5770 {
5771   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5772   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5773                                           mask, vec, identity);
5774   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5775   return cond;
5776 }
5777
5778 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5779    order, starting with LHS.  Insert the extraction statements before GSI and
5780    associate the new scalar SSA names with variable SCALAR_DEST.
5781    Return the SSA name for the result.  */
5782
5783 static tree
5784 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5785                        tree_code code, tree lhs, tree vector_rhs)
5786 {
5787   tree vectype = TREE_TYPE (vector_rhs);
5788   tree scalar_type = TREE_TYPE (vectype);
5789   tree bitsize = TYPE_SIZE (scalar_type);
5790   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5791   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5792
5793   for (unsigned HOST_WIDE_INT bit_offset = 0;
5794        bit_offset < vec_size_in_bits;
5795        bit_offset += element_bitsize)
5796     {
5797       tree bitpos = bitsize_int (bit_offset);
5798       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5799                          bitsize, bitpos);
5800
5801       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5802       rhs = make_ssa_name (scalar_dest, stmt);
5803       gimple_assign_set_lhs (stmt, rhs);
5804       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5805
5806       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5807       tree new_name = make_ssa_name (scalar_dest, stmt);
5808       gimple_assign_set_lhs (stmt, new_name);
5809       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5810       lhs = new_name;
5811     }
5812   return lhs;
5813 }
5814
5815 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5816    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5817    statement.  CODE is the operation performed by STMT and OPS are
5818    its scalar operands.  REDUC_INDEX is the index of the operand in
5819    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5820    implements in-order reduction, or IFN_LAST if we should open-code it.
5821    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5822    that should be used to control the operation in a fully-masked loop.  */
5823
5824 static bool
5825 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5826                                stmt_vec_info *vec_stmt, slp_tree slp_node,
5827                                gimple *reduc_def_stmt,
5828                                tree_code code, internal_fn reduc_fn,
5829                                tree ops[3], tree vectype_in,
5830                                int reduc_index, vec_loop_masks *masks)
5831 {
5832   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5833   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5834   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5835   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5836   stmt_vec_info new_stmt_info = NULL;
5837
5838   int ncopies;
5839   if (slp_node)
5840     ncopies = 1;
5841   else
5842     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5843
5844   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5845   gcc_assert (ncopies == 1);
5846   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5847   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5848   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5849               == FOLD_LEFT_REDUCTION);
5850
5851   if (slp_node)
5852     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5853                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5854
5855   tree op0 = ops[1 - reduc_index];
5856
5857   int group_size = 1;
5858   stmt_vec_info scalar_dest_def_info;
5859   auto_vec<tree> vec_oprnds0;
5860   if (slp_node)
5861     {
5862       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5863       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5864       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5865     }
5866   else
5867     {
5868       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5869       vec_oprnds0.create (1);
5870       vec_oprnds0.quick_push (loop_vec_def0);
5871       scalar_dest_def_info = stmt_info;
5872     }
5873
5874   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5875   tree scalar_type = TREE_TYPE (scalar_dest);
5876   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5877
5878   int vec_num = vec_oprnds0.length ();
5879   gcc_assert (vec_num == 1 || slp_node);
5880   tree vec_elem_type = TREE_TYPE (vectype_out);
5881   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5882
5883   tree vector_identity = NULL_TREE;
5884   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5885     vector_identity = build_zero_cst (vectype_out);
5886
5887   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5888   int i;
5889   tree def0;
5890   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5891     {
5892       gimple *new_stmt;
5893       tree mask = NULL_TREE;
5894       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5895         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5896
5897       /* Handle MINUS by adding the negative.  */
5898       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5899         {
5900           tree negated = make_ssa_name (vectype_out);
5901           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5902           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5903           def0 = negated;
5904         }
5905
5906       if (mask)
5907         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5908                                     vector_identity);
5909
5910       /* On the first iteration the input is simply the scalar phi
5911          result, and for subsequent iterations it is the output of
5912          the preceding operation.  */
5913       if (reduc_fn != IFN_LAST)
5914         {
5915           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5916           /* For chained SLP reductions the output of the previous reduction
5917              operation serves as the input of the next. For the final statement
5918              the output cannot be a temporary - we reuse the original
5919              scalar destination of the last statement.  */
5920           if (i != vec_num - 1)
5921             {
5922               gimple_set_lhs (new_stmt, scalar_dest_var);
5923               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5924               gimple_set_lhs (new_stmt, reduc_var);
5925             }
5926         }
5927       else
5928         {
5929           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5930                                              reduc_var, def0);
5931           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5932           /* Remove the statement, so that we can use the same code paths
5933              as for statements that we've just created.  */
5934           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5935           gsi_remove (&tmp_gsi, false);
5936         }
5937
5938       if (i == vec_num - 1)
5939         {
5940           gimple_set_lhs (new_stmt, scalar_dest);
5941           new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5942                                                     new_stmt);
5943         }
5944       else
5945         new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5946                                                      new_stmt, gsi);
5947
5948       if (slp_node)
5949         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5950     }
5951
5952   if (!slp_node)
5953     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5954
5955   return true;
5956 }
5957
5958 /* Function is_nonwrapping_integer_induction.
5959
5960    Check if STMT (which is part of loop LOOP) both increments and
5961    does not cause overflow.  */
5962
5963 static bool
5964 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5965 {
5966   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5967   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5968   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5969   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5970   widest_int ni, max_loop_value, lhs_max;
5971   wi::overflow_type overflow = wi::OVF_NONE;
5972
5973   /* Make sure the loop is integer based.  */
5974   if (TREE_CODE (base) != INTEGER_CST
5975       || TREE_CODE (step) != INTEGER_CST)
5976     return false;
5977
5978   /* Check that the max size of the loop will not wrap.  */
5979
5980   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5981     return true;
5982
5983   if (! max_stmt_executions (loop, &ni))
5984     return false;
5985
5986   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5987                             &overflow);
5988   if (overflow)
5989     return false;
5990
5991   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5992                             TYPE_SIGN (lhs_type), &overflow);
5993   if (overflow)
5994     return false;
5995
5996   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5997           <= TYPE_PRECISION (lhs_type));
5998 }
5999
6000 /* Function vectorizable_reduction.
6001
6002    Check if STMT performs a reduction operation that can be vectorized.
6003    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6004    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6005    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6006
6007    This function also handles reduction idioms (patterns) that have been
6008    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6009    of this form:
6010      X = pattern_expr (arg0, arg1, ..., X)
6011    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6012    sequence that had been detected and replaced by the pattern-stmt (STMT).
6013
6014    This function also handles reduction of condition expressions, for example:
6015      for (int i = 0; i < N; i++)
6016        if (a[i] < value)
6017          last = a[i];
6018    This is handled by vectorising the loop and creating an additional vector
6019    containing the loop indexes for which "a[i] < value" was true.  In the
6020    function epilogue this is reduced to a single max value and then used to
6021    index into the vector of results.
6022
6023    In some cases of reduction patterns, the type of the reduction variable X is
6024    different than the type of the other arguments of STMT.
6025    In such cases, the vectype that is used when transforming STMT into a vector
6026    stmt is different than the vectype that is used to determine the
6027    vectorization factor, because it consists of a different number of elements
6028    than the actual number of elements that are being operated upon in parallel.
6029
6030    For example, consider an accumulation of shorts into an int accumulator.
6031    On some targets it's possible to vectorize this pattern operating on 8
6032    shorts at a time (hence, the vectype for purposes of determining the
6033    vectorization factor should be V8HI); on the other hand, the vectype that
6034    is used to create the vector form is actually V4SI (the type of the result).
6035
6036    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6037    indicates what is the actual level of parallelism (V8HI in the example), so
6038    that the right vectorization factor would be derived.  This vectype
6039    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6040    be used to create the vectorized stmt.  The right vectype for the vectorized
6041    stmt is obtained from the type of the result X:
6042         get_vectype_for_scalar_type (TREE_TYPE (X))
6043
6044    This means that, contrary to "regular" reductions (or "regular" stmts in
6045    general), the following equation:
6046       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6047    does *NOT* necessarily hold for reduction patterns.  */
6048
6049 bool
6050 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6051                         stmt_vec_info *vec_stmt, slp_tree slp_node,
6052                         slp_instance slp_node_instance,
6053                         stmt_vector_for_cost *cost_vec)
6054 {
6055   tree vec_dest;
6056   tree scalar_dest;
6057   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6058   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6059   tree vectype_in = NULL_TREE;
6060   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6061   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6062   enum tree_code code, orig_code;
6063   internal_fn reduc_fn;
6064   machine_mode vec_mode;
6065   int op_type;
6066   optab optab;
6067   tree new_temp = NULL_TREE;
6068   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6069   gimple *cond_reduc_def_stmt = NULL;
6070   enum tree_code cond_reduc_op_code = ERROR_MARK;
6071   tree scalar_type;
6072   bool is_simple_use;
6073   int i;
6074   int ncopies;
6075   int epilog_copies;
6076   stmt_vec_info prev_stmt_info, prev_phi_info;
6077   bool single_defuse_cycle = false;
6078   stmt_vec_info new_stmt_info = NULL;
6079   int j;
6080   tree ops[3];
6081   enum vect_def_type dts[3];
6082   bool nested_cycle = false, found_nested_cycle_def = false;
6083   bool double_reduc = false;
6084   basic_block def_bb;
6085   struct loop * def_stmt_loop;
6086   tree def_arg;
6087   auto_vec<tree> vec_oprnds0;
6088   auto_vec<tree> vec_oprnds1;
6089   auto_vec<tree> vec_oprnds2;
6090   auto_vec<tree> vect_defs;
6091   auto_vec<stmt_vec_info> phis;
6092   int vec_num;
6093   tree def0, tem;
6094   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6095   tree cond_reduc_val = NULL_TREE;
6096
6097   /* Make sure it was already recognized as a reduction computation.  */
6098   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6099       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6100     return false;
6101
6102   if (nested_in_vect_loop_p (loop, stmt))
6103     {
6104       loop = loop->inner;
6105       nested_cycle = true;
6106     }
6107
6108   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6109     gcc_assert (slp_node
6110                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6111
6112   if (gphi *phi = dyn_cast <gphi *> (stmt))
6113     {
6114       tree phi_result = gimple_phi_result (phi);
6115       /* Analysis is fully done on the reduction stmt invocation.  */
6116       if (! vec_stmt)
6117         {
6118           if (slp_node)
6119             slp_node_instance->reduc_phis = slp_node;
6120
6121           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6122           return true;
6123         }
6124
6125       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6126         /* Leave the scalar phi in place.  Note that checking
6127            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6128            for reductions involving a single statement.  */
6129         return true;
6130
6131       stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6132       if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6133         reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6134
6135       if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6136           == EXTRACT_LAST_REDUCTION)
6137         /* Leave the scalar phi in place.  */
6138         return true;
6139
6140       gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6141       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6142         {
6143           tree op = gimple_op (reduc_stmt, k);
6144           if (op == phi_result)
6145             continue;
6146           if (k == 1
6147               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6148             continue;
6149           if (!vectype_in
6150               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6151                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6152             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6153           break;
6154         }
6155       gcc_assert (vectype_in);
6156
6157       if (slp_node)
6158         ncopies = 1;
6159       else
6160         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6161
6162       stmt_vec_info use_stmt_info;
6163       if (ncopies > 1
6164           && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6165           && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6166           && (use_stmt_info == reduc_stmt_info
6167               || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6168         single_defuse_cycle = true;
6169
6170       /* Create the destination vector  */
6171       scalar_dest = gimple_assign_lhs (reduc_stmt);
6172       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6173
6174       if (slp_node)
6175         /* The size vect_schedule_slp_instance computes is off for us.  */
6176         vec_num = vect_get_num_vectors
6177           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6178            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6179            vectype_in);
6180       else
6181         vec_num = 1;
6182
6183       /* Generate the reduction PHIs upfront.  */
6184       prev_phi_info = NULL;
6185       for (j = 0; j < ncopies; j++)
6186         {
6187           if (j == 0 || !single_defuse_cycle)
6188             {
6189               for (i = 0; i < vec_num; i++)
6190                 {
6191                   /* Create the reduction-phi that defines the reduction
6192                      operand.  */
6193                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6194                   stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6195
6196                   if (slp_node)
6197                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6198                   else
6199                     {
6200                       if (j == 0)
6201                         STMT_VINFO_VEC_STMT (stmt_info)
6202                           = *vec_stmt = new_phi_info;
6203                       else
6204                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6205                       prev_phi_info = new_phi_info;
6206                     }
6207                 }
6208             }
6209         }
6210
6211       return true;
6212     }
6213
6214   /* 1. Is vectorizable reduction?  */
6215   /* Not supportable if the reduction variable is used in the loop, unless
6216      it's a reduction chain.  */
6217   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6218       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6219     return false;
6220
6221   /* Reductions that are not used even in an enclosing outer-loop,
6222      are expected to be "live" (used out of the loop).  */
6223   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6224       && !STMT_VINFO_LIVE_P (stmt_info))
6225     return false;
6226
6227   /* 2. Has this been recognized as a reduction pattern?
6228
6229      Check if STMT represents a pattern that has been recognized
6230      in earlier analysis stages.  For stmts that represent a pattern,
6231      the STMT_VINFO_RELATED_STMT field records the last stmt in
6232      the original sequence that constitutes the pattern.  */
6233
6234   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6235   if (orig_stmt_info)
6236     {
6237       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6238       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6239     }
6240
6241   /* 3. Check the operands of the operation.  The first operands are defined
6242         inside the loop body. The last operand is the reduction variable,
6243         which is defined by the loop-header-phi.  */
6244
6245   gcc_assert (is_gimple_assign (stmt));
6246
6247   /* Flatten RHS.  */
6248   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6249     {
6250     case GIMPLE_BINARY_RHS:
6251       code = gimple_assign_rhs_code (stmt);
6252       op_type = TREE_CODE_LENGTH (code);
6253       gcc_assert (op_type == binary_op);
6254       ops[0] = gimple_assign_rhs1 (stmt);
6255       ops[1] = gimple_assign_rhs2 (stmt);
6256       break;
6257
6258     case GIMPLE_TERNARY_RHS:
6259       code = gimple_assign_rhs_code (stmt);
6260       op_type = TREE_CODE_LENGTH (code);
6261       gcc_assert (op_type == ternary_op);
6262       ops[0] = gimple_assign_rhs1 (stmt);
6263       ops[1] = gimple_assign_rhs2 (stmt);
6264       ops[2] = gimple_assign_rhs3 (stmt);
6265       break;
6266
6267     case GIMPLE_UNARY_RHS:
6268       return false;
6269
6270     default:
6271       gcc_unreachable ();
6272     }
6273
6274   if (code == COND_EXPR && slp_node)
6275     return false;
6276
6277   scalar_dest = gimple_assign_lhs (stmt);
6278   scalar_type = TREE_TYPE (scalar_dest);
6279   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6280       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6281     return false;
6282
6283   /* Do not try to vectorize bit-precision reductions.  */
6284   if (!type_has_mode_precision_p (scalar_type))
6285     return false;
6286
6287   /* All uses but the last are expected to be defined in the loop.
6288      The last use is the reduction variable.  In case of nested cycle this
6289      assumption is not true: we use reduc_index to record the index of the
6290      reduction variable.  */
6291   stmt_vec_info reduc_def_info = NULL;
6292   int reduc_index = -1;
6293   for (i = 0; i < op_type; i++)
6294     {
6295       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6296       if (i == 0 && code == COND_EXPR)
6297         continue;
6298
6299       stmt_vec_info def_stmt_info;
6300       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6301                                           &def_stmt_info);
6302       dt = dts[i];
6303       gcc_assert (is_simple_use);
6304       if (dt == vect_reduction_def)
6305         {
6306           reduc_def_info = def_stmt_info;
6307           reduc_index = i;
6308           continue;
6309         }
6310       else if (tem)
6311         {
6312           /* To properly compute ncopies we are interested in the widest
6313              input type in case we're looking at a widening accumulation.  */
6314           if (!vectype_in
6315               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6316                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6317             vectype_in = tem;
6318         }
6319
6320       if (dt != vect_internal_def
6321           && dt != vect_external_def
6322           && dt != vect_constant_def
6323           && dt != vect_induction_def
6324           && !(dt == vect_nested_cycle && nested_cycle))
6325         return false;
6326
6327       if (dt == vect_nested_cycle)
6328         {
6329           found_nested_cycle_def = true;
6330           reduc_def_info = def_stmt_info;
6331           reduc_index = i;
6332         }
6333
6334       if (i == 1 && code == COND_EXPR)
6335         {
6336           /* Record how value of COND_EXPR is defined.  */
6337           if (dt == vect_constant_def)
6338             {
6339               cond_reduc_dt = dt;
6340               cond_reduc_val = ops[i];
6341             }
6342           if (dt == vect_induction_def
6343               && def_stmt_info
6344               && is_nonwrapping_integer_induction (def_stmt_info, loop))
6345             {
6346               cond_reduc_dt = dt;
6347               cond_reduc_def_stmt = def_stmt_info;
6348             }
6349         }
6350     }
6351
6352   if (!vectype_in)
6353     vectype_in = vectype_out;
6354
6355   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6356      directy used in stmt.  */
6357   if (reduc_index == -1)
6358     {
6359       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6360         {
6361           if (dump_enabled_p ())
6362             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6363                              "in-order reduction chain without SLP.\n");
6364           return false;
6365         }
6366
6367       if (orig_stmt_info)
6368         reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6369       else
6370         reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6371     }
6372
6373   if (! reduc_def_info)
6374     return false;
6375
6376   gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6377   if (!reduc_def_phi)
6378     return false;
6379
6380   if (!(reduc_index == -1
6381         || dts[reduc_index] == vect_reduction_def
6382         || dts[reduc_index] == vect_nested_cycle
6383         || ((dts[reduc_index] == vect_internal_def
6384              || dts[reduc_index] == vect_external_def
6385              || dts[reduc_index] == vect_constant_def
6386              || dts[reduc_index] == vect_induction_def)
6387             && nested_cycle && found_nested_cycle_def)))
6388     {
6389       /* For pattern recognized stmts, orig_stmt might be a reduction,
6390          but some helper statements for the pattern might not, or
6391          might be COND_EXPRs with reduction uses in the condition.  */
6392       gcc_assert (orig_stmt_info);
6393       return false;
6394     }
6395
6396   /* PHIs should not participate in patterns.  */
6397   gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6398   enum vect_reduction_type v_reduc_type
6399     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6400   stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6401
6402   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6403   /* If we have a condition reduction, see if we can simplify it further.  */
6404   if (v_reduc_type == COND_REDUCTION)
6405     {
6406       /* TODO: We can't yet handle reduction chains, since we need to treat
6407          each COND_EXPR in the chain specially, not just the last one.
6408          E.g. for:
6409
6410             x_1 = PHI <x_3, ...>
6411             x_2 = a_2 ? ... : x_1;
6412             x_3 = a_3 ? ... : x_2;
6413
6414          we're interested in the last element in x_3 for which a_2 || a_3
6415          is true, whereas the current reduction chain handling would
6416          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6417          as a reduction operation.  */
6418       if (reduc_index == -1)
6419         {
6420           if (dump_enabled_p ())
6421             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6422                              "conditional reduction chains not supported\n");
6423           return false;
6424         }
6425
6426       /* vect_is_simple_reduction ensured that operand 2 is the
6427          loop-carried operand.  */
6428       gcc_assert (reduc_index == 2);
6429
6430       /* Loop peeling modifies initial value of reduction PHI, which
6431          makes the reduction stmt to be transformed different to the
6432          original stmt analyzed.  We need to record reduction code for
6433          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6434          it can be used directly at transform stage.  */
6435       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6436           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6437         {
6438           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6439           gcc_assert (cond_reduc_dt == vect_constant_def);
6440           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6441         }
6442       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6443                                                vectype_in, OPTIMIZE_FOR_SPEED))
6444         {
6445           if (dump_enabled_p ())
6446             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6447                              "optimizing condition reduction with"
6448                              " FOLD_EXTRACT_LAST.\n");
6449           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6450         }
6451       else if (cond_reduc_dt == vect_induction_def)
6452         {
6453           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6454           tree base
6455             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6456           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6457
6458           gcc_assert (TREE_CODE (base) == INTEGER_CST
6459                       && TREE_CODE (step) == INTEGER_CST);
6460           cond_reduc_val = NULL_TREE;
6461           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6462              above base; punt if base is the minimum value of the type for
6463              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6464           if (tree_int_cst_sgn (step) == -1)
6465             {
6466               cond_reduc_op_code = MIN_EXPR;
6467               if (tree_int_cst_sgn (base) == -1)
6468                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6469               else if (tree_int_cst_lt (base,
6470                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6471                 cond_reduc_val
6472                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6473             }
6474           else
6475             {
6476               cond_reduc_op_code = MAX_EXPR;
6477               if (tree_int_cst_sgn (base) == 1)
6478                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6479               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6480                                         base))
6481                 cond_reduc_val
6482                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6483             }
6484           if (cond_reduc_val)
6485             {
6486               if (dump_enabled_p ())
6487                 dump_printf_loc (MSG_NOTE, vect_location,
6488                                  "condition expression based on "
6489                                  "integer induction.\n");
6490               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6491                 = INTEGER_INDUC_COND_REDUCTION;
6492             }
6493         }
6494       else if (cond_reduc_dt == vect_constant_def)
6495         {
6496           enum vect_def_type cond_initial_dt;
6497           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6498           tree cond_initial_val
6499             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6500
6501           gcc_assert (cond_reduc_val != NULL_TREE);
6502           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6503           if (cond_initial_dt == vect_constant_def
6504               && types_compatible_p (TREE_TYPE (cond_initial_val),
6505                                      TREE_TYPE (cond_reduc_val)))
6506             {
6507               tree e = fold_binary (LE_EXPR, boolean_type_node,
6508                                     cond_initial_val, cond_reduc_val);
6509               if (e && (integer_onep (e) || integer_zerop (e)))
6510                 {
6511                   if (dump_enabled_p ())
6512                     dump_printf_loc (MSG_NOTE, vect_location,
6513                                      "condition expression based on "
6514                                      "compile time constant.\n");
6515                   /* Record reduction code at analysis stage.  */
6516                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6517                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6518                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6519                     = CONST_COND_REDUCTION;
6520                 }
6521             }
6522         }
6523     }
6524
6525   if (orig_stmt_info)
6526     gcc_assert (tmp == orig_stmt_info
6527                 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6528   else
6529     /* We changed STMT to be the first stmt in reduction chain, hence we
6530        check that in this case the first element in the chain is STMT.  */
6531     gcc_assert (tmp == stmt_info
6532                 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6533
6534   if (STMT_VINFO_LIVE_P (reduc_def_info))
6535     return false;
6536
6537   if (slp_node)
6538     ncopies = 1;
6539   else
6540     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6541
6542   gcc_assert (ncopies >= 1);
6543
6544   vec_mode = TYPE_MODE (vectype_in);
6545   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6546
6547   if (code == COND_EXPR)
6548     {
6549       /* Only call during the analysis stage, otherwise we'll lose
6550          STMT_VINFO_TYPE.  */
6551       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6552                                                 ops[reduc_index], 0, NULL,
6553                                                 cost_vec))
6554         {
6555           if (dump_enabled_p ())
6556             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6557                              "unsupported condition in reduction\n");
6558           return false;
6559         }
6560     }
6561   else
6562     {
6563       /* 4. Supportable by target?  */
6564
6565       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6566           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6567         {
6568           /* Shifts and rotates are only supported by vectorizable_shifts,
6569              not vectorizable_reduction.  */
6570           if (dump_enabled_p ())
6571             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6572                              "unsupported shift or rotation.\n");
6573           return false;
6574         }
6575
6576       /* 4.1. check support for the operation in the loop  */
6577       optab = optab_for_tree_code (code, vectype_in, optab_default);
6578       if (!optab)
6579         {
6580           if (dump_enabled_p ())
6581             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582                              "no optab.\n");
6583
6584           return false;
6585         }
6586
6587       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6588         {
6589           if (dump_enabled_p ())
6590             dump_printf (MSG_NOTE, "op not supported by target.\n");
6591
6592           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6593               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6594             return false;
6595
6596           if (dump_enabled_p ())
6597             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6598         }
6599
6600       /* Worthwhile without SIMD support?  */
6601       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6602           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6603         {
6604           if (dump_enabled_p ())
6605             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6606                              "not worthwhile without SIMD support.\n");
6607
6608           return false;
6609         }
6610     }
6611
6612   /* 4.2. Check support for the epilog operation.
6613
6614           If STMT represents a reduction pattern, then the type of the
6615           reduction variable may be different than the type of the rest
6616           of the arguments.  For example, consider the case of accumulation
6617           of shorts into an int accumulator; The original code:
6618                         S1: int_a = (int) short_a;
6619           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6620
6621           was replaced with:
6622                         STMT: int_acc = widen_sum <short_a, int_acc>
6623
6624           This means that:
6625           1. The tree-code that is used to create the vector operation in the
6626              epilog code (that reduces the partial results) is not the
6627              tree-code of STMT, but is rather the tree-code of the original
6628              stmt from the pattern that STMT is replacing.  I.e, in the example
6629              above we want to use 'widen_sum' in the loop, but 'plus' in the
6630              epilog.
6631           2. The type (mode) we use to check available target support
6632              for the vector operation to be created in the *epilog*, is
6633              determined by the type of the reduction variable (in the example
6634              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6635              However the type (mode) we use to check available target support
6636              for the vector operation to be created *inside the loop*, is
6637              determined by the type of the other arguments to STMT (in the
6638              example we'd check this: optab_handler (widen_sum_optab,
6639              vect_short_mode)).
6640
6641           This is contrary to "regular" reductions, in which the types of all
6642           the arguments are the same as the type of the reduction variable.
6643           For "regular" reductions we can therefore use the same vector type
6644           (and also the same tree-code) when generating the epilog code and
6645           when generating the code inside the loop.  */
6646
6647   vect_reduction_type reduction_type
6648     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6649   if (orig_stmt_info
6650       && (reduction_type == TREE_CODE_REDUCTION
6651           || reduction_type == FOLD_LEFT_REDUCTION))
6652     {
6653       /* This is a reduction pattern: get the vectype from the type of the
6654          reduction variable, and get the tree-code from orig_stmt.  */
6655       orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6656       gcc_assert (vectype_out);
6657       vec_mode = TYPE_MODE (vectype_out);
6658     }
6659   else
6660     {
6661       /* Regular reduction: use the same vectype and tree-code as used for
6662          the vector code inside the loop can be used for the epilog code. */
6663       orig_code = code;
6664
6665       if (code == MINUS_EXPR)
6666         orig_code = PLUS_EXPR;
6667
6668       /* For simple condition reductions, replace with the actual expression
6669          we want to base our reduction around.  */
6670       if (reduction_type == CONST_COND_REDUCTION)
6671         {
6672           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6673           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6674         }
6675       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6676         orig_code = cond_reduc_op_code;
6677     }
6678
6679   if (nested_cycle)
6680     {
6681       def_bb = gimple_bb (reduc_def_phi);
6682       def_stmt_loop = def_bb->loop_father;
6683       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6684                                        loop_preheader_edge (def_stmt_loop));
6685       stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6686       if (def_arg_stmt_info
6687           && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6688               == vect_double_reduction_def))
6689         double_reduc = true;
6690     }
6691
6692   reduc_fn = IFN_LAST;
6693
6694   if (reduction_type == TREE_CODE_REDUCTION
6695       || reduction_type == FOLD_LEFT_REDUCTION
6696       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6697       || reduction_type == CONST_COND_REDUCTION)
6698     {
6699       if (reduction_type == FOLD_LEFT_REDUCTION
6700           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6701           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6702         {
6703           if (reduc_fn != IFN_LAST
6704               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6705                                                   OPTIMIZE_FOR_SPEED))
6706             {
6707               if (dump_enabled_p ())
6708                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6709                                  "reduc op not supported by target.\n");
6710
6711               reduc_fn = IFN_LAST;
6712             }
6713         }
6714       else
6715         {
6716           if (!nested_cycle || double_reduc)
6717             {
6718               if (dump_enabled_p ())
6719                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6720                                  "no reduc code for scalar code.\n");
6721
6722               return false;
6723             }
6724         }
6725     }
6726   else if (reduction_type == COND_REDUCTION)
6727     {
6728       int scalar_precision
6729         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6730       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6731       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6732                                                 nunits_out);
6733
6734       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6735                                           OPTIMIZE_FOR_SPEED))
6736         reduc_fn = IFN_REDUC_MAX;
6737     }
6738
6739   if (reduction_type != EXTRACT_LAST_REDUCTION
6740       && reduc_fn == IFN_LAST
6741       && !nunits_out.is_constant ())
6742     {
6743       if (dump_enabled_p ())
6744         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6745                          "missing target support for reduction on"
6746                          " variable-length vectors.\n");
6747       return false;
6748     }
6749
6750   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6751       && ncopies > 1)
6752     {
6753       if (dump_enabled_p ())
6754         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755                          "multiple types in double reduction or condition "
6756                          "reduction.\n");
6757       return false;
6758     }
6759
6760   /* For SLP reductions, see if there is a neutral value we can use.  */
6761   tree neutral_op = NULL_TREE;
6762   if (slp_node)
6763     neutral_op = neutral_op_for_slp_reduction
6764       (slp_node_instance->reduc_phis, code,
6765        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL_STMT_VEC_INFO);
6766
6767   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6768     {
6769       /* We can't support in-order reductions of code such as this:
6770
6771            for (int i = 0; i < n1; ++i)
6772              for (int j = 0; j < n2; ++j)
6773                l += a[j];
6774
6775          since GCC effectively transforms the loop when vectorizing:
6776
6777            for (int i = 0; i < n1 / VF; ++i)
6778              for (int j = 0; j < n2; ++j)
6779                for (int k = 0; k < VF; ++k)
6780                  l += a[j];
6781
6782          which is a reassociation of the original operation.  */
6783       if (dump_enabled_p ())
6784         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785                          "in-order double reduction not supported.\n");
6786
6787       return false;
6788     }
6789
6790   if (reduction_type == FOLD_LEFT_REDUCTION
6791       && slp_node
6792       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6793     {
6794       /* We cannot use in-order reductions in this case because there is
6795          an implicit reassociation of the operations involved.  */
6796       if (dump_enabled_p ())
6797         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6798                          "in-order unchained SLP reductions not supported.\n");
6799       return false;
6800     }
6801
6802   /* For double reductions, and for SLP reductions with a neutral value,
6803      we construct a variable-length initial vector by loading a vector
6804      full of the neutral value and then shift-and-inserting the start
6805      values into the low-numbered elements.  */
6806   if ((double_reduc || neutral_op)
6807       && !nunits_out.is_constant ()
6808       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6809                                           vectype_out, OPTIMIZE_FOR_SPEED))
6810     {
6811       if (dump_enabled_p ())
6812         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6813                          "reduction on variable-length vectors requires"
6814                          " target support for a vector-shift-and-insert"
6815                          " operation.\n");
6816       return false;
6817     }
6818
6819   /* Check extra constraints for variable-length unchained SLP reductions.  */
6820   if (STMT_SLP_TYPE (stmt_info)
6821       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6822       && !nunits_out.is_constant ())
6823     {
6824       /* We checked above that we could build the initial vector when
6825          there's a neutral element value.  Check here for the case in
6826          which each SLP statement has its own initial value and in which
6827          that value needs to be repeated for every instance of the
6828          statement within the initial vector.  */
6829       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6830       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6831       if (!neutral_op
6832           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6833         {
6834           if (dump_enabled_p ())
6835             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6836                              "unsupported form of SLP reduction for"
6837                              " variable-length vectors: cannot build"
6838                              " initial vector.\n");
6839           return false;
6840         }
6841       /* The epilogue code relies on the number of elements being a multiple
6842          of the group size.  The duplicate-and-interleave approach to setting
6843          up the the initial vector does too.  */
6844       if (!multiple_p (nunits_out, group_size))
6845         {
6846           if (dump_enabled_p ())
6847             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6848                              "unsupported form of SLP reduction for"
6849                              " variable-length vectors: the vector size"
6850                              " is not a multiple of the number of results.\n");
6851           return false;
6852         }
6853     }
6854
6855   /* In case of widenning multiplication by a constant, we update the type
6856      of the constant to be the type of the other operand.  We check that the
6857      constant fits the type in the pattern recognition pass.  */
6858   if (code == DOT_PROD_EXPR
6859       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6860     {
6861       if (TREE_CODE (ops[0]) == INTEGER_CST)
6862         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6863       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6864         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6865       else
6866         {
6867           if (dump_enabled_p ())
6868             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6869                              "invalid types in dot-prod\n");
6870
6871           return false;
6872         }
6873     }
6874
6875   if (reduction_type == COND_REDUCTION)
6876     {
6877       widest_int ni;
6878
6879       if (! max_loop_iterations (loop, &ni))
6880         {
6881           if (dump_enabled_p ())
6882             dump_printf_loc (MSG_NOTE, vect_location,
6883                              "loop count not known, cannot create cond "
6884                              "reduction.\n");
6885           return false;
6886         }
6887       /* Convert backedges to iterations.  */
6888       ni += 1;
6889
6890       /* The additional index will be the same type as the condition.  Check
6891          that the loop can fit into this less one (because we'll use up the
6892          zero slot for when there are no matches).  */
6893       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6894       if (wi::geu_p (ni, wi::to_widest (max_index)))
6895         {
6896           if (dump_enabled_p ())
6897             dump_printf_loc (MSG_NOTE, vect_location,
6898                              "loop size is greater than data size.\n");
6899           return false;
6900         }
6901     }
6902
6903   /* In case the vectorization factor (VF) is bigger than the number
6904      of elements that we can fit in a vectype (nunits), we have to generate
6905      more than one vector stmt - i.e - we need to "unroll" the
6906      vector stmt by a factor VF/nunits.  For more details see documentation
6907      in vectorizable_operation.  */
6908
6909   /* If the reduction is used in an outer loop we need to generate
6910      VF intermediate results, like so (e.g. for ncopies=2):
6911         r0 = phi (init, r0)
6912         r1 = phi (init, r1)
6913         r0 = x0 + r0;
6914         r1 = x1 + r1;
6915     (i.e. we generate VF results in 2 registers).
6916     In this case we have a separate def-use cycle for each copy, and therefore
6917     for each copy we get the vector def for the reduction variable from the
6918     respective phi node created for this copy.
6919
6920     Otherwise (the reduction is unused in the loop nest), we can combine
6921     together intermediate results, like so (e.g. for ncopies=2):
6922         r = phi (init, r)
6923         r = x0 + r;
6924         r = x1 + r;
6925    (i.e. we generate VF/2 results in a single register).
6926    In this case for each copy we get the vector def for the reduction variable
6927    from the vectorized reduction operation generated in the previous iteration.
6928
6929    This only works when we see both the reduction PHI and its only consumer
6930    in vectorizable_reduction and there are no intermediate stmts
6931    participating.  */
6932   stmt_vec_info use_stmt_info;
6933   tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6934   if (ncopies > 1
6935       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6936       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6937       && (use_stmt_info == stmt_info
6938           || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6939     {
6940       single_defuse_cycle = true;
6941       epilog_copies = 1;
6942     }
6943   else
6944     epilog_copies = ncopies;
6945
6946   /* If the reduction stmt is one of the patterns that have lane
6947      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6948   if ((ncopies > 1
6949        && ! single_defuse_cycle)
6950       && (code == DOT_PROD_EXPR
6951           || code == WIDEN_SUM_EXPR
6952           || code == SAD_EXPR))
6953     {
6954       if (dump_enabled_p ())
6955         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6956                          "multi def-use cycle not possible for lane-reducing "
6957                          "reduction operation\n");
6958       return false;
6959     }
6960
6961   if (slp_node)
6962     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6963   else
6964     vec_num = 1;
6965
6966   internal_fn cond_fn = get_conditional_internal_fn (code);
6967   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6968
6969   if (!vec_stmt) /* transformation not required.  */
6970     {
6971       vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6972       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6973         {
6974           if (reduction_type != FOLD_LEFT_REDUCTION
6975               && (cond_fn == IFN_LAST
6976                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6977                                                       OPTIMIZE_FOR_SPEED)))
6978             {
6979               if (dump_enabled_p ())
6980                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6981                                  "can't use a fully-masked loop because no"
6982                                  " conditional operation is available.\n");
6983               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6984             }
6985           else if (reduc_index == -1)
6986             {
6987               if (dump_enabled_p ())
6988                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6989                                  "can't use a fully-masked loop for chained"
6990                                  " reductions.\n");
6991               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6992             }
6993           else
6994             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6995                                    vectype_in);
6996         }
6997       if (dump_enabled_p ()
6998           && reduction_type == FOLD_LEFT_REDUCTION)
6999         dump_printf_loc (MSG_NOTE, vect_location,
7000                          "using an in-order (fold-left) reduction.\n");
7001       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7002       return true;
7003     }
7004
7005   /* Transform.  */
7006
7007   if (dump_enabled_p ())
7008     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7009
7010   /* FORNOW: Multiple types are not supported for condition.  */
7011   if (code == COND_EXPR)
7012     gcc_assert (ncopies == 1);
7013
7014   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7015
7016   if (reduction_type == FOLD_LEFT_REDUCTION)
7017     return vectorize_fold_left_reduction
7018       (stmt, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7019        reduc_fn, ops, vectype_in, reduc_index, masks);
7020
7021   if (reduction_type == EXTRACT_LAST_REDUCTION)
7022     {
7023       gcc_assert (!slp_node);
7024       return vectorizable_condition (stmt, gsi, vec_stmt,
7025                                      NULL, reduc_index, NULL, NULL);
7026     }
7027
7028   /* Create the destination vector  */
7029   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7030
7031   prev_stmt_info = NULL;
7032   prev_phi_info = NULL;
7033   if (!slp_node)
7034     {
7035       vec_oprnds0.create (1);
7036       vec_oprnds1.create (1);
7037       if (op_type == ternary_op)
7038         vec_oprnds2.create (1);
7039     }
7040
7041   phis.create (vec_num);
7042   vect_defs.create (vec_num);
7043   if (!slp_node)
7044     vect_defs.quick_push (NULL_TREE);
7045
7046   if (slp_node)
7047     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7048   else
7049     phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7050
7051   for (j = 0; j < ncopies; j++)
7052     {
7053       if (code == COND_EXPR)
7054         {
7055           gcc_assert (!slp_node);
7056           vectorizable_condition (stmt, gsi, vec_stmt,
7057                                   PHI_RESULT (phis[0]->stmt),
7058                                   reduc_index, NULL, NULL);
7059           /* Multiple types are not supported for condition.  */
7060           break;
7061         }
7062
7063       /* Handle uses.  */
7064       if (j == 0)
7065         {
7066           if (slp_node)
7067             {
7068               /* Get vec defs for all the operands except the reduction index,
7069                  ensuring the ordering of the ops in the vector is kept.  */
7070               auto_vec<tree, 3> slp_ops;
7071               auto_vec<vec<tree>, 3> vec_defs;
7072
7073               slp_ops.quick_push (ops[0]);
7074               slp_ops.quick_push (ops[1]);
7075               if (op_type == ternary_op)
7076                 slp_ops.quick_push (ops[2]);
7077
7078               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7079
7080               vec_oprnds0.safe_splice (vec_defs[0]);
7081               vec_defs[0].release ();
7082               vec_oprnds1.safe_splice (vec_defs[1]);
7083               vec_defs[1].release ();
7084               if (op_type == ternary_op)
7085                 {
7086                   vec_oprnds2.safe_splice (vec_defs[2]);
7087                   vec_defs[2].release ();
7088                 }
7089             }
7090           else
7091             {
7092               vec_oprnds0.quick_push
7093                 (vect_get_vec_def_for_operand (ops[0], stmt));
7094               vec_oprnds1.quick_push
7095                 (vect_get_vec_def_for_operand (ops[1], stmt));
7096               if (op_type == ternary_op)
7097                 vec_oprnds2.quick_push
7098                   (vect_get_vec_def_for_operand (ops[2], stmt));
7099             }
7100         }
7101       else
7102         {
7103           if (!slp_node)
7104             {
7105               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7106
7107               if (single_defuse_cycle && reduc_index == 0)
7108                 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7109               else
7110                 vec_oprnds0[0]
7111                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7112               if (single_defuse_cycle && reduc_index == 1)
7113                 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7114               else
7115                 vec_oprnds1[0]
7116                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7117               if (op_type == ternary_op)
7118                 {
7119                   if (single_defuse_cycle && reduc_index == 2)
7120                     vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7121                   else
7122                     vec_oprnds2[0]
7123                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7124                 }
7125             }
7126         }
7127
7128       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7129         {
7130           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7131           if (masked_loop_p)
7132             {
7133               /* Make sure that the reduction accumulator is vop[0].  */
7134               if (reduc_index == 1)
7135                 {
7136                   gcc_assert (commutative_tree_code (code));
7137                   std::swap (vop[0], vop[1]);
7138                 }
7139               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7140                                               vectype_in, i * ncopies + j);
7141               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7142                                                         vop[0], vop[1],
7143                                                         vop[0]);
7144               new_temp = make_ssa_name (vec_dest, call);
7145               gimple_call_set_lhs (call, new_temp);
7146               gimple_call_set_nothrow (call, true);
7147               new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
7148             }
7149           else
7150             {
7151               if (op_type == ternary_op)
7152                 vop[2] = vec_oprnds2[i];
7153
7154               gassign *new_stmt = gimple_build_assign (vec_dest, code,
7155                                                        vop[0], vop[1], vop[2]);
7156               new_temp = make_ssa_name (vec_dest, new_stmt);
7157               gimple_assign_set_lhs (new_stmt, new_temp);
7158               new_stmt_info
7159                 = vect_finish_stmt_generation (stmt, new_stmt, gsi);
7160             }
7161
7162           if (slp_node)
7163             {
7164               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7165               vect_defs.quick_push (new_temp);
7166             }
7167           else
7168             vect_defs[0] = new_temp;
7169         }
7170
7171       if (slp_node)
7172         continue;
7173
7174       if (j == 0)
7175         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7176       else
7177         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7178
7179       prev_stmt_info = new_stmt_info;
7180     }
7181
7182   /* Finalize the reduction-phi (set its arguments) and create the
7183      epilog reduction code.  */
7184   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7185     vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7186
7187   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_phi,
7188                                     epilog_copies, reduc_fn, phis,
7189                                     double_reduc, slp_node, slp_node_instance,
7190                                     cond_reduc_val, cond_reduc_op_code,
7191                                     neutral_op);
7192
7193   return true;
7194 }
7195
7196 /* Function vect_min_worthwhile_factor.
7197
7198    For a loop where we could vectorize the operation indicated by CODE,
7199    return the minimum vectorization factor that makes it worthwhile
7200    to use generic vectors.  */
7201 static unsigned int
7202 vect_min_worthwhile_factor (enum tree_code code)
7203 {
7204   switch (code)
7205     {
7206     case PLUS_EXPR:
7207     case MINUS_EXPR:
7208     case NEGATE_EXPR:
7209       return 4;
7210
7211     case BIT_AND_EXPR:
7212     case BIT_IOR_EXPR:
7213     case BIT_XOR_EXPR:
7214     case BIT_NOT_EXPR:
7215       return 2;
7216
7217     default:
7218       return INT_MAX;
7219     }
7220 }
7221
7222 /* Return true if VINFO indicates we are doing loop vectorization and if
7223    it is worth decomposing CODE operations into scalar operations for
7224    that loop's vectorization factor.  */
7225
7226 bool
7227 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7228 {
7229   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7230   unsigned HOST_WIDE_INT value;
7231   return (loop_vinfo
7232           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7233           && value >= vect_min_worthwhile_factor (code));
7234 }
7235
7236 /* Function vectorizable_induction
7237
7238    Check if PHI performs an induction computation that can be vectorized.
7239    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7240    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7241    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7242
7243 bool
7244 vectorizable_induction (gimple *phi,
7245                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7246                         stmt_vec_info *vec_stmt, slp_tree slp_node,
7247                         stmt_vector_for_cost *cost_vec)
7248 {
7249   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7250   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7251   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7252   unsigned ncopies;
7253   bool nested_in_vect_loop = false;
7254   struct loop *iv_loop;
7255   tree vec_def;
7256   edge pe = loop_preheader_edge (loop);
7257   basic_block new_bb;
7258   tree new_vec, vec_init, vec_step, t;
7259   tree new_name;
7260   gimple *new_stmt;
7261   gphi *induction_phi;
7262   tree induc_def, vec_dest;
7263   tree init_expr, step_expr;
7264   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7265   unsigned i;
7266   tree expr;
7267   gimple_seq stmts;
7268   imm_use_iterator imm_iter;
7269   use_operand_p use_p;
7270   gimple *exit_phi;
7271   edge latch_e;
7272   tree loop_arg;
7273   gimple_stmt_iterator si;
7274   basic_block bb = gimple_bb (phi);
7275
7276   if (gimple_code (phi) != GIMPLE_PHI)
7277     return false;
7278
7279   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7280     return false;
7281
7282   /* Make sure it was recognized as induction computation.  */
7283   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7284     return false;
7285
7286   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7287   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7288
7289   if (slp_node)
7290     ncopies = 1;
7291   else
7292     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7293   gcc_assert (ncopies >= 1);
7294
7295   /* FORNOW. These restrictions should be relaxed.  */
7296   if (nested_in_vect_loop_p (loop, phi))
7297     {
7298       imm_use_iterator imm_iter;
7299       use_operand_p use_p;
7300       gimple *exit_phi;
7301       edge latch_e;
7302       tree loop_arg;
7303
7304       if (ncopies > 1)
7305         {
7306           if (dump_enabled_p ())
7307             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7308                              "multiple types in nested loop.\n");
7309           return false;
7310         }
7311
7312       /* FORNOW: outer loop induction with SLP not supported.  */
7313       if (STMT_SLP_TYPE (stmt_info))
7314         return false;
7315
7316       exit_phi = NULL;
7317       latch_e = loop_latch_edge (loop->inner);
7318       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7319       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7320         {
7321           gimple *use_stmt = USE_STMT (use_p);
7322           if (is_gimple_debug (use_stmt))
7323             continue;
7324
7325           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7326             {
7327               exit_phi = use_stmt;
7328               break;
7329             }
7330         }
7331       if (exit_phi)
7332         {
7333           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7334           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7335                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7336             {
7337               if (dump_enabled_p ())
7338                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7339                                  "inner-loop induction only used outside "
7340                                  "of the outer vectorized loop.\n");
7341               return false;
7342             }
7343         }
7344
7345       nested_in_vect_loop = true;
7346       iv_loop = loop->inner;
7347     }
7348   else
7349     iv_loop = loop;
7350   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7351
7352   if (slp_node && !nunits.is_constant ())
7353     {
7354       /* The current SLP code creates the initial value element-by-element.  */
7355       if (dump_enabled_p ())
7356         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7357                          "SLP induction not supported for variable-length"
7358                          " vectors.\n");
7359       return false;
7360     }
7361
7362   if (!vec_stmt) /* transformation not required.  */
7363     {
7364       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7365       DUMP_VECT_SCOPE ("vectorizable_induction");
7366       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7367       return true;
7368     }
7369
7370   /* Transform.  */
7371
7372   /* Compute a vector variable, initialized with the first VF values of
7373      the induction variable.  E.g., for an iv with IV_PHI='X' and
7374      evolution S, for a vector of 4 units, we want to compute:
7375      [X, X + S, X + 2*S, X + 3*S].  */
7376
7377   if (dump_enabled_p ())
7378     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7379
7380   latch_e = loop_latch_edge (iv_loop);
7381   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7382
7383   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7384   gcc_assert (step_expr != NULL_TREE);
7385
7386   pe = loop_preheader_edge (iv_loop);
7387   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7388                                      loop_preheader_edge (iv_loop));
7389
7390   stmts = NULL;
7391   if (!nested_in_vect_loop)
7392     {
7393       /* Convert the initial value to the desired type.  */
7394       tree new_type = TREE_TYPE (vectype);
7395       init_expr = gimple_convert (&stmts, new_type, init_expr);
7396
7397       /* If we are using the loop mask to "peel" for alignment then we need
7398          to adjust the start value here.  */
7399       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7400       if (skip_niters != NULL_TREE)
7401         {
7402           if (FLOAT_TYPE_P (vectype))
7403             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7404                                         skip_niters);
7405           else
7406             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7407           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7408                                          skip_niters, step_expr);
7409           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7410                                     init_expr, skip_step);
7411         }
7412     }
7413
7414   /* Convert the step to the desired type.  */
7415   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7416
7417   if (stmts)
7418     {
7419       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7420       gcc_assert (!new_bb);
7421     }
7422
7423   /* Find the first insertion point in the BB.  */
7424   si = gsi_after_labels (bb);
7425
7426   /* For SLP induction we have to generate several IVs as for example
7427      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7428      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7429      [VF*S, VF*S, VF*S, VF*S] for all.  */
7430   if (slp_node)
7431     {
7432       /* Enforced above.  */
7433       unsigned int const_nunits = nunits.to_constant ();
7434
7435       /* Generate [VF*S, VF*S, ... ].  */
7436       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7437         {
7438           expr = build_int_cst (integer_type_node, vf);
7439           expr = fold_convert (TREE_TYPE (step_expr), expr);
7440         }
7441       else
7442         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7443       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7444                               expr, step_expr);
7445       if (! CONSTANT_CLASS_P (new_name))
7446         new_name = vect_init_vector (phi, new_name,
7447                                      TREE_TYPE (step_expr), NULL);
7448       new_vec = build_vector_from_val (vectype, new_name);
7449       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7450
7451       /* Now generate the IVs.  */
7452       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7453       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7454       unsigned elts = const_nunits * nvects;
7455       unsigned nivs = least_common_multiple (group_size,
7456                                              const_nunits) / const_nunits;
7457       gcc_assert (elts % group_size == 0);
7458       tree elt = init_expr;
7459       unsigned ivn;
7460       for (ivn = 0; ivn < nivs; ++ivn)
7461         {
7462           tree_vector_builder elts (vectype, const_nunits, 1);
7463           stmts = NULL;
7464           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7465             {
7466               if (ivn*const_nunits + eltn >= group_size
7467                   && (ivn * const_nunits + eltn) % group_size == 0)
7468                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7469                                     elt, step_expr);
7470               elts.quick_push (elt);
7471             }
7472           vec_init = gimple_build_vector (&stmts, &elts);
7473           if (stmts)
7474             {
7475               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7476               gcc_assert (!new_bb);
7477             }
7478
7479           /* Create the induction-phi that defines the induction-operand.  */
7480           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7481           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7482           stmt_vec_info induction_phi_info
7483             = loop_vinfo->add_stmt (induction_phi);
7484           induc_def = PHI_RESULT (induction_phi);
7485
7486           /* Create the iv update inside the loop  */
7487           vec_def = make_ssa_name (vec_dest);
7488           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7489           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7490           loop_vinfo->add_stmt (new_stmt);
7491
7492           /* Set the arguments of the phi node:  */
7493           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7494           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7495                        UNKNOWN_LOCATION);
7496
7497           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7498         }
7499
7500       /* Re-use IVs when we can.  */
7501       if (ivn < nvects)
7502         {
7503           unsigned vfp
7504             = least_common_multiple (group_size, const_nunits) / group_size;
7505           /* Generate [VF'*S, VF'*S, ... ].  */
7506           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7507             {
7508               expr = build_int_cst (integer_type_node, vfp);
7509               expr = fold_convert (TREE_TYPE (step_expr), expr);
7510             }
7511           else
7512             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7513           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7514                                   expr, step_expr);
7515           if (! CONSTANT_CLASS_P (new_name))
7516             new_name = vect_init_vector (phi, new_name,
7517                                          TREE_TYPE (step_expr), NULL);
7518           new_vec = build_vector_from_val (vectype, new_name);
7519           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7520           for (; ivn < nvects; ++ivn)
7521             {
7522               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7523               tree def;
7524               if (gimple_code (iv) == GIMPLE_PHI)
7525                 def = gimple_phi_result (iv);
7526               else
7527                 def = gimple_assign_lhs (iv);
7528               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7529                                               PLUS_EXPR,
7530                                               def, vec_step);
7531               if (gimple_code (iv) == GIMPLE_PHI)
7532                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7533               else
7534                 {
7535                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7536                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7537                 }
7538               SLP_TREE_VEC_STMTS (slp_node).quick_push
7539                 (loop_vinfo->add_stmt (new_stmt));
7540             }
7541         }
7542
7543       return true;
7544     }
7545
7546   /* Create the vector that holds the initial_value of the induction.  */
7547   if (nested_in_vect_loop)
7548     {
7549       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7550          been created during vectorization of previous stmts.  We obtain it
7551          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7552       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7553       /* If the initial value is not of proper type, convert it.  */
7554       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7555         {
7556           new_stmt
7557             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7558                                                           vect_simple_var,
7559                                                           "vec_iv_"),
7560                                    VIEW_CONVERT_EXPR,
7561                                    build1 (VIEW_CONVERT_EXPR, vectype,
7562                                            vec_init));
7563           vec_init = gimple_assign_lhs (new_stmt);
7564           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7565                                                  new_stmt);
7566           gcc_assert (!new_bb);
7567           loop_vinfo->add_stmt (new_stmt);
7568         }
7569     }
7570   else
7571     {
7572       /* iv_loop is the loop to be vectorized. Create:
7573          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7574       stmts = NULL;
7575       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7576
7577       unsigned HOST_WIDE_INT const_nunits;
7578       if (nunits.is_constant (&const_nunits))
7579         {
7580           tree_vector_builder elts (vectype, const_nunits, 1);
7581           elts.quick_push (new_name);
7582           for (i = 1; i < const_nunits; i++)
7583             {
7584               /* Create: new_name_i = new_name + step_expr  */
7585               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7586                                        new_name, step_expr);
7587               elts.quick_push (new_name);
7588             }
7589           /* Create a vector from [new_name_0, new_name_1, ...,
7590              new_name_nunits-1]  */
7591           vec_init = gimple_build_vector (&stmts, &elts);
7592         }
7593       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7594         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7595         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7596                                  new_name, step_expr);
7597       else
7598         {
7599           /* Build:
7600                 [base, base, base, ...]
7601                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7602           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7603           gcc_assert (flag_associative_math);
7604           tree index = build_index_vector (vectype, 0, 1);
7605           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7606                                                         new_name);
7607           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7608                                                         step_expr);
7609           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7610           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7611                                    vec_init, step_vec);
7612           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7613                                    vec_init, base_vec);
7614         }
7615
7616       if (stmts)
7617         {
7618           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7619           gcc_assert (!new_bb);
7620         }
7621     }
7622
7623
7624   /* Create the vector that holds the step of the induction.  */
7625   if (nested_in_vect_loop)
7626     /* iv_loop is nested in the loop to be vectorized. Generate:
7627        vec_step = [S, S, S, S]  */
7628     new_name = step_expr;
7629   else
7630     {
7631       /* iv_loop is the loop to be vectorized. Generate:
7632           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7633       gimple_seq seq = NULL;
7634       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7635         {
7636           expr = build_int_cst (integer_type_node, vf);
7637           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7638         }
7639       else
7640         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7641       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7642                                expr, step_expr);
7643       if (seq)
7644         {
7645           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7646           gcc_assert (!new_bb);
7647         }
7648     }
7649
7650   t = unshare_expr (new_name);
7651   gcc_assert (CONSTANT_CLASS_P (new_name)
7652               || TREE_CODE (new_name) == SSA_NAME);
7653   new_vec = build_vector_from_val (vectype, t);
7654   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7655
7656
7657   /* Create the following def-use cycle:
7658      loop prolog:
7659          vec_init = ...
7660          vec_step = ...
7661      loop:
7662          vec_iv = PHI <vec_init, vec_loop>
7663          ...
7664          STMT
7665          ...
7666          vec_loop = vec_iv + vec_step;  */
7667
7668   /* Create the induction-phi that defines the induction-operand.  */
7669   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7670   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7671   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7672   induc_def = PHI_RESULT (induction_phi);
7673
7674   /* Create the iv update inside the loop  */
7675   vec_def = make_ssa_name (vec_dest);
7676   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7677   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7678   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7679
7680   /* Set the arguments of the phi node:  */
7681   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7682   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7683                UNKNOWN_LOCATION);
7684
7685   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7686
7687   /* In case that vectorization factor (VF) is bigger than the number
7688      of elements that we can fit in a vectype (nunits), we have to generate
7689      more than one vector stmt - i.e - we need to "unroll" the
7690      vector stmt by a factor VF/nunits.  For more details see documentation
7691      in vectorizable_operation.  */
7692
7693   if (ncopies > 1)
7694     {
7695       gimple_seq seq = NULL;
7696       stmt_vec_info prev_stmt_vinfo;
7697       /* FORNOW. This restriction should be relaxed.  */
7698       gcc_assert (!nested_in_vect_loop);
7699
7700       /* Create the vector that holds the step of the induction.  */
7701       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7702         {
7703           expr = build_int_cst (integer_type_node, nunits);
7704           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7705         }
7706       else
7707         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7708       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7709                                expr, step_expr);
7710       if (seq)
7711         {
7712           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7713           gcc_assert (!new_bb);
7714         }
7715
7716       t = unshare_expr (new_name);
7717       gcc_assert (CONSTANT_CLASS_P (new_name)
7718                   || TREE_CODE (new_name) == SSA_NAME);
7719       new_vec = build_vector_from_val (vectype, t);
7720       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7721
7722       vec_def = induc_def;
7723       prev_stmt_vinfo = induction_phi_info;
7724       for (i = 1; i < ncopies; i++)
7725         {
7726           /* vec_i = vec_prev + vec_step  */
7727           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7728                                           vec_def, vec_step);
7729           vec_def = make_ssa_name (vec_dest, new_stmt);
7730           gimple_assign_set_lhs (new_stmt, vec_def);
7731
7732           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7733           new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7734           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7735           prev_stmt_vinfo = new_stmt_info;
7736         }
7737     }
7738
7739   if (nested_in_vect_loop)
7740     {
7741       /* Find the loop-closed exit-phi of the induction, and record
7742          the final vector of induction results:  */
7743       exit_phi = NULL;
7744       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7745         {
7746           gimple *use_stmt = USE_STMT (use_p);
7747           if (is_gimple_debug (use_stmt))
7748             continue;
7749
7750           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7751             {
7752               exit_phi = use_stmt;
7753               break;
7754             }
7755         }
7756       if (exit_phi)
7757         {
7758           stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7759           /* FORNOW. Currently not supporting the case that an inner-loop induction
7760              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7761           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7762                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7763
7764           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7765           if (dump_enabled_p ())
7766             {
7767               dump_printf_loc (MSG_NOTE, vect_location,
7768                                "vector of inductions after inner-loop:");
7769               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7770             }
7771         }
7772     }
7773
7774
7775   if (dump_enabled_p ())
7776     {
7777       dump_printf_loc (MSG_NOTE, vect_location,
7778                        "transform induction: created def-use cycle: ");
7779       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7780       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7781                         SSA_NAME_DEF_STMT (vec_def), 0);
7782     }
7783
7784   return true;
7785 }
7786
7787 /* Function vectorizable_live_operation.
7788
7789    STMT computes a value that is used outside the loop.  Check if
7790    it can be supported.  */
7791
7792 bool
7793 vectorizable_live_operation (gimple *stmt,
7794                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7795                              slp_tree slp_node, int slp_index,
7796                              stmt_vec_info *vec_stmt,
7797                              stmt_vector_for_cost *)
7798 {
7799   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7800   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7801   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7802   imm_use_iterator imm_iter;
7803   tree lhs, lhs_type, bitsize, vec_bitsize;
7804   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7805   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7806   int ncopies;
7807   gimple *use_stmt;
7808   auto_vec<tree> vec_oprnds;
7809   int vec_entry = 0;
7810   poly_uint64 vec_index = 0;
7811
7812   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7813
7814   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7815     return false;
7816
7817   /* FORNOW.  CHECKME.  */
7818   if (nested_in_vect_loop_p (loop, stmt))
7819     return false;
7820
7821   /* If STMT is not relevant and it is a simple assignment and its inputs are
7822      invariant then it can remain in place, unvectorized.  The original last
7823      scalar value that it computes will be used.  */
7824   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7825     {
7826       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7827       if (dump_enabled_p ())
7828         dump_printf_loc (MSG_NOTE, vect_location,
7829                          "statement is simple and uses invariant.  Leaving in "
7830                          "place.\n");
7831       return true;
7832     }
7833
7834   if (slp_node)
7835     ncopies = 1;
7836   else
7837     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7838
7839   if (slp_node)
7840     {
7841       gcc_assert (slp_index >= 0);
7842
7843       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7844       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7845
7846       /* Get the last occurrence of the scalar index from the concatenation of
7847          all the slp vectors. Calculate which slp vector it is and the index
7848          within.  */
7849       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7850
7851       /* Calculate which vector contains the result, and which lane of
7852          that vector we need.  */
7853       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7854         {
7855           if (dump_enabled_p ())
7856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7857                              "Cannot determine which vector holds the"
7858                              " final result.\n");
7859           return false;
7860         }
7861     }
7862
7863   if (!vec_stmt)
7864     {
7865       /* No transformation required.  */
7866       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7867         {
7868           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7869                                                OPTIMIZE_FOR_SPEED))
7870             {
7871               if (dump_enabled_p ())
7872                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7873                                  "can't use a fully-masked loop because "
7874                                  "the target doesn't support extract last "
7875                                  "reduction.\n");
7876               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7877             }
7878           else if (slp_node)
7879             {
7880               if (dump_enabled_p ())
7881                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7882                                  "can't use a fully-masked loop because an "
7883                                  "SLP statement is live after the loop.\n");
7884               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7885             }
7886           else if (ncopies > 1)
7887             {
7888               if (dump_enabled_p ())
7889                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7890                                  "can't use a fully-masked loop because"
7891                                  " ncopies is greater than 1.\n");
7892               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7893             }
7894           else
7895             {
7896               gcc_assert (ncopies == 1 && !slp_node);
7897               vect_record_loop_mask (loop_vinfo,
7898                                      &LOOP_VINFO_MASKS (loop_vinfo),
7899                                      1, vectype);
7900             }
7901         }
7902       return true;
7903     }
7904
7905   /* If stmt has a related stmt, then use that for getting the lhs.  */
7906   if (is_pattern_stmt_p (stmt_info))
7907     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7908
7909   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7910         : gimple_get_lhs (stmt);
7911   lhs_type = TREE_TYPE (lhs);
7912
7913   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7914              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7915              : TYPE_SIZE (TREE_TYPE (vectype)));
7916   vec_bitsize = TYPE_SIZE (vectype);
7917
7918   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7919   tree vec_lhs, bitstart;
7920   if (slp_node)
7921     {
7922       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7923
7924       /* Get the correct slp vectorized stmt.  */
7925       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7926       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7927         vec_lhs = gimple_phi_result (phi);
7928       else
7929         vec_lhs = gimple_get_lhs (vec_stmt);
7930
7931       /* Get entry to use.  */
7932       bitstart = bitsize_int (vec_index);
7933       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7934     }
7935   else
7936     {
7937       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7938       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7939       gcc_checking_assert (ncopies == 1
7940                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7941
7942       /* For multiple copies, get the last copy.  */
7943       for (int i = 1; i < ncopies; ++i)
7944         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7945                                                   vec_lhs);
7946
7947       /* Get the last lane in the vector.  */
7948       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7949     }
7950
7951   gimple_seq stmts = NULL;
7952   tree new_tree;
7953   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7954     {
7955       /* Emit:
7956
7957            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7958
7959          where VEC_LHS is the vectorized live-out result and MASK is
7960          the loop mask for the final iteration.  */
7961       gcc_assert (ncopies == 1 && !slp_node);
7962       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7963       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7964                                       1, vectype, 0);
7965       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7966                                       scalar_type, mask, vec_lhs);
7967
7968       /* Convert the extracted vector element to the required scalar type.  */
7969       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7970     }
7971   else
7972     {
7973       tree bftype = TREE_TYPE (vectype);
7974       if (VECTOR_BOOLEAN_TYPE_P (vectype))
7975         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7976       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7977       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7978                                        &stmts, true, NULL_TREE);
7979     }
7980
7981   if (stmts)
7982     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7983
7984   /* Replace use of lhs with newly computed result.  If the use stmt is a
7985      single arg PHI, just replace all uses of PHI result.  It's necessary
7986      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7987   use_operand_p use_p;
7988   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7989     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7990         && !is_gimple_debug (use_stmt))
7991     {
7992       if (gimple_code (use_stmt) == GIMPLE_PHI
7993           && gimple_phi_num_args (use_stmt) == 1)
7994         {
7995           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7996         }
7997       else
7998         {
7999           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8000             SET_USE (use_p, new_tree);
8001         }
8002       update_stmt (use_stmt);
8003     }
8004
8005   return true;
8006 }
8007
8008 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8009
8010 static void
8011 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8012 {
8013   ssa_op_iter op_iter;
8014   imm_use_iterator imm_iter;
8015   def_operand_p def_p;
8016   gimple *ustmt;
8017
8018   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8019     {
8020       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8021         {
8022           basic_block bb;
8023
8024           if (!is_gimple_debug (ustmt))
8025             continue;
8026
8027           bb = gimple_bb (ustmt);
8028
8029           if (!flow_bb_inside_loop_p (loop, bb))
8030             {
8031               if (gimple_debug_bind_p (ustmt))
8032                 {
8033                   if (dump_enabled_p ())
8034                     dump_printf_loc (MSG_NOTE, vect_location,
8035                                      "killing debug use\n");
8036
8037                   gimple_debug_bind_reset_value (ustmt);
8038                   update_stmt (ustmt);
8039                 }
8040               else
8041                 gcc_unreachable ();
8042             }
8043         }
8044     }
8045 }
8046
8047 /* Given loop represented by LOOP_VINFO, return true if computation of
8048    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8049    otherwise.  */
8050
8051 static bool
8052 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8053 {
8054   /* Constant case.  */
8055   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8056     {
8057       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8058       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8059
8060       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8061       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8062       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8063         return true;
8064     }
8065
8066   widest_int max;
8067   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8068   /* Check the upper bound of loop niters.  */
8069   if (get_max_loop_iterations (loop, &max))
8070     {
8071       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8072       signop sgn = TYPE_SIGN (type);
8073       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8074       if (max < type_max)
8075         return true;
8076     }
8077   return false;
8078 }
8079
8080 /* Return a mask type with half the number of elements as TYPE.  */
8081
8082 tree
8083 vect_halve_mask_nunits (tree type)
8084 {
8085   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8086   return build_truth_vector_type (nunits, current_vector_size);
8087 }
8088
8089 /* Return a mask type with twice as many elements as TYPE.  */
8090
8091 tree
8092 vect_double_mask_nunits (tree type)
8093 {
8094   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8095   return build_truth_vector_type (nunits, current_vector_size);
8096 }
8097
8098 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8099    contain a sequence of NVECTORS masks that each control a vector of type
8100    VECTYPE.  */
8101
8102 void
8103 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8104                        unsigned int nvectors, tree vectype)
8105 {
8106   gcc_assert (nvectors != 0);
8107   if (masks->length () < nvectors)
8108     masks->safe_grow_cleared (nvectors);
8109   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8110   /* The number of scalars per iteration and the number of vectors are
8111      both compile-time constants.  */
8112   unsigned int nscalars_per_iter
8113     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8114                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8115   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8116     {
8117       rgm->max_nscalars_per_iter = nscalars_per_iter;
8118       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8119     }
8120 }
8121
8122 /* Given a complete set of masks MASKS, extract mask number INDEX
8123    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8124    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8125
8126    See the comment above vec_loop_masks for more details about the mask
8127    arrangement.  */
8128
8129 tree
8130 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8131                     unsigned int nvectors, tree vectype, unsigned int index)
8132 {
8133   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8134   tree mask_type = rgm->mask_type;
8135
8136   /* Populate the rgroup's mask array, if this is the first time we've
8137      used it.  */
8138   if (rgm->masks.is_empty ())
8139     {
8140       rgm->masks.safe_grow_cleared (nvectors);
8141       for (unsigned int i = 0; i < nvectors; ++i)
8142         {
8143           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8144           /* Provide a dummy definition until the real one is available.  */
8145           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8146           rgm->masks[i] = mask;
8147         }
8148     }
8149
8150   tree mask = rgm->masks[index];
8151   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8152                 TYPE_VECTOR_SUBPARTS (vectype)))
8153     {
8154       /* A loop mask for data type X can be reused for data type Y
8155          if X has N times more elements than Y and if Y's elements
8156          are N times bigger than X's.  In this case each sequence
8157          of N elements in the loop mask will be all-zero or all-one.
8158          We can then view-convert the mask so that each sequence of
8159          N elements is replaced by a single element.  */
8160       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8161                               TYPE_VECTOR_SUBPARTS (vectype)));
8162       gimple_seq seq = NULL;
8163       mask_type = build_same_sized_truth_vector_type (vectype);
8164       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8165       if (seq)
8166         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8167     }
8168   return mask;
8169 }
8170
8171 /* Scale profiling counters by estimation for LOOP which is vectorized
8172    by factor VF.  */
8173
8174 static void
8175 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8176 {
8177   edge preheader = loop_preheader_edge (loop);
8178   /* Reduce loop iterations by the vectorization factor.  */
8179   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8180   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8181
8182   if (freq_h.nonzero_p ())
8183     {
8184       profile_probability p;
8185
8186       /* Avoid dropping loop body profile counter to 0 because of zero count
8187          in loop's preheader.  */
8188       if (!(freq_e == profile_count::zero ()))
8189         freq_e = freq_e.force_nonzero ();
8190       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8191       scale_loop_frequencies (loop, p);
8192     }
8193
8194   edge exit_e = single_exit (loop);
8195   exit_e->probability = profile_probability::always ()
8196                                  .apply_scale (1, new_est_niter + 1);
8197
8198   edge exit_l = single_pred_edge (loop->latch);
8199   profile_probability prob = exit_l->probability;
8200   exit_l->probability = exit_e->probability.invert ();
8201   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8202     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8203 }
8204
8205 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8206    When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8207    *SLP_SCHEDULE is a running record of whether we have called
8208    vect_schedule_slp.  */
8209
8210 static void
8211 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8212                           gimple_stmt_iterator *gsi,
8213                           stmt_vec_info *seen_store, bool *slp_scheduled)
8214 {
8215   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8216   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8217   stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8218   if (!stmt_info)
8219     return;
8220
8221   if (dump_enabled_p ())
8222     {
8223       dump_printf_loc (MSG_NOTE, vect_location,
8224                        "------>vectorizing statement: ");
8225       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8226     }
8227
8228   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8229     vect_loop_kill_debug_uses (loop, stmt);
8230
8231   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8232       && !STMT_VINFO_LIVE_P (stmt_info))
8233     return;
8234
8235   if (STMT_VINFO_VECTYPE (stmt_info))
8236     {
8237       poly_uint64 nunits
8238         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8239       if (!STMT_SLP_TYPE (stmt_info)
8240           && maybe_ne (nunits, vf)
8241           && dump_enabled_p ())
8242         /* For SLP VF is set according to unrolling factor, and not
8243            to vector size, hence for SLP this print is not valid.  */
8244         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8245     }
8246
8247   /* SLP.  Schedule all the SLP instances when the first SLP stmt is
8248      reached.  */
8249   if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8250     {
8251
8252       if (!*slp_scheduled)
8253         {
8254           *slp_scheduled = true;
8255
8256           DUMP_VECT_SCOPE ("scheduling SLP instances");
8257
8258           vect_schedule_slp (loop_vinfo);
8259         }
8260
8261       /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8262       if (slptype == pure_slp)
8263         return;
8264     }
8265
8266   if (dump_enabled_p ())
8267     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8268
8269   bool grouped_store = false;
8270   if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8271     *seen_store = stmt_info;
8272 }
8273
8274 /* Function vect_transform_loop.
8275
8276    The analysis phase has determined that the loop is vectorizable.
8277    Vectorize the loop - created vectorized stmts to replace the scalar
8278    stmts in the loop, and update the loop exit condition.
8279    Returns scalar epilogue loop if any.  */
8280
8281 struct loop *
8282 vect_transform_loop (loop_vec_info loop_vinfo)
8283 {
8284   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8285   struct loop *epilogue = NULL;
8286   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8287   int nbbs = loop->num_nodes;
8288   int i;
8289   tree niters_vector = NULL_TREE;
8290   tree step_vector = NULL_TREE;
8291   tree niters_vector_mult_vf = NULL_TREE;
8292   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8293   unsigned int lowest_vf = constant_lower_bound (vf);
8294   bool slp_scheduled = false;
8295   gimple *stmt;
8296   bool check_profitability = false;
8297   unsigned int th;
8298
8299   DUMP_VECT_SCOPE ("vec_transform_loop");
8300
8301   loop_vinfo->shared->check_datarefs ();
8302
8303   /* Use the more conservative vectorization threshold.  If the number
8304      of iterations is constant assume the cost check has been performed
8305      by our caller.  If the threshold makes all loops profitable that
8306      run at least the (estimated) vectorization factor number of times
8307      checking is pointless, too.  */
8308   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8309   if (th >= vect_vf_for_cost (loop_vinfo)
8310       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8311     {
8312       if (dump_enabled_p ())
8313         dump_printf_loc (MSG_NOTE, vect_location,
8314                          "Profitability threshold is %d loop iterations.\n",
8315                          th);
8316       check_profitability = true;
8317     }
8318
8319   /* Make sure there exists a single-predecessor exit bb.  Do this before
8320      versioning.   */
8321   edge e = single_exit (loop);
8322   if (! single_pred_p (e->dest))
8323     {
8324       split_loop_exit_edge (e);
8325       if (dump_enabled_p ())
8326         dump_printf (MSG_NOTE, "split exit edge\n");
8327     }
8328
8329   /* Version the loop first, if required, so the profitability check
8330      comes first.  */
8331
8332   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8333     {
8334       poly_uint64 versioning_threshold
8335         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8336       if (check_profitability
8337           && ordered_p (poly_uint64 (th), versioning_threshold))
8338         {
8339           versioning_threshold = ordered_max (poly_uint64 (th),
8340                                               versioning_threshold);
8341           check_profitability = false;
8342         }
8343       vect_loop_versioning (loop_vinfo, th, check_profitability,
8344                             versioning_threshold);
8345       check_profitability = false;
8346     }
8347
8348   /* Make sure there exists a single-predecessor exit bb also on the
8349      scalar loop copy.  Do this after versioning but before peeling
8350      so CFG structure is fine for both scalar and if-converted loop
8351      to make slpeel_duplicate_current_defs_from_edges face matched
8352      loop closed PHI nodes on the exit.  */
8353   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8354     {
8355       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8356       if (! single_pred_p (e->dest))
8357         {
8358           split_loop_exit_edge (e);
8359           if (dump_enabled_p ())
8360             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8361         }
8362     }
8363
8364   tree niters = vect_build_loop_niters (loop_vinfo);
8365   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8366   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8367   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8368   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8369                               &step_vector, &niters_vector_mult_vf, th,
8370                               check_profitability, niters_no_overflow);
8371
8372   if (niters_vector == NULL_TREE)
8373     {
8374       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8375           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8376           && known_eq (lowest_vf, vf))
8377         {
8378           niters_vector
8379             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8380                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8381           step_vector = build_one_cst (TREE_TYPE (niters));
8382         }
8383       else
8384         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8385                                      &step_vector, niters_no_overflow);
8386     }
8387
8388   /* 1) Make sure the loop header has exactly two entries
8389      2) Make sure we have a preheader basic block.  */
8390
8391   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8392
8393   split_edge (loop_preheader_edge (loop));
8394
8395   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8396       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8397     /* This will deal with any possible peeling.  */
8398     vect_prepare_for_masked_peels (loop_vinfo);
8399
8400   /* FORNOW: the vectorizer supports only loops which body consist
8401      of one basic block (header + empty latch). When the vectorizer will
8402      support more involved loop forms, the order by which the BBs are
8403      traversed need to be reconsidered.  */
8404
8405   for (i = 0; i < nbbs; i++)
8406     {
8407       basic_block bb = bbs[i];
8408       stmt_vec_info stmt_info;
8409
8410       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8411            gsi_next (&si))
8412         {
8413           gphi *phi = si.phi ();
8414           if (dump_enabled_p ())
8415             {
8416               dump_printf_loc (MSG_NOTE, vect_location,
8417                                "------>vectorizing phi: ");
8418               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8419             }
8420           stmt_info = loop_vinfo->lookup_stmt (phi);
8421           if (!stmt_info)
8422             continue;
8423
8424           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8425             vect_loop_kill_debug_uses (loop, phi);
8426
8427           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8428               && !STMT_VINFO_LIVE_P (stmt_info))
8429             continue;
8430
8431           if (STMT_VINFO_VECTYPE (stmt_info)
8432               && (maybe_ne
8433                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8434               && dump_enabled_p ())
8435             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8436
8437           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8438                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8439                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8440               && ! PURE_SLP_STMT (stmt_info))
8441             {
8442               if (dump_enabled_p ())
8443                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8444               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8445             }
8446         }
8447
8448       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8449            !gsi_end_p (si);)
8450         {
8451           stmt = gsi_stmt (si);
8452           /* During vectorization remove existing clobber stmts.  */
8453           if (gimple_clobber_p (stmt))
8454             {
8455               unlink_stmt_vdef (stmt);
8456               gsi_remove (&si, true);
8457               release_defs (stmt);
8458             }
8459           else
8460             {
8461               stmt_info = loop_vinfo->lookup_stmt (stmt);
8462
8463               /* vector stmts created in the outer-loop during vectorization of
8464                  stmts in an inner-loop may not have a stmt_info, and do not
8465                  need to be vectorized.  */
8466               stmt_vec_info seen_store = NULL;
8467               if (stmt_info)
8468                 {
8469                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8470                     {
8471                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8472                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8473                            !gsi_end_p (subsi); gsi_next (&subsi))
8474                         vect_transform_loop_stmt (loop_vinfo,
8475                                                   gsi_stmt (subsi), &si,
8476                                                   &seen_store,
8477                                                   &slp_scheduled);
8478                       gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8479                       vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8480                                                 &seen_store, &slp_scheduled);
8481                     }
8482                   vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8483                                             &seen_store, &slp_scheduled);
8484                 }
8485               if (seen_store)
8486                 {
8487                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8488                     {
8489                       /* Interleaving.  If IS_STORE is TRUE, the
8490                          vectorization of the interleaving chain was
8491                          completed - free all the stores in the chain.  */
8492                       gsi_next (&si);
8493                       vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8494                     }
8495                   else
8496                     {
8497                       /* Free the attached stmt_vec_info and remove the
8498                          stmt.  */
8499                       free_stmt_vec_info (stmt);
8500                       unlink_stmt_vdef (stmt);
8501                       gsi_remove (&si, true);
8502                       release_defs (stmt);
8503                     }
8504                 }
8505               else
8506                 gsi_next (&si);
8507             }
8508         }
8509
8510       /* Stub out scalar statements that must not survive vectorization.
8511          Doing this here helps with grouped statements, or statements that
8512          are involved in patterns.  */
8513       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8514            !gsi_end_p (gsi); gsi_next (&gsi))
8515         {
8516           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8517           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8518             {
8519               tree lhs = gimple_get_lhs (call);
8520               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8521                 {
8522                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8523                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8524                   gsi_replace (&gsi, new_stmt, true);
8525                 }
8526             }
8527         }
8528     }                           /* BBs in loop */
8529
8530   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8531      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8532   if (integer_onep (step_vector))
8533     niters_no_overflow = true;
8534   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8535                            niters_vector_mult_vf, !niters_no_overflow);
8536
8537   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8538   scale_profile_for_vect_loop (loop, assumed_vf);
8539
8540   /* True if the final iteration might not handle a full vector's
8541      worth of scalar iterations.  */
8542   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8543   /* The minimum number of iterations performed by the epilogue.  This
8544      is 1 when peeling for gaps because we always need a final scalar
8545      iteration.  */
8546   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8547   /* +1 to convert latch counts to loop iteration counts,
8548      -min_epilogue_iters to remove iterations that cannot be performed
8549        by the vector code.  */
8550   int bias_for_lowest = 1 - min_epilogue_iters;
8551   int bias_for_assumed = bias_for_lowest;
8552   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8553   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8554     {
8555       /* When the amount of peeling is known at compile time, the first
8556          iteration will have exactly alignment_npeels active elements.
8557          In the worst case it will have at least one.  */
8558       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8559       bias_for_lowest += lowest_vf - min_first_active;
8560       bias_for_assumed += assumed_vf - min_first_active;
8561     }
8562   /* In these calculations the "- 1" converts loop iteration counts
8563      back to latch counts.  */
8564   if (loop->any_upper_bound)
8565     loop->nb_iterations_upper_bound
8566       = (final_iter_may_be_partial
8567          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8568                           lowest_vf) - 1
8569          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8570                            lowest_vf) - 1);
8571   if (loop->any_likely_upper_bound)
8572     loop->nb_iterations_likely_upper_bound
8573       = (final_iter_may_be_partial
8574          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8575                           + bias_for_lowest, lowest_vf) - 1
8576          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8577                            + bias_for_lowest, lowest_vf) - 1);
8578   if (loop->any_estimate)
8579     loop->nb_iterations_estimate
8580       = (final_iter_may_be_partial
8581          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8582                           assumed_vf) - 1
8583          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8584                            assumed_vf) - 1);
8585
8586   if (dump_enabled_p ())
8587     {
8588       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8589         {
8590           dump_printf_loc (MSG_NOTE, vect_location,
8591                            "LOOP VECTORIZED\n");
8592           if (loop->inner)
8593             dump_printf_loc (MSG_NOTE, vect_location,
8594                              "OUTER LOOP VECTORIZED\n");
8595           dump_printf (MSG_NOTE, "\n");
8596         }
8597       else
8598         {
8599           dump_printf_loc (MSG_NOTE, vect_location,
8600                            "LOOP EPILOGUE VECTORIZED (VS=");
8601           dump_dec (MSG_NOTE, current_vector_size);
8602           dump_printf (MSG_NOTE, ")\n");
8603         }
8604     }
8605
8606   /* Free SLP instances here because otherwise stmt reference counting
8607      won't work.  */
8608   slp_instance instance;
8609   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8610     vect_free_slp_instance (instance, true);
8611   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8612   /* Clear-up safelen field since its value is invalid after vectorization
8613      since vectorized loop can have loop-carried dependencies.  */
8614   loop->safelen = 0;
8615
8616   /* Don't vectorize epilogue for epilogue.  */
8617   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8618     epilogue = NULL;
8619
8620   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8621     epilogue = NULL;
8622
8623   if (epilogue)
8624     {
8625       auto_vector_sizes vector_sizes;
8626       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8627       unsigned int next_size = 0;
8628
8629       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8630           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8631           && known_eq (vf, lowest_vf))
8632         {
8633           unsigned int eiters
8634             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8635                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8636           eiters = eiters % lowest_vf;
8637           epilogue->nb_iterations_upper_bound = eiters - 1;
8638
8639           unsigned int ratio;
8640           while (next_size < vector_sizes.length ()
8641                  && !(constant_multiple_p (current_vector_size,
8642                                            vector_sizes[next_size], &ratio)
8643                       && eiters >= lowest_vf / ratio))
8644             next_size += 1;
8645         }
8646       else
8647         while (next_size < vector_sizes.length ()
8648                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8649           next_size += 1;
8650
8651       if (next_size == vector_sizes.length ())
8652         epilogue = NULL;
8653     }
8654
8655   if (epilogue)
8656     {
8657       epilogue->force_vectorize = loop->force_vectorize;
8658       epilogue->safelen = loop->safelen;
8659       epilogue->dont_vectorize = false;
8660
8661       /* We may need to if-convert epilogue to vectorize it.  */
8662       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8663         tree_if_conversion (epilogue);
8664     }
8665
8666   return epilogue;
8667 }
8668
8669 /* The code below is trying to perform simple optimization - revert
8670    if-conversion for masked stores, i.e. if the mask of a store is zero
8671    do not perform it and all stored value producers also if possible.
8672    For example,
8673      for (i=0; i<n; i++)
8674        if (c[i])
8675         {
8676           p1[i] += 1;
8677           p2[i] = p3[i] +2;
8678         }
8679    this transformation will produce the following semi-hammock:
8680
8681    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8682      {
8683        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8684        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8685        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8686        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8687        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8688        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8689      }
8690 */
8691
8692 void
8693 optimize_mask_stores (struct loop *loop)
8694 {
8695   basic_block *bbs = get_loop_body (loop);
8696   unsigned nbbs = loop->num_nodes;
8697   unsigned i;
8698   basic_block bb;
8699   struct loop *bb_loop;
8700   gimple_stmt_iterator gsi;
8701   gimple *stmt;
8702   auto_vec<gimple *> worklist;
8703
8704   vect_location = find_loop_location (loop);
8705   /* Pick up all masked stores in loop if any.  */
8706   for (i = 0; i < nbbs; i++)
8707     {
8708       bb = bbs[i];
8709       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8710            gsi_next (&gsi))
8711         {
8712           stmt = gsi_stmt (gsi);
8713           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8714             worklist.safe_push (stmt);
8715         }
8716     }
8717
8718   free (bbs);
8719   if (worklist.is_empty ())
8720     return;
8721
8722   /* Loop has masked stores.  */
8723   while (!worklist.is_empty ())
8724     {
8725       gimple *last, *last_store;
8726       edge e, efalse;
8727       tree mask;
8728       basic_block store_bb, join_bb;
8729       gimple_stmt_iterator gsi_to;
8730       tree vdef, new_vdef;
8731       gphi *phi;
8732       tree vectype;
8733       tree zero;
8734
8735       last = worklist.pop ();
8736       mask = gimple_call_arg (last, 2);
8737       bb = gimple_bb (last);
8738       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8739          the same loop as if_bb.  It could be different to LOOP when two
8740          level loop-nest is vectorized and mask_store belongs to the inner
8741          one.  */
8742       e = split_block (bb, last);
8743       bb_loop = bb->loop_father;
8744       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8745       join_bb = e->dest;
8746       store_bb = create_empty_bb (bb);
8747       add_bb_to_loop (store_bb, bb_loop);
8748       e->flags = EDGE_TRUE_VALUE;
8749       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8750       /* Put STORE_BB to likely part.  */
8751       efalse->probability = profile_probability::unlikely ();
8752       store_bb->count = efalse->count ();
8753       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8754       if (dom_info_available_p (CDI_DOMINATORS))
8755         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8756       if (dump_enabled_p ())
8757         dump_printf_loc (MSG_NOTE, vect_location,
8758                          "Create new block %d to sink mask stores.",
8759                          store_bb->index);
8760       /* Create vector comparison with boolean result.  */
8761       vectype = TREE_TYPE (mask);
8762       zero = build_zero_cst (vectype);
8763       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8764       gsi = gsi_last_bb (bb);
8765       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8766       /* Create new PHI node for vdef of the last masked store:
8767          .MEM_2 = VDEF <.MEM_1>
8768          will be converted to
8769          .MEM.3 = VDEF <.MEM_1>
8770          and new PHI node will be created in join bb
8771          .MEM_2 = PHI <.MEM_1, .MEM_3>
8772       */
8773       vdef = gimple_vdef (last);
8774       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8775       gimple_set_vdef (last, new_vdef);
8776       phi = create_phi_node (vdef, join_bb);
8777       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8778
8779       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8780       while (true)
8781         {
8782           gimple_stmt_iterator gsi_from;
8783           gimple *stmt1 = NULL;
8784
8785           /* Move masked store to STORE_BB.  */
8786           last_store = last;
8787           gsi = gsi_for_stmt (last);
8788           gsi_from = gsi;
8789           /* Shift GSI to the previous stmt for further traversal.  */
8790           gsi_prev (&gsi);
8791           gsi_to = gsi_start_bb (store_bb);
8792           gsi_move_before (&gsi_from, &gsi_to);
8793           /* Setup GSI_TO to the non-empty block start.  */
8794           gsi_to = gsi_start_bb (store_bb);
8795           if (dump_enabled_p ())
8796             {
8797               dump_printf_loc (MSG_NOTE, vect_location,
8798                                "Move stmt to created bb\n");
8799               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8800             }
8801           /* Move all stored value producers if possible.  */
8802           while (!gsi_end_p (gsi))
8803             {
8804               tree lhs;
8805               imm_use_iterator imm_iter;
8806               use_operand_p use_p;
8807               bool res;
8808
8809               /* Skip debug statements.  */
8810               if (is_gimple_debug (gsi_stmt (gsi)))
8811                 {
8812                   gsi_prev (&gsi);
8813                   continue;
8814                 }
8815               stmt1 = gsi_stmt (gsi);
8816               /* Do not consider statements writing to memory or having
8817                  volatile operand.  */
8818               if (gimple_vdef (stmt1)
8819                   || gimple_has_volatile_ops (stmt1))
8820                 break;
8821               gsi_from = gsi;
8822               gsi_prev (&gsi);
8823               lhs = gimple_get_lhs (stmt1);
8824               if (!lhs)
8825                 break;
8826
8827               /* LHS of vectorized stmt must be SSA_NAME.  */
8828               if (TREE_CODE (lhs) != SSA_NAME)
8829                 break;
8830
8831               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8832                 {
8833                   /* Remove dead scalar statement.  */
8834                   if (has_zero_uses (lhs))
8835                     {
8836                       gsi_remove (&gsi_from, true);
8837                       continue;
8838                     }
8839                 }
8840
8841               /* Check that LHS does not have uses outside of STORE_BB.  */
8842               res = true;
8843               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8844                 {
8845                   gimple *use_stmt;
8846                   use_stmt = USE_STMT (use_p);
8847                   if (is_gimple_debug (use_stmt))
8848                     continue;
8849                   if (gimple_bb (use_stmt) != store_bb)
8850                     {
8851                       res = false;
8852                       break;
8853                     }
8854                 }
8855               if (!res)
8856                 break;
8857
8858               if (gimple_vuse (stmt1)
8859                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8860                 break;
8861
8862               /* Can move STMT1 to STORE_BB.  */
8863               if (dump_enabled_p ())
8864                 {
8865                   dump_printf_loc (MSG_NOTE, vect_location,
8866                                    "Move stmt to created bb\n");
8867                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8868                 }
8869               gsi_move_before (&gsi_from, &gsi_to);
8870               /* Shift GSI_TO for further insertion.  */
8871               gsi_prev (&gsi_to);
8872             }
8873           /* Put other masked stores with the same mask to STORE_BB.  */
8874           if (worklist.is_empty ()
8875               || gimple_call_arg (worklist.last (), 2) != mask
8876               || worklist.last () != stmt1)
8877             break;
8878           last = worklist.pop ();
8879         }
8880       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8881     }
8882 }