gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
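/* For example, with fixed-length 4-element vectors the permutation
   built below selects { 0, 4, 1, 5 } for OFFSET 0 and { 2, 6, 3, 7 }
   for OFFSET 1, i.e. the low-half or high-half interleave of the two
   inputs.  (Illustrative only; the builder encodes this as two
   interleaved patterns of three elements each so that it also works
   for variable-length vectors.)  */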
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
316 *indices);
319 /* Try to use permutes to define the masks in DEST_RGM using the masks
320 in SRC_RGM, given that the former has twice as many masks as the
321 latter. Return true on success, adding any new statements to SEQ. */
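/* For instance, when SRC_RGM provides one mask per vector iteration and
   DEST_RGM needs two, each destination pair can (illustratively) be
   derived from a single source mask either by unpacking:

       dest[0] = VEC_UNPACK_LO_EXPR <src>;
       dest[1] = VEC_UNPACK_HI_EXPR <src>;

   or, when the mask types match, by interleaving permutes:

       dest[0] = VEC_PERM_EXPR <src, src, low-interleave-sel>;
       dest[1] = VEC_PERM_EXPR <src, src, high-interleave-sel>;

   The real statements are built below; which form is used (and which of
   LO/HI maps to which half) depends on the target and on
   BYTES_BIG_ENDIAN.  */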
323 static bool
324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
325 rgroup_controls *src_rgm)
327 tree src_masktype = src_rgm->type;
328 tree dest_masktype = dest_rgm->type;
329 machine_mode src_mode = TYPE_MODE (src_masktype);
330 insn_code icode1, icode2;
331 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
332 && (icode1 = optab_handler (vec_unpacku_hi_optab,
333 src_mode)) != CODE_FOR_nothing
334 && (icode2 = optab_handler (vec_unpacku_lo_optab,
335 src_mode)) != CODE_FOR_nothing)
337 /* Unpacking the source masks gives at least as many mask bits as
338 we need. We can then VIEW_CONVERT any excess bits away. */
339 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
340 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
341 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
342 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
344 tree src = src_rgm->controls[i / 2];
345 tree dest = dest_rgm->controls[i];
346 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
347 ? VEC_UNPACK_HI_EXPR
348 : VEC_UNPACK_LO_EXPR);
349 gassign *stmt;
350 if (dest_masktype == unpack_masktype)
351 stmt = gimple_build_assign (dest, code, src);
352 else
354 tree temp = make_ssa_name (unpack_masktype);
355 stmt = gimple_build_assign (temp, code, src);
356 gimple_seq_add_stmt (seq, stmt);
357 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
358 build1 (VIEW_CONVERT_EXPR,
359 dest_masktype, temp));
361 gimple_seq_add_stmt (seq, stmt);
363 return true;
365 vec_perm_indices indices[2];
366 if (dest_masktype == src_masktype
367 && interleave_supported_p (&indices[0], src_masktype, 0)
368 && interleave_supported_p (&indices[1], src_masktype, 1))
370 /* The destination requires twice as many mask bits as the source, so
371 we can use interleaving permutes to double up the number of bits. */
372 tree masks[2];
373 for (unsigned int i = 0; i < 2; ++i)
374 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
375 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
377 tree src = src_rgm->controls[i / 2];
378 tree dest = dest_rgm->controls[i];
379 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
380 src, src, masks[i & 1]);
381 gimple_seq_add_stmt (seq, stmt);
383 return true;
385 return false;
388 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
389 for all the rgroup controls in RGC and return a control that is nonzero
390 when the loop needs to iterate. Add any new preheader statements to
391 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
393 RGC belongs to loop LOOP. The loop originally iterated NITERS
394 times and has been vectorized according to LOOP_VINFO.
396 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
397 starts with NITERS_SKIP dummy iterations of the scalar loop before
398 the real work starts. The mask elements for these dummy iterations
399 must be 0, to ensure that the extra iterations do not have an effect.
401 It is known that:
403 NITERS * RGC->max_nscalars_per_iter * RGC->factor
405 does not overflow. However, MIGHT_WRAP_P says whether an induction
406 variable that starts at 0 and has step:
408 VF * RGC->max_nscalars_per_iter * RGC->factor
410 might overflow before hitting a value above:
412 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
414 This means that we cannot guarantee that such an induction variable
415 would ever hit a value that produces a set of all-false masks or zero
416 lengths for RGC.
418 Note: the cost of the code generated by this function is modeled
419 by vect_estimate_min_profitable_iters, so changes here may need
420 corresponding changes there. */
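/* As a purely illustrative example: with NITERS = 10, VF = 8 and an
   rgroup for which max_nscalars_per_iter = 2 and factor = 1, the code
   below works in "items" rather than scalar iterations, so
   nitems_total = 10 * 2 = 20, the IV steps by 8 * 2 = 16 items per
   vector iteration, and any NITERS_SKIP value is scaled by 2 as well.  */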
422 static tree
423 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
424 gimple_seq *preheader_seq,
425 gimple_seq *header_seq,
426 gimple_stmt_iterator loop_cond_gsi,
427 rgroup_controls *rgc, tree niters,
428 tree niters_skip, bool might_wrap_p)
430 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
431 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
432 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
434 tree ctrl_type = rgc->type;
435 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
436 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
437 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
438 tree length_limit = NULL_TREE;
439 /* For length, we need length_limit to ensure length in range. */
440 if (!use_masks_p)
441 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
443 /* Calculate the maximum number of item values that the rgroup
444 handles in total, the number that it handles for each iteration
445 of the vector loop, and the number that it should skip during the
446 first iteration of the vector loop. */
447 tree nitems_total = niters;
448 tree nitems_step = build_int_cst (iv_type, vf);
449 tree nitems_skip = niters_skip;
450 if (nitems_per_iter != 1)
452 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
453 these multiplications don't overflow. */
454 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
455 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
456 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
457 nitems_total, compare_factor);
458 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
459 nitems_step, iv_factor);
460 if (nitems_skip)
461 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
462 nitems_skip, compare_factor);
465 /* Create an induction variable that counts the number of items
466 processed. */
467 tree index_before_incr, index_after_incr;
468 gimple_stmt_iterator incr_gsi;
469 bool insert_after;
470 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
471 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
472 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
474 tree zero_index = build_int_cst (compare_type, 0);
475 tree test_index, test_limit, first_limit;
476 gimple_stmt_iterator *test_gsi;
477 if (might_wrap_p)
479 /* In principle the loop should stop iterating once the incremented
480 IV reaches a value greater than or equal to:
482 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
484 However, there's no guarantee that this addition doesn't overflow
485 the comparison type, or that the IV hits a value above it before
486 wrapping around. We therefore adjust the limit down by one
487 IV step:
489 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
490 -[infinite-prec] NITEMS_STEP
492 and compare the IV against this limit _before_ incrementing it.
493 Since the comparison type is unsigned, we actually want the
494 subtraction to saturate at zero:
496 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
497 -[sat] NITEMS_STEP
499 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
501 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
503 where the rightmost subtraction can be done directly in
504 COMPARE_TYPE. */
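/* For illustration: with NITEMS_TOTAL = 100, NITEMS_STEP = 16 and
   NITEMS_SKIP = 3, the statements below compute
   adjust = 16 - 3 = 13 and test_limit = MAX (100, 13) - 13 = 87,
   the MAX providing the saturation described above when
   NITEMS_TOTAL is smaller than the adjustment.  */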
505 test_index = index_before_incr;
506 tree adjust = gimple_convert (preheader_seq, compare_type,
507 nitems_step);
508 if (nitems_skip)
509 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
510 adjust, nitems_skip);
511 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
512 nitems_total, adjust);
513 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
514 test_limit, adjust);
515 test_gsi = &incr_gsi;
517 /* Get a safe limit for the first iteration. */
518 if (nitems_skip)
520 /* The first vector iteration can handle at most NITEMS_STEP
521 items. NITEMS_STEP <= CONST_LIMIT, and adding
522 NITEMS_SKIP to that cannot overflow. */
523 tree const_limit = build_int_cst (compare_type,
524 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
525 * nitems_per_iter);
526 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
527 nitems_total, const_limit);
528 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
529 first_limit, nitems_skip);
531 else
532 /* For the first iteration it doesn't matter whether the IV hits
533 a value above NITEMS_TOTAL. That only matters for the latch
534 condition. */
535 first_limit = nitems_total;
537 else
539 /* Test the incremented IV, which will always hit a value above
540 the bound before wrapping. */
541 test_index = index_after_incr;
542 test_limit = nitems_total;
543 if (nitems_skip)
544 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
545 test_limit, nitems_skip);
546 test_gsi = &loop_cond_gsi;
548 first_limit = test_limit;
551 /* Convert the IV value to the comparison type (either a no-op or
552 a demotion). */
553 gimple_seq test_seq = NULL;
554 test_index = gimple_convert (&test_seq, compare_type, test_index);
555 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
557 /* Provide a definition of each control in the group. */
558 tree next_ctrl = NULL_TREE;
559 tree ctrl;
560 unsigned int i;
561 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
563 /* Previous controls will cover BIAS items. This control covers the
564 next batch. */
565 poly_uint64 bias = nitems_per_ctrl * i;
566 tree bias_tree = build_int_cst (compare_type, bias);
568 /* See whether the first iteration of the vector loop is known
569 to have a full control. */
570 poly_uint64 const_limit;
571 bool first_iteration_full
572 = (poly_int_tree_p (first_limit, &const_limit)
573 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
575 /* Rather than have a new IV that starts at BIAS and goes up to
576 TEST_LIMIT, prefer to use the same 0-based IV for each control
577 and adjust the bound down by BIAS. */
578 tree this_test_limit = test_limit;
579 if (i != 0)
581 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
582 compare_type, this_test_limit,
583 bias_tree);
584 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
585 compare_type, this_test_limit,
586 bias_tree);
589 /* Create the initial control. First include all items that
590 are within the loop limit. */
591 tree init_ctrl = NULL_TREE;
592 if (!first_iteration_full)
594 tree start, end;
595 if (first_limit == test_limit)
597 /* Use a natural test between zero (the initial IV value)
598 and the loop limit. The "else" block would be valid too,
599 but this choice can avoid the need to load BIAS_TREE into
600 a register. */
601 start = zero_index;
602 end = this_test_limit;
604 else
606 /* FIRST_LIMIT is the maximum number of items handled by the
607 first iteration of the vector loop. Test the portion
608 associated with this control. */
609 start = bias_tree;
610 end = first_limit;
613 if (use_masks_p)
614 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
615 start, end, "max_mask");
616 else
618 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
619 gimple_seq seq = vect_gen_len (init_ctrl, start,
620 end, length_limit);
621 gimple_seq_add_seq (preheader_seq, seq);
625 /* Now AND out the bits that are within the number of skipped
626 items. */
627 poly_uint64 const_skip;
628 if (nitems_skip
629 && !(poly_int_tree_p (nitems_skip, &const_skip)
630 && known_le (const_skip, bias)))
632 gcc_assert (use_masks_p);
633 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
634 bias_tree, nitems_skip);
635 if (init_ctrl)
636 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
637 init_ctrl, unskipped_mask);
638 else
639 init_ctrl = unskipped_mask;
642 if (!init_ctrl)
644 /* First iteration is full. */
645 if (use_masks_p)
646 init_ctrl = build_minus_one_cst (ctrl_type);
647 else
648 init_ctrl = length_limit;
651 /* Get the control value for the next iteration of the loop. */
652 if (use_masks_p)
654 gimple_seq stmts = NULL;
655 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
656 this_test_limit, "next_mask");
657 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
659 else
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
670 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
671 if (partial_load_bias != 0)
673 tree adjusted_len = rgc->bias_adjusted_ctrl;
674 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
675 rgc->controls[0],
676 build_int_cst
677 (TREE_TYPE (rgc->controls[0]),
678 partial_load_bias));
679 gimple_seq_add_stmt (header_seq, minus);
682 return next_ctrl;
685 /* Set up the iteration condition and rgroup controls for LOOP, given
686 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
687 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
688 the number of iterations of the original scalar loop that should be
689 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
690 for vect_set_loop_condition.
692 Insert the branch-back condition before LOOP_COND_GSI and return the
693 final gcond. */
695 static gcond *
696 vect_set_loop_condition_partial_vectors (class loop *loop,
697 loop_vec_info loop_vinfo, tree niters,
698 tree final_iv, bool niters_maybe_zero,
699 gimple_stmt_iterator loop_cond_gsi)
701 gimple_seq preheader_seq = NULL;
702 gimple_seq header_seq = NULL;
704 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
705 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
706 unsigned int compare_precision = TYPE_PRECISION (compare_type);
707 tree orig_niters = niters;
709 /* Type of the initial value of NITERS. */
710 tree ni_actual_type = TREE_TYPE (niters);
711 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
712 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
714 /* Convert NITERS to the same size as the compare. */
715 if (compare_precision > ni_actual_precision
716 && niters_maybe_zero)
718 /* We know that there is always at least one iteration, so if the
719 count is zero then it must have wrapped. Cope with this by
720 subtracting 1 before the conversion and adding 1 to the result. */
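/* For example, if NITERS is a 32-bit unsigned value that wrapped to 0
   (i.e. 1 << 32 iterations) and COMPARE_TYPE is 64 bits wide, this
   computes ((uint64_t) (0 - 1)) + 1 = 0x100000000, the true count.  */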
721 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
722 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
723 niters, build_minus_one_cst (ni_actual_type));
724 niters = gimple_convert (&preheader_seq, compare_type, niters);
725 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
726 niters, build_one_cst (compare_type));
728 else
729 niters = gimple_convert (&preheader_seq, compare_type, niters);
731 /* Iterate over all the rgroups and fill in their controls. We could use
732 the first control from any rgroup for the loop condition; here we
733 arbitrarily pick the last. */
734 tree test_ctrl = NULL_TREE;
735 rgroup_controls *rgc;
736 unsigned int i;
737 auto_vec<rgroup_controls> *controls = use_masks_p
738 ? &LOOP_VINFO_MASKS (loop_vinfo)
739 : &LOOP_VINFO_LENS (loop_vinfo);
740 FOR_EACH_VEC_ELT (*controls, i, rgc)
741 if (!rgc->controls.is_empty ())
743 /* First try using permutes. This adds a single vector
744 instruction to the loop for each mask, but needs no extra
745 loop invariants or IVs. */
746 unsigned int nmasks = i + 1;
747 if (use_masks_p && (nmasks & 1) == 0)
749 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
750 if (!half_rgc->controls.is_empty ()
751 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
752 continue;
755 /* See whether zero-based IV would ever generate all-false masks
756 or zero length before wrapping around. */
757 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
759 /* Set up all controls for this group. */
760 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
761 &preheader_seq,
762 &header_seq,
763 loop_cond_gsi, rgc,
764 niters, niters_skip,
765 might_wrap_p);
768 /* Emit all accumulated statements. */
769 add_preheader_seq (loop, preheader_seq);
770 add_header_seq (loop, header_seq);
772 /* Get a boolean result that tells us whether to iterate. */
773 edge exit_edge = single_exit (loop);
774 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
775 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
776 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
777 NULL_TREE, NULL_TREE);
778 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
780 /* The loop iterates (NITERS - 1) / VF + 1 times.
781 Subtract one from this to get the latch count. */
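/* E.g. NITERS = 10 and VF = 4 give (10 - 1) / 4 + 1 = 3 vector
   iterations; the truncating division below then records a latch
   count of (10 - 1) / 4 = 2.  */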
782 tree step = build_int_cst (compare_type,
783 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
784 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
785 build_minus_one_cst (compare_type));
786 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
787 niters_minus_one, step);
789 if (final_iv)
791 gassign *assign = gimple_build_assign (final_iv, orig_niters);
792 gsi_insert_on_edge_immediate (single_exit (loop), assign);
795 return cond_stmt;
798 /* Like vect_set_loop_condition, but handle the case in which the vector
799 loop handles exactly VF scalars per iteration. */
801 static gcond *
802 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
803 tree final_iv, bool niters_maybe_zero,
804 gimple_stmt_iterator loop_cond_gsi)
806 tree indx_before_incr, indx_after_incr;
807 gcond *cond_stmt;
808 gcond *orig_cond;
809 edge pe = loop_preheader_edge (loop);
810 edge exit_edge = single_exit (loop);
811 gimple_stmt_iterator incr_gsi;
812 bool insert_after;
813 enum tree_code code;
814 tree niters_type = TREE_TYPE (niters);
816 orig_cond = get_loop_exit_condition (loop);
817 gcc_assert (orig_cond);
818 loop_cond_gsi = gsi_for_stmt (orig_cond);
820 tree init, limit;
821 if (!niters_maybe_zero && integer_onep (step))
823 /* In this case we can use a simple 0-based IV (case A):
826 x = 0;
830 x += 1;
832 while (x < NITERS); */
833 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
834 init = build_zero_cst (niters_type);
835 limit = niters;
837 else
839 /* The following works for all values of NITERS except 0 (case B):
842 x = 0;
846 x += STEP;
848 while (x <= NITERS - STEP);
850 so that the loop continues to iterate if x + STEP - 1 < NITERS
851 but stops if x + STEP - 1 >= NITERS.
853 However, if NITERS is zero, x never hits a value above NITERS - STEP
854 before wrapping around. There are two obvious ways of dealing with
855 this:
857 - start at STEP - 1 and compare x before incrementing it
858 - start at -1 and compare x after incrementing it
860 The latter is simpler and is what we use. The loop in this case
861 looks like (case C):
864 x = -1;
868 x += STEP;
870 while (x < NITERS - STEP);
872 In both cases the loop limit is NITERS - STEP. */
873 gimple_seq seq = NULL;
874 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
875 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
876 if (seq)
878 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
879 gcc_assert (!new_bb);
881 if (niters_maybe_zero)
883 /* Case C. */
884 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
885 init = build_all_ones_cst (niters_type);
887 else
889 /* Case B. */
890 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
891 init = build_zero_cst (niters_type);
895 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
896 create_iv (init, step, NULL_TREE, loop,
897 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
898 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
899 true, NULL_TREE, true,
900 GSI_SAME_STMT);
901 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
902 true, GSI_SAME_STMT);
904 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
905 NULL_TREE);
907 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
909 /* Record the number of latch iterations. */
910 if (limit == niters)
911 /* Case A: the loop iterates NITERS times. Subtract one to get the
912 latch count. */
913 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
914 build_int_cst (niters_type, 1));
915 else
916 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
917 Subtract one from this to get the latch count. */
918 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
919 limit, step);
921 if (final_iv)
923 gassign *assign;
924 edge exit = single_exit (loop);
925 gcc_assert (single_pred_p (exit->dest));
926 tree phi_dest
927 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
928 /* Make sure to maintain LC SSA form here and elide the subtraction
929 if the value is zero. */
930 gphi *phi = create_phi_node (phi_dest, exit->dest);
931 add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
932 if (!integer_zerop (init))
934 assign = gimple_build_assign (final_iv, MINUS_EXPR,
935 phi_dest, init);
936 gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
937 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
941 return cond_stmt;
944 /* If we're using fully-masked loops, make LOOP iterate:
946 N == (NITERS - 1) / STEP + 1
948 times. When NITERS is zero, this is equivalent to making the loop
949 execute (1 << M) / STEP times, where M is the precision of NITERS.
950 NITERS_MAYBE_ZERO is true if this last case might occur.
952 If we're not using fully-masked loops, make LOOP iterate:
954 N == (NITERS - STEP) / STEP + 1
956 times, where NITERS is known to be outside the range [1, STEP - 1].
957 This is equivalent to making the loop execute NITERS / STEP times
958 when NITERS is nonzero and (1 << M) / STEP times otherwise.
959 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
961 If FINAL_IV is nonnull, it is an SSA name that should be set to
962 N * STEP on exit from the loop.
964 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
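/* For example, with NITERS = 10 and STEP = 4, a fully-masked loop
   iterates (10 - 1) / 4 + 1 = 3 times, whereas a loop that is not
   fully masked iterates (10 - 4) / 4 + 1 = 2 times, the remaining
   scalar iterations being handled elsewhere (typically by an
   epilogue loop).  */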
966 void
967 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
968 tree niters, tree step, tree final_iv,
969 bool niters_maybe_zero)
971 gcond *cond_stmt;
972 gcond *orig_cond = get_loop_exit_condition (loop);
973 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
975 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
976 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
977 niters, final_iv,
978 niters_maybe_zero,
979 loop_cond_gsi);
980 else
981 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
982 niters_maybe_zero,
983 loop_cond_gsi);
985 /* Remove old loop exit test. */
986 stmt_vec_info orig_cond_info;
987 if (loop_vinfo
988 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
989 loop_vinfo->remove_stmt (orig_cond_info);
990 else
991 gsi_remove (&loop_cond_gsi, true);
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
995 cond_stmt);
998 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
999 For each pair of PHI arguments taken from edges FROM and TO (in
1000 FROM->dest and TO->dest respectively), make the current_def of the
1001 TO->dest argument match that of the corresponding FROM->dest argument. */
1003 static void
1004 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1006 gimple_stmt_iterator gsi_from, gsi_to;
1008 for (gsi_from = gsi_start_phis (from->dest),
1009 gsi_to = gsi_start_phis (to->dest);
1010 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1012 gimple *from_phi = gsi_stmt (gsi_from);
1013 gimple *to_phi = gsi_stmt (gsi_to);
1014 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1015 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1016 if (virtual_operand_p (from_arg))
1018 gsi_next (&gsi_from);
1019 continue;
1021 if (virtual_operand_p (to_arg))
1023 gsi_next (&gsi_to);
1024 continue;
1026 if (TREE_CODE (from_arg) != SSA_NAME)
1027 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1028 else if (TREE_CODE (to_arg) == SSA_NAME
1029 && from_arg != to_arg)
1031 if (get_current_def (to_arg) == NULL_TREE)
1033 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1034 TREE_TYPE (get_current_def
1035 (from_arg))));
1036 set_current_def (to_arg, get_current_def (from_arg));
1039 gsi_next (&gsi_from);
1040 gsi_next (&gsi_to);
1043 gphi *from_phi = get_virtual_phi (from->dest);
1044 gphi *to_phi = get_virtual_phi (to->dest);
1045 if (from_phi)
1046 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1047 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1051 /* Given LOOP this function generates a new copy of it and puts it
1052 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1053 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1054 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1055 entry or exit of LOOP. */
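/* (Informally: a copy placed on the exit edge is what the vectorizer
   later uses as an epilogue loop, while a copy placed on the preheader
   edge serves as a prologue loop; see vect_do_peeling.)  */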
1057 class loop *
1058 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1059 class loop *scalar_loop, edge e)
1061 class loop *new_loop;
1062 basic_block *new_bbs, *bbs, *pbbs;
1063 bool at_exit;
1064 bool was_imm_dom;
1065 basic_block exit_dest;
1066 edge exit, new_exit;
1067 bool duplicate_outer_loop = false;
1069 exit = single_exit (loop);
1070 at_exit = (e == exit);
1071 if (!at_exit && e != loop_preheader_edge (loop))
1072 return NULL;
1074 if (scalar_loop == NULL)
1075 scalar_loop = loop;
1077 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078 pbbs = bbs + 1;
1079 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1080 /* Allow duplication of outer loops. */
1081 if (scalar_loop->inner)
1082 duplicate_outer_loop = true;
1083 /* Check whether duplication is possible. */
1084 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1086 free (bbs);
1087 return NULL;
1090 /* Generate new loop structure. */
1091 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1092 duplicate_subloops (scalar_loop, new_loop);
1094 exit_dest = exit->dest;
1095 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1096 exit_dest) == loop->header ?
1097 true : false);
1099 /* Also copy the pre-header, this avoids jumping through hoops to
1100 duplicate the loop entry PHI arguments. Create an empty
1101 pre-header unconditionally for this. */
1102 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1103 edge entry_e = single_pred_edge (preheader);
1104 bbs[0] = preheader;
1105 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1107 exit = single_exit (scalar_loop);
1108 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1109 &exit, 1, &new_exit, NULL,
1110 at_exit ? loop->latch : e->src, true);
1111 exit = single_exit (loop);
1112 basic_block new_preheader = new_bbs[0];
1114 /* Before installing PHI arguments make sure that the edges
1115 into them match that of the scalar loop we analyzed. This
1116 makes sure the SLP tree matches up between the main vectorized
1117 loop and the epilogue vectorized copies. */
1118 if (single_succ_edge (preheader)->dest_idx
1119 != single_succ_edge (new_bbs[0])->dest_idx)
1121 basic_block swap_bb = new_bbs[1];
1122 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1123 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1124 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1125 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1127 if (duplicate_outer_loop)
1129 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1130 if (loop_preheader_edge (scalar_loop)->dest_idx
1131 != loop_preheader_edge (new_inner_loop)->dest_idx)
1133 basic_block swap_bb = new_inner_loop->header;
1134 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1135 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1136 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1137 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1141 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1143 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1144 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1145 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1147 if (scalar_loop != loop)
1149 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1150 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1151 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1152 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1153 header) to have current_def set, so copy them over. */
1154 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1155 exit);
1156 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1157 0),
1158 EDGE_SUCC (loop->latch, 0));
1161 if (at_exit) /* Add the loop copy at exit. */
1163 if (scalar_loop != loop)
1165 gphi_iterator gsi;
1166 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1168 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1169 gsi_next (&gsi))
1171 gphi *phi = gsi.phi ();
1172 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1173 location_t orig_locus
1174 = gimple_phi_arg_location_from_edge (phi, e);
1176 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1179 redirect_edge_and_branch_force (e, new_preheader);
1180 flush_pending_stmts (e);
1181 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1182 if (was_imm_dom || duplicate_outer_loop)
1183 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1185 /* And remove the non-necessary forwarder again. Keep the other
1186 one so we have a proper pre-header for the loop at the exit edge. */
1187 redirect_edge_pred (single_succ_edge (preheader),
1188 single_pred (preheader));
1189 delete_basic_block (preheader);
1190 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1191 loop_preheader_edge (scalar_loop)->src);
1193 else /* Add the copy at entry. */
1195 if (scalar_loop != loop)
1197 /* Remove the non-necessary forwarder of scalar_loop again. */
1198 redirect_edge_pred (single_succ_edge (preheader),
1199 single_pred (preheader));
1200 delete_basic_block (preheader);
1201 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1202 loop_preheader_edge (scalar_loop)->src);
1203 preheader = split_edge (loop_preheader_edge (loop));
1204 entry_e = single_pred_edge (preheader);
1207 redirect_edge_and_branch_force (entry_e, new_preheader);
1208 flush_pending_stmts (entry_e);
1209 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1211 redirect_edge_and_branch_force (new_exit, preheader);
1212 flush_pending_stmts (new_exit);
1213 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1215 /* And remove the non-necessary forwarder again. Keep the other
1216 one so we have a proper pre-header for the loop at the exit edge. */
1217 redirect_edge_pred (single_succ_edge (new_preheader),
1218 single_pred (new_preheader));
1219 delete_basic_block (new_preheader);
1220 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1221 loop_preheader_edge (new_loop)->src);
1224 if (scalar_loop != loop)
1226 /* Update new_loop->header PHIs, so that on the preheader
1227 edge they are the ones from loop rather than scalar_loop. */
1228 gphi_iterator gsi_orig, gsi_new;
1229 edge orig_e = loop_preheader_edge (loop);
1230 edge new_e = loop_preheader_edge (new_loop);
1232 for (gsi_orig = gsi_start_phis (loop->header),
1233 gsi_new = gsi_start_phis (new_loop->header);
1234 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1235 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1237 gphi *orig_phi = gsi_orig.phi ();
1238 gphi *new_phi = gsi_new.phi ();
1239 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1240 location_t orig_locus
1241 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1243 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1247 free (new_bbs);
1248 free (bbs);
1250 checking_verify_dominators (CDI_DOMINATORS);
1252 return new_loop;
1256 /* Given the condition expression COND, put it as the last statement of
1257 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1258 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1259 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1260 new edge as irreducible if IRREDUCIBLE_P is true. */
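/* Schematically (illustrative only): GUARD_BB ends in
   "if (COND) goto GUARD_TO" and the new true edge is taken with
   PROBABILITY, while the existing successor edge becomes the false
   (fall-through-into-the-loop) edge with the inverted probability.  */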
1262 static edge
1263 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1264 basic_block guard_to, basic_block dom_bb,
1265 profile_probability probability, bool irreducible_p)
1267 gimple_stmt_iterator gsi;
1268 edge new_e, enter_e;
1269 gcond *cond_stmt;
1270 gimple_seq gimplify_stmt_list = NULL;
1272 enter_e = EDGE_SUCC (guard_bb, 0);
1273 enter_e->flags &= ~EDGE_FALLTHRU;
1274 enter_e->flags |= EDGE_FALSE_VALUE;
1275 gsi = gsi_last_bb (guard_bb);
1277 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1278 is_gimple_condexpr_for_cond, NULL_TREE);
1279 if (gimplify_stmt_list)
1280 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1282 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1283 gsi = gsi_last_bb (guard_bb);
1284 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1286 /* Add new edge to connect guard block to the merge/loop-exit block. */
1287 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1289 new_e->probability = probability;
1290 if (irreducible_p)
1291 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1293 enter_e->probability = probability.invert ();
1294 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1296 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1297 if (enter_e->dest->loop_father->header == enter_e->dest)
1298 split_edge (enter_e);
1300 return new_e;
1304 /* This function verifies that the following restrictions apply to LOOP:
1305 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1306 for an innermost loop, or of 5 basic blocks for an outer loop.
1307 (2) it is single entry, single exit
1308 (3) its exit condition is the last stmt in the header
1309 (4) E is the entry/exit edge of LOOP.
1310 */
1312 bool
1313 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1315 edge exit_e = single_exit (loop);
1316 edge entry_e = loop_preheader_edge (loop);
1317 gcond *orig_cond = get_loop_exit_condition (loop);
1318 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1319 unsigned int num_bb = loop->inner? 5 : 2;
1321 /* All loops have an outer scope; the only case loop->outer is NULL is for
1322 the function itself. */
1323 if (!loop_outer (loop)
1324 || loop->num_nodes != num_bb
1325 || !empty_block_p (loop->latch)
1326 || !single_exit (loop)
1327 /* Verify that new loop exit condition can be trivially modified. */
1328 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1329 || (e != exit_e && e != entry_e))
1330 return false;
1332 return true;
1335 /* Function find_loop_location.
1337 Extract the location of the loop in the source code.
1338 If the loop is not well formed for vectorization, an estimated
1339 location is calculated.
1340 Return the loop location on success and NULL otherwise. */
1342 dump_user_location_t
1343 find_loop_location (class loop *loop)
1345 gimple *stmt = NULL;
1346 basic_block bb;
1347 gimple_stmt_iterator si;
1349 if (!loop)
1350 return dump_user_location_t ();
1352 stmt = get_loop_exit_condition (loop);
1354 if (stmt
1355 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1356 return stmt;
1358 /* If we got here the loop is probably not "well formed",
1359 try to estimate the loop location */
1361 if (!loop->header)
1362 return dump_user_location_t ();
1364 bb = loop->header;
1366 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1368 stmt = gsi_stmt (si);
1369 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1370 return stmt;
1373 return dump_user_location_t ();
1376 /* Return true if the phi described by STMT_INFO defines an IV of the
1377 loop to be vectorized. */
1379 static bool
1380 iv_phi_p (stmt_vec_info stmt_info)
1382 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1383 if (virtual_operand_p (PHI_RESULT (phi)))
1384 return false;
1386 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1387 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1388 return false;
1390 return true;
1393 /* Function vect_can_advance_ivs_p
1395 In case the number of iterations that LOOP iterates is unknown at compile
1396 time, an epilog loop will be generated, and the loop induction variables
1397 (IVs) will be "advanced" to the value they are supposed to take just before
1398 the epilog loop. Here we check that the access function of the loop IVs
1399 and the expression that represents the loop bound are simple enough.
1400 These restrictions will be relaxed in the future. */
1402 bool
1403 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1405 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1406 basic_block bb = loop->header;
1407 gphi_iterator gsi;
1409 /* Analyze phi functions of the loop header. */
1411 if (dump_enabled_p ())
1412 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1413 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1415 tree evolution_part;
1417 gphi *phi = gsi.phi ();
1418 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1421 phi_info->stmt);
1423 /* Skip virtual phi's. The data dependences that are associated with
1424 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1426 Skip reduction phis. */
1427 if (!iv_phi_p (phi_info))
1429 if (dump_enabled_p ())
1430 dump_printf_loc (MSG_NOTE, vect_location,
1431 "reduc or virtual phi. skip.\n");
1432 continue;
1435 /* Analyze the evolution function. */
1437 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1438 if (evolution_part == NULL_TREE)
1440 if (dump_enabled_p ())
1441 dump_printf (MSG_MISSED_OPTIMIZATION,
1442 "No access function or evolution.\n");
1443 return false;
1446 /* FORNOW: We do not transform initial conditions of IVs
1447 which evolution functions are not invariants in the loop. */
1449 if (!expr_invariant_in_loop_p (loop, evolution_part))
1451 if (dump_enabled_p ())
1452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453 "evolution not invariant in loop.\n");
1454 return false;
1457 /* FORNOW: We do not transform initial conditions of IVs
1458 which evolution functions are a polynomial of degree >= 2. */
1460 if (tree_is_chrec (evolution_part))
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "evolution is chrec.\n");
1465 return false;
1469 return true;
1473 /* Function vect_update_ivs_after_vectorizer.
1475 "Advance" the induction variables of LOOP to the value they should take
1476 after the execution of LOOP. This is currently necessary because the
1477 vectorizer does not handle induction variables that are used after the
1478 loop. Such a situation occurs when the last iterations of LOOP are
1479 peeled, because:
1480 1. We introduced new uses after LOOP for IVs that were not originally used
1481 after LOOP: the IVs of LOOP are now used by an epilog loop.
1482 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1483 times, whereas the loop IVs should be bumped N times.
1485 Input:
1486 - LOOP - a loop that is going to be vectorized. The last few iterations
1487 of LOOP were peeled.
1488 - NITERS - the number of iterations that LOOP executes (before it is
1489 vectorized), i.e., the number of times the IVs should be bumped.
1490 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1491 coming out from LOOP on which there are uses of the LOOP ivs
1492 (this is the path from LOOP->exit to epilog_loop->preheader).
1494 The new definitions of the ivs are placed in LOOP->exit.
1495 The phi args associated with the edge UPDATE_E in the bb
1496 UPDATE_E->dest are updated accordingly.
1498 Assumption 1: Like the rest of the vectorizer, this function assumes
1499 a single loop exit that has a single predecessor.
1501 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1502 organized in the same order.
1504 Assumption 3: The access function of the ivs is simple enough (see
1505 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1507 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1508 coming out of LOOP on which the ivs of LOOP are used (this is the path
1509 that leads to the epilog loop; other paths skip the epilog loop). This
1510 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1511 needs to have its phis updated.
1512 */
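/* For a simple IV with initial value INIT and step S this emits
   NI = INIT + NITERS * S (a POINTER_PLUS_EXPR for pointer IVs) in the
   exit block and makes the matching PHI argument on UPDATE_E use it;
   e.g. INIT = 0, S = 1 and NITERS = 16 give NI = 16.  */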
1514 static void
1515 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1516 tree niters, edge update_e)
1518 gphi_iterator gsi, gsi1;
1519 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block update_bb = update_e->dest;
1521 basic_block exit_bb = single_exit (loop)->dest;
1523 /* Make sure there exists a single-predecessor exit bb: */
1524 gcc_assert (single_pred_p (exit_bb));
1525 gcc_assert (single_succ_edge (exit_bb) == update_e);
1527 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1528 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1529 gsi_next (&gsi), gsi_next (&gsi1))
1531 tree init_expr;
1532 tree step_expr, off;
1533 tree type;
1534 tree var, ni, ni_name;
1535 gimple_stmt_iterator last_gsi;
1537 gphi *phi = gsi.phi ();
1538 gphi *phi1 = gsi1.phi ();
1539 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location,
1542 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1544 /* Skip reduction and virtual phis. */
1545 if (!iv_phi_p (phi_info))
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location,
1549 "reduc or virtual phi. skip.\n");
1550 continue;
1553 type = TREE_TYPE (gimple_phi_result (phi));
1554 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1555 step_expr = unshare_expr (step_expr);
1557 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1558 of degree >= 2 or exponential. */
1559 gcc_assert (!tree_is_chrec (step_expr));
1561 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1563 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1564 fold_convert (TREE_TYPE (step_expr), niters),
1565 step_expr);
1566 if (POINTER_TYPE_P (type))
1567 ni = fold_build_pointer_plus (init_expr, off);
1568 else
1569 ni = fold_build2 (PLUS_EXPR, type,
1570 init_expr, fold_convert (type, off));
1572 var = create_tmp_var (type, "tmp");
1574 last_gsi = gsi_last_bb (exit_bb);
1575 gimple_seq new_stmts = NULL;
1576 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1577 /* Exit_bb shouldn't be empty. */
1578 if (!gsi_end_p (last_gsi))
1579 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1580 else
1581 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1583 /* Fix phi expressions in the successor bb. */
1584 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1588 /* Return a gimple value containing the misalignment (measured in vector
1589 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1590 it is away from a perfectly aligned address. Add any new statements
1591 to SEQ. */
1593 static tree
1594 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1596 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1597 stmt_vec_info stmt_info = dr_info->stmt;
1598 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1600 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1601 unsigned HOST_WIDE_INT target_align_c;
1602 tree target_align_minus_1;
1604 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1605 size_zero_node) < 0;
1606 tree offset = (negative
1607 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1608 * TREE_INT_CST_LOW
1609 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1610 : size_zero_node);
1611 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1612 stmt_info, seq,
1613 offset);
1614 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1615 if (target_align.is_constant (&target_align_c))
1616 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1617 else
1619 tree vla = build_int_cst (type, target_align);
1620 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1621 fold_build2 (MINUS_EXPR, type,
1622 build_int_cst (type, 0), vla));
1623 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1624 build_int_cst (type, 1));
1627 HOST_WIDE_INT elem_size
1628 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1629 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1631 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1632 tree int_start_addr = fold_convert (type, start_addr);
1633 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1634 target_align_minus_1);
1636 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1637 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1638 elem_size_log);
1640 return misalign_in_elems;
1643 /* Function vect_gen_prolog_loop_niters
1645 Generate the number of iterations which should be peeled as prolog for the
1646 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1647 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1648 As a result, after the execution of this loop, the data reference DR will
1649 refer to an aligned location. The following computation is generated:
1651 If the misalignment of DR is known at compile time:
1652 addr_mis = int mis = DR_MISALIGNMENT (dr);
1653 Else, compute address misalignment in bytes:
1654 addr_mis = addr & (target_align - 1)
1656 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1658 (elem_size = element type size; an element is the scalar element whose type
1659 is the inner type of the vectype)
1661 The computations will be emitted at the end of BB. We also compute and
1662 store upper bound (included) of the result in BOUND.
1664 When the step of the data-ref in the loop is not 1 (as in interleaved data
1665 and SLP), the number of iterations of the prolog must be divided by the step
1666 (which is equal to the size of interleaved group).
1668 The above formulas assume that VF == number of elements in the vector. This
1669 may not hold when there are multiple-types in the loop.
1670 In this case, for some data-references in the loop the VF does not represent
1671 the number of elements that fit in the vector. Therefore, instead of VF we
1672 use TYPE_VECTOR_SUBPARTS. */
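/* Worked example (illustrative): with a 16-byte target alignment,
   4-byte elements and a start address whose misalignment is 8 bytes,
   misalign_in_elems = 8 / 4 = 2, align_in_elems = 16 / 4 = 4 and the
   prolog peels (4 - 2) & 3 = 2 scalar iterations (assuming step 1).  */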
1674 static tree
1675 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1676 basic_block bb, int *bound)
1678 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1679 tree var;
1680 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1681 gimple_seq stmts = NULL, new_stmts = NULL;
1682 tree iters, iters_name;
1683 stmt_vec_info stmt_info = dr_info->stmt;
1684 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1685 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1687 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1689 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "known peeling = %d.\n", npeel);
1695 iters = build_int_cst (niters_type, npeel);
1696 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1698 else
1700 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1701 tree type = TREE_TYPE (misalign_in_elems);
1702 HOST_WIDE_INT elem_size
1703 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1704 /* We only do prolog peeling if the target alignment is known at compile
1705 time. */
1706 poly_uint64 align_in_elems =
1707 exact_div (target_align, elem_size);
1708 tree align_in_elems_minus_1 =
1709 build_int_cst (type, align_in_elems - 1);
1710 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1712 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1713 & (align_in_elems - 1)). */
1714 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1715 size_zero_node) < 0;
1716 if (negative)
1717 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1718 align_in_elems_tree);
1719 else
1720 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1721 misalign_in_elems);
1722 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1723 iters = fold_convert (niters_type, iters);
1724 unsigned HOST_WIDE_INT align_in_elems_c;
1725 if (align_in_elems.is_constant (&align_in_elems_c))
1726 *bound = align_in_elems_c - 1;
1727 else
1728 *bound = -1;
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "niters for prolog loop: %T\n", iters);
1735 var = create_tmp_var (niters_type, "prolog_loop_niters");
1736 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1738 if (new_stmts)
1739 gimple_seq_add_seq (&stmts, new_stmts);
1740 if (stmts)
1742 gcc_assert (single_succ_p (bb));
1743 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1744 if (gsi_end_p (gsi))
1745 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1746 else
1747 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1749 return iters_name;
1753 /* Function vect_update_init_of_dr
1755 If CODE is PLUS, the vector loop starts NITERS iterations after the
1756 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1757 iterations before the scalar one (using masking to skip inactive
1758 elements). This function updates the information recorded in DR to
1759 account for the difference. Specifically, it updates the OFFSET
1760 field of DR_INFO. */
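/* For illustration (hypothetical values): if the prolog peels NITERS = 3
   iterations and DR_STEP (dr) is 8 bytes, then with CODE == PLUS_EXPR the
   recorded offset grows by 3 * 8 = 24 bytes, so the data reference now
   starts three scalar iterations later than it originally did.  */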
1762 static void
1763 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1765 struct data_reference *dr = dr_info->dr;
1766 tree offset = dr_info->offset;
1767 if (!offset)
1768 offset = build_zero_cst (sizetype);
1770 niters = fold_build2 (MULT_EXPR, sizetype,
1771 fold_convert (sizetype, niters),
1772 fold_convert (sizetype, DR_STEP (dr)));
1773 offset = fold_build2 (code, sizetype,
1774 fold_convert (sizetype, offset), niters);
1775 dr_info->offset = offset;
1779 /* Function vect_update_inits_of_drs
1781 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1782 CODE and NITERS are as for vect_update_init_of_dr. */
1784 void
1785 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1786 tree_code code)
1788 unsigned int i;
1789 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1790 struct data_reference *dr;
1792 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1794 /* Adjust niters to sizetype. We used to insert the stmts on the loop preheader
1795 here, but since we might use these niters to update the epilogue's niters
1796 and data references we can't insert them here as this definition might not
1797 always dominate its uses. */
1798 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1799 niters = fold_convert (sizetype, niters);
1801 FOR_EACH_VEC_ELT (datarefs, i, dr)
1803 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1804 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1805 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1806 vect_update_init_of_dr (dr_info, niters, code);
1810 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1811 by masking. This involves calculating the number of iterations to
1812 be peeled and then aligning all memory references appropriately. */
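/* Illustrative example (hypothetical numbers): with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 5, the computation below records
   8 - 5 = 3 in LOOP_VINFO_MASK_SKIP_NITERS and moves every data reference
   back by 3 scalar steps, so the masked loop starts at an aligned address
   with its first 3 elements inactive.  */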
1814 void
1815 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1817 tree misalign_in_elems;
1818 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1820 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1822 /* From the information recorded in LOOP_VINFO get the number of iterations
1823 that need to be skipped via masking. */
1824 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1826 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1827 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1828 misalign_in_elems = build_int_cst (type, misalign);
1830 else
1832 gimple_seq seq1 = NULL, seq2 = NULL;
1833 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1834 misalign_in_elems = fold_convert (type, misalign_in_elems);
1835 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1836 &seq2, true, NULL_TREE);
1837 gimple_seq_add_seq (&seq1, seq2);
1838 if (seq1)
1840 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1841 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1842 gcc_assert (!new_bb);
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location,
1848 "misalignment for fully-masked loop: %T\n",
1849 misalign_in_elems);
1851 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1853 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1856 /* This function builds ni_name = number of iterations. Statements
1857 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1858 it to TRUE if a new ssa_var is generated. */
1860 tree
1861 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1863 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1864 if (TREE_CODE (ni) == INTEGER_CST)
1865 return ni;
1866 else
1868 tree ni_name, var;
1869 gimple_seq stmts = NULL;
1870 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1872 var = create_tmp_var (TREE_TYPE (ni), "niters");
1873 ni_name = force_gimple_operand (ni, &stmts, false, var);
1874 if (stmts)
1876 gsi_insert_seq_on_edge_immediate (pe, stmts);
1877 if (new_var_p != NULL)
1878 *new_var_p = true;
1881 return ni_name;
1885 /* Calculate the number of iterations above which the vectorized loop will be
1886 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1887 of prolog loop. If it's integer const, the integer number is also passed
1888 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1889 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1890 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1891 threshold below which the scalar (rather than vectorized) loop will be
1892 executed. This function stores the upper bound (inclusive) of the result
1893 in BOUND_SCALAR. */
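/* Worked example (hypothetical values): for a constant prolog count
   INT_NITERS_PROLOG = 2, BOUND_EPILOG = 3, CHECK_PROFITABILITY true and
   TH = 7, the threshold is first decremented to 6 (maximum scalar niters
   rather than minimum vector niters); since the prolog count is constant,
   the result and BOUND_SCALAR are both max (2 + 3, 6) = 6.  */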
1895 static tree
1896 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1897 int bound_prolog, poly_int64 bound_epilog, int th,
1898 poly_uint64 *bound_scalar,
1899 bool check_profitability)
1901 tree type = TREE_TYPE (niters_prolog);
1902 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1903 build_int_cst (type, bound_epilog));
1905 *bound_scalar = bound_prolog + bound_epilog;
1906 if (check_profitability)
1908 /* TH indicates the minimum niters of vectorized loop, while we
1909 compute the maximum niters of scalar loop. */
1910 th--;
1911 /* Peeling for constant times. */
1912 if (int_niters_prolog >= 0)
1914 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1915 return build_int_cst (type, *bound_scalar);
1917 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1918 and BOUND_EPILOG are inclusive upper bounds. */
1919 if (known_ge (th, bound_prolog + bound_epilog))
1921 *bound_scalar = th;
1922 return build_int_cst (type, th);
1924 /* Need to do runtime comparison. */
1925 else if (maybe_gt (th, bound_epilog))
1927 *bound_scalar = upper_bound (*bound_scalar, th);
1928 return fold_build2 (MAX_EXPR, type,
1929 build_int_cst (type, th), niters);
1932 return niters;
1935 /* NITERS is the number of times that the original scalar loop executes
1936 after peeling. Work out the maximum number of iterations N that can
1937 be handled by the vectorized form of the loop and then either:
1939 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1941 niters_vector = N
1943 b) set *STEP_VECTOR_PTR to one and generate:
1945 niters_vector = N / vf
1947 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1948 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1949 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
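/* Worked example (hypothetical values): for NITERS = 17, a constant VF = 4,
   no partial vectors and no peeling for gaps, case b) applies:
   niters_vector = 17 >> 2 = 4 with *STEP_VECTOR_PTR = 1, so the vector loop
   covers 16 scalar iterations and one is left for the epilogue.  If NITERS
   might overflow, the equivalent ((17 - 4) >> 2) + 1 = 4 form is generated
   instead.  With a variable VF or with partial vectors, case a) applies:
   niters_vector = 17 and *STEP_VECTOR_PTR = VF.  */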
1951 void
1952 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1953 tree *niters_vector_ptr, tree *step_vector_ptr,
1954 bool niters_no_overflow)
1956 tree ni_minus_gap, var;
1957 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1958 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1960 tree log_vf = NULL_TREE;
1962 /* If epilogue loop is required because of data accesses with gaps, we
1963 subtract one iteration from the total number of iterations here for
1964 correct calculation of RATIO. */
1965 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1967 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1968 build_one_cst (type));
1969 if (!is_gimple_val (ni_minus_gap))
1971 var = create_tmp_var (type, "ni_gap");
1972 gimple *stmts = NULL;
1973 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1974 true, var);
1975 gsi_insert_seq_on_edge_immediate (pe, stmts);
1978 else
1979 ni_minus_gap = niters;
1981 unsigned HOST_WIDE_INT const_vf;
1982 if (vf.is_constant (&const_vf)
1983 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1985 /* Create: niters >> log2(vf) */
1986 /* If it's known that niters == number of latch executions + 1 doesn't
1987 overflow, we can generate niters >> log2(vf); otherwise we generate
1988 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1989 will be at least one. */
1990 log_vf = build_int_cst (type, exact_log2 (const_vf));
1991 if (niters_no_overflow)
1992 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1993 else
1994 niters_vector
1995 = fold_build2 (PLUS_EXPR, type,
1996 fold_build2 (RSHIFT_EXPR, type,
1997 fold_build2 (MINUS_EXPR, type,
1998 ni_minus_gap,
1999 build_int_cst (type, vf)),
2000 log_vf),
2001 build_int_cst (type, 1));
2002 step_vector = build_one_cst (type);
2004 else
2006 niters_vector = ni_minus_gap;
2007 step_vector = build_int_cst (type, vf);
2010 if (!is_gimple_val (niters_vector))
2012 var = create_tmp_var (type, "bnd");
2013 gimple_seq stmts = NULL;
2014 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2015 gsi_insert_seq_on_edge_immediate (pe, stmts);
2016 /* The peeling algorithm guarantees that the vector loop bound is at least
2017 ONE; we set range information to make the niters analyzer's life easier.
2018 Note the number of latch iterations can be TYPE_MAX_VALUE so
2019 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
2020 if (stmts != NULL && log_vf)
2022 if (niters_no_overflow)
2024 value_range vr (type,
2025 wi::one (TYPE_PRECISION (type)),
2026 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2027 TYPE_SIGN (type)),
2028 exact_log2 (const_vf),
2029 TYPE_SIGN (type)));
2030 set_range_info (niters_vector, vr);
2032 /* For VF == 1 the vector IV might also overflow so we cannot
2033 assert a minimum value of 1. */
2034 else if (const_vf > 1)
2036 value_range vr (type,
2037 wi::one (TYPE_PRECISION (type)),
2038 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2039 TYPE_SIGN (type))
2040 - (const_vf - 1),
2041 exact_log2 (const_vf), TYPE_SIGN (type))
2042 + 1);
2043 set_range_info (niters_vector, vr);
2047 *niters_vector_ptr = niters_vector;
2048 *step_vector_ptr = step_vector;
2050 return;
2053 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2054 loop specified by LOOP_VINFO after vectorization, compute the number
2055 of iterations before vectorization (niters_vector * vf) and store it
2056 to NITERS_VECTOR_MULT_VF_PTR. */
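/* E.g. (illustrative values): for niters_vector = 4 and a constant VF = 8
   this computes 4 << log2 (8) = 32, the number of scalar iterations handled
   by the vector loop.  */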
2058 static void
2059 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2060 tree niters_vector,
2061 tree *niters_vector_mult_vf_ptr)
2063 /* We should be using a step_vector of VF if VF is variable. */
2064 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2065 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2066 tree type = TREE_TYPE (niters_vector);
2067 tree log_vf = build_int_cst (type, exact_log2 (vf));
2068 basic_block exit_bb = single_exit (loop)->dest;
2070 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2071 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2072 niters_vector, log_vf);
2073 if (!is_gimple_val (niters_vector_mult_vf))
2075 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2076 gimple_seq stmts = NULL;
2077 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2078 &stmts, true, var);
2079 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2080 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2082 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2085 /* LCSSA_PHI is an lcssa phi of the EPILOG loop which is copied from LOOP;
2086 this function searches for the corresponding lcssa phi node in the exit
2087 bb of LOOP. If it is found, return the phi result; otherwise return
2088 NULL. */
2090 static tree
2091 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2092 gphi *lcssa_phi)
2094 gphi_iterator gsi;
2095 edge e = single_exit (loop);
2097 gcc_assert (single_pred_p (e->dest));
2098 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2100 gphi *phi = gsi.phi ();
2101 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2102 PHI_ARG_DEF (lcssa_phi, 0), 0))
2103 return PHI_RESULT (phi);
2105 return NULL_TREE;
2108 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2109 from SECOND/FIRST and puts it at the original loop's preheader/exit
2110 edge; the two loops are arranged as below:
2112 preheader_a:
2113 first_loop:
2114 header_a:
2115 i_1 = PHI<i_0, i_2>;
2117 i_2 = i_1 + 1;
2118 if (cond_a)
2119 goto latch_a;
2120 else
2121 goto between_bb;
2122 latch_a:
2123 goto header_a;
2125 between_bb:
2126 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2128 second_loop:
2129 header_b:
2130 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2131 or with i_2 if no LCSSA phi is created
2132 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2134 i_4 = i_3 + 1;
2135 if (cond_b)
2136 goto latch_b;
2137 else
2138 goto exit_bb;
2139 latch_b:
2140 goto header_b;
2142 exit_bb:
2144 This function creates loop closed SSA for the first loop and updates the
2145 second loop's PHI nodes by replacing the argument on the incoming edge
2146 with the result of the newly created lcssa PHI nodes. If
2147 CREATE_LCSSA_FOR_IV_PHIS is false, loop closed ssa phis will only be
2148 created for non-iv phis for the first loop.
2150 This function assumes exit bb of the first loop is preheader bb of the
2151 second loop, i.e., between_bb in the example code. With PHIs updated,
2152 the second loop will execute the remaining iterations of the first. */
2154 static void
2155 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2156 class loop *first, class loop *second,
2157 bool create_lcssa_for_iv_phis)
2159 gphi_iterator gsi_update, gsi_orig;
2160 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2162 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2163 edge second_preheader_e = loop_preheader_edge (second);
2164 basic_block between_bb = single_exit (first)->dest;
2166 gcc_assert (between_bb == second_preheader_e->src);
2167 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2168 /* Either the first loop or the second is the loop to be vectorized. */
2169 gcc_assert (loop == first || loop == second);
2171 for (gsi_orig = gsi_start_phis (first->header),
2172 gsi_update = gsi_start_phis (second->header);
2173 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2174 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2176 gphi *orig_phi = gsi_orig.phi ();
2177 gphi *update_phi = gsi_update.phi ();
2179 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2180 /* Generate lcssa PHI node for the first loop. */
2181 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2182 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2183 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2185 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2186 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2187 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2188 arg = new_res;
2191 /* Update PHI node in the second loop by replacing arg on the loop's
2192 incoming edge. */
2193 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2196 /* For epilogue peeling we have to make sure to copy all LC PHIs
2197 for correct vectorization of live stmts. */
2198 if (loop == first)
2200 basic_block orig_exit = single_exit (second)->dest;
2201 for (gsi_orig = gsi_start_phis (orig_exit);
2202 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2204 gphi *orig_phi = gsi_orig.phi ();
2205 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2206 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2207 continue;
2209 /* Already created in the above loop. */
2210 if (find_guard_arg (first, second, orig_phi))
2211 continue;
2213 tree new_res = copy_ssa_name (orig_arg);
2214 gphi *lcphi = create_phi_node (new_res, between_bb);
2215 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2220 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2221 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2222 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2223 appear like below:
2225 guard_bb:
2226 if (cond)
2227 goto merge_bb;
2228 else
2229 goto skip_loop;
2231 skip_loop:
2232 header_a:
2233 i_1 = PHI<i_0, i_2>;
2235 i_2 = i_1 + 1;
2236 if (cond_a)
2237 goto latch_a;
2238 else
2239 goto exit_a;
2240 latch_a:
2241 goto header_a;
2243 exit_a:
2244 i_5 = PHI<i_2>;
2246 merge_bb:
2247 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2249 update_loop:
2250 header_b:
2251 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2253 i_4 = i_3 + 1;
2254 if (cond_b)
2255 goto latch_b;
2256 else
2257 goto exit_bb;
2258 latch_b:
2259 goto header_b;
2261 exit_bb:
2263 This function creates PHI nodes at merge_bb and replaces the use of i_5
2264 in the update_loop's PHI node with the result of the new PHI node. */
2266 static void
2267 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2268 class loop *update_loop,
2269 edge guard_edge, edge merge_edge)
2271 location_t merge_loc, guard_loc;
2272 edge orig_e = loop_preheader_edge (skip_loop);
2273 edge update_e = loop_preheader_edge (update_loop);
2274 gphi_iterator gsi_orig, gsi_update;
2276 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2277 gsi_update = gsi_start_phis (update_loop->header));
2278 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2279 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2281 gphi *orig_phi = gsi_orig.phi ();
2282 gphi *update_phi = gsi_update.phi ();
2284 /* Generate new phi node at merge bb of the guard. */
2285 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2286 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2288 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2289 args in NEW_PHI for these edges. */
2290 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2291 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2292 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2293 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2294 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2295 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2297 /* Update phi in UPDATE_PHI. */
2298 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2302 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2303 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2304 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2305 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2306 The CFG looks like:
2308 loop:
2309 header_a:
2310 i_1 = PHI<i_0, i_2>;
2312 i_2 = i_1 + 1;
2313 if (cond_a)
2314 goto latch_a;
2315 else
2316 goto exit_a;
2317 latch_a:
2318 goto header_a;
2320 exit_a:
2322 guard_bb:
2323 if (cond)
2324 goto merge_bb;
2325 else
2326 goto epilog_loop;
2328 ;; fall_through_bb
2330 epilog_loop:
2331 header_b:
2332 i_3 = PHI<i_2, i_4>;
2334 i_4 = i_3 + 1;
2335 if (cond_b)
2336 goto latch_b;
2337 else
2338 goto merge_bb;
2339 latch_b:
2340 goto header_b;
2342 merge_bb:
2343 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2345 exit_bb:
2346 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2348 For each name used outside EPILOG (i.e., for each name that has an lcssa
2349 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2350 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2351 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2352 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2353 in exit_bb will also be updated. */
2355 static void
2356 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2357 edge guard_edge, edge merge_edge)
2359 gphi_iterator gsi;
2360 basic_block merge_bb = guard_edge->dest;
2362 gcc_assert (single_succ_p (merge_bb));
2363 edge e = single_succ_edge (merge_bb);
2364 basic_block exit_bb = e->dest;
2365 gcc_assert (single_pred_p (exit_bb));
2366 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2368 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2370 gphi *update_phi = gsi.phi ();
2371 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2373 tree merge_arg = NULL_TREE;
2375 /* If the old argument is an SSA_NAME use its current_def. */
2376 if (TREE_CODE (old_arg) == SSA_NAME)
2377 merge_arg = get_current_def (old_arg);
2378 /* If it's a constant or doesn't have a current_def, just use the old
2379 argument. */
2380 if (!merge_arg)
2381 merge_arg = old_arg;
2383 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2384 /* If the var is live after loop but not a reduction, we simply
2385 use the old arg. */
2386 if (!guard_arg)
2387 guard_arg = old_arg;
2389 /* Create new phi node in MERGE_BB: */
2390 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2391 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2393 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2394 the two PHI args in merge_phi for these edges. */
2395 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2396 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2398 /* Update the original phi in exit_bb. */
2399 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2403 /* EPILOG loop is duplicated from the original loop for vectorizing;
2404 the arg of its loop closed ssa PHI needs to be updated. */
2406 static void
2407 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2409 gphi_iterator gsi;
2410 basic_block exit_bb = single_exit (epilog)->dest;
2412 gcc_assert (single_pred_p (exit_bb));
2413 edge e = EDGE_PRED (exit_bb, 0);
2414 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2415 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2418 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2419 iterate exactly CONST_NITERS times. Make a final decision about
2420 whether the epilogue loop should be used, returning true if so. */
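/* E.g. (illustrative): when peeling for gaps is needed (gap_niters = 1),
   the check below rejects CONST_NITERS <= 2, so at least 3 remaining scalar
   iterations are needed before an epilogue loop is even considered.  */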
2422 static bool
2423 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2424 unsigned HOST_WIDE_INT const_niters)
2426 /* Avoid wrap-around when computing const_niters - 1. Also reject
2427 using an epilogue loop for a single scalar iteration, even if
2428 we could in principle implement that using partial vectors. */
2429 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2430 if (const_niters <= gap_niters + 1)
2431 return false;
2433 /* Install the number of iterations. */
2434 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2435 tree niters_tree = build_int_cst (niters_type, const_niters);
2436 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2438 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2439 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2441 /* Decide what to do if the number of epilogue iterations is not
2442 a multiple of the epilogue loop's vectorization factor. */
2443 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2446 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2447 Return a value that equals:
2449 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2450 - SKIP_VALUE when the main loop is skipped. */
2452 tree
2453 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2454 tree skip_value)
2456 gcc_assert (loop_vinfo->main_loop_edge);
2458 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2459 basic_block bb = loop_vinfo->main_loop_edge->dest;
2460 gphi *new_phi = create_phi_node (phi_result, bb);
2461 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2462 UNKNOWN_LOCATION);
2463 add_phi_arg (new_phi, skip_value,
2464 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2465 return phi_result;
2468 /* Function vect_do_peeling.
2470 Input:
2471 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2473 preheader:
2474 LOOP:
2475 header_bb:
2476 loop_body
2477 if (exit_loop_cond) goto exit_bb
2478 else goto header_bb
2479 exit_bb:
2481 - NITERS: The number of iterations of the loop.
2482 - NITERSM1: The number of iterations of the loop's latch.
2483 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2484 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2485 CHECK_PROFITABILITY is true.
2486 Output:
2487 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2488 iterate after vectorization; see vect_set_loop_condition for details.
2489 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2490 should be set to the number of scalar iterations handled by the
2491 vector loop. The SSA name is only used on exit from the loop.
2493 This function peels prolog and epilog from the loop, adds guards skipping
2494 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2495 would look like:
2497 guard_bb_1:
2498 if (prefer_scalar_loop) goto merge_bb_1
2499 else goto guard_bb_2
2501 guard_bb_2:
2502 if (skip_prolog) goto merge_bb_2
2503 else goto prolog_preheader
2505 prolog_preheader:
2506 PROLOG:
2507 prolog_header_bb:
2508 prolog_body
2509 if (exit_prolog_cond) goto prolog_exit_bb
2510 else goto prolog_header_bb
2511 prolog_exit_bb:
2513 merge_bb_2:
2515 vector_preheader:
2516 VECTOR LOOP:
2517 vector_header_bb:
2518 vector_body
2519 if (exit_vector_cond) goto vector_exit_bb
2520 else goto vector_header_bb
2521 vector_exit_bb:
2523 guard_bb_3:
2524 if (skip_epilog) goto merge_bb_3
2525 else goto epilog_preheader
2527 merge_bb_1:
2529 epilog_preheader:
2530 EPILOG:
2531 epilog_header_bb:
2532 epilog_body
2533 if (exit_epilog_cond) goto merge_bb_3
2534 else goto epilog_header_bb
2536 merge_bb_3:
2538 Note this function peels the prolog and epilog, and adds the guards,
2539 only when necessary.
2540 This function returns the epilogue loop if a decision was made to vectorize
2541 it, otherwise NULL.
2543 The analysis resulting in this epilogue loop's loop_vec_info was performed
2544 in the same vect_analyze_loop call as the main loop's. At that time
2545 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2546 vectorization factors than the main loop. This list is stored in the main
2547 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2548 vectorize the epilogue loop for a lower vectorization factor, the
2549 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2550 updated and linked to the epilogue loop. This is later used to vectorize
2551 the epilogue. The reason the loop_vec_info needs updating is that it was
2552 constructed based on the original main loop, and the epilogue loop is a
2553 copy of this loop, so all links pointing to statements in the original loop
2554 need updating. Furthermore, these loop_vec_infos share the
2555 data_reference's records, which will also need to be updated.
2557 TODO: Guard for prefer_scalar_loop should be emitted along with
2558 versioning conditions if loop versioning is needed. */
2561 class loop *
2562 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2563 tree *niters_vector, tree *step_vector,
2564 tree *niters_vector_mult_vf_var, int th,
2565 bool check_profitability, bool niters_no_overflow,
2566 tree *advance)
2568 edge e, guard_e;
2569 tree type = TREE_TYPE (niters), guard_cond;
2570 basic_block guard_bb, guard_to;
2571 profile_probability prob_prolog, prob_vector, prob_epilog;
2572 int estimated_vf;
2573 int prolog_peeling = 0;
2574 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2575 bool vect_epilogues_updated_niters = false;
2576 /* We currently do not support prolog peeling if the target alignment is not
2577 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2578 target alignment being constant. */
2579 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2580 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2581 return NULL;
2583 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2584 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2586 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2587 poly_uint64 bound_epilog = 0;
2588 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2589 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2590 bound_epilog += vf - 1;
2591 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2592 bound_epilog += 1;
2593 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2594 poly_uint64 bound_scalar = bound_epilog;
2596 if (!prolog_peeling && !epilog_peeling)
2597 return NULL;
2599 /* Before doing any peeling make sure to reset debug binds outside of
2600 the loop referring to defs not in LC SSA. */
2601 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2602 for (unsigned i = 0; i < loop->num_nodes; ++i)
2604 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2605 imm_use_iterator ui;
2606 gimple *use_stmt;
2607 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2608 gsi_next (&gsi))
2610 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2611 if (gimple_debug_bind_p (use_stmt)
2612 && loop != gimple_bb (use_stmt)->loop_father
2613 && !flow_loop_nested_p (loop,
2614 gimple_bb (use_stmt)->loop_father))
2616 gimple_debug_bind_reset_value (use_stmt);
2617 update_stmt (use_stmt);
2620 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2621 gsi_next (&gsi))
2623 ssa_op_iter op_iter;
2624 def_operand_p def_p;
2625 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2626 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2627 if (gimple_debug_bind_p (use_stmt)
2628 && loop != gimple_bb (use_stmt)->loop_father
2629 && !flow_loop_nested_p (loop,
2630 gimple_bb (use_stmt)->loop_father))
2632 gimple_debug_bind_reset_value (use_stmt);
2633 update_stmt (use_stmt);
2638 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2639 estimated_vf = vect_vf_for_cost (loop_vinfo);
2640 if (estimated_vf == 2)
2641 estimated_vf = 3;
2642 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2643 .apply_scale (estimated_vf - 1, estimated_vf);
2645 class loop *prolog, *epilog = NULL;
2646 class loop *first_loop = loop;
2647 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2649 /* SSA form needs to be up-to-date since we are going to manually
2650 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2651 update SSA state after that, so we have to make sure not to lose any
2652 pending update needs. */
2653 gcc_assert (!need_ssa_update_p (cfun));
2655 /* If we're vectorizing an epilogue loop, we have ensured that the
2656 virtual operand is in SSA form throughout the vectorized main loop.
2657 Normally it is possible to trace the updated
2658 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2659 back to scalar-stmt vuses, meaning that the effect of the SSA update
2660 remains local to the main loop. However, there are rare cases in
2661 which the vectorized loop should have vdefs even when the original scalar
2662 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2663 introduces clobbers of the temporary vector array, which in turn
2664 needs new vdefs. If the scalar loop doesn't write to memory, these
2665 new vdefs will be the only ones in the vector loop.
2666 We are currently deferring the update of virtual SSA form and the creation
2667 of a virtual PHI for this case, so we do not have to make sure the
2668 newly introduced virtual def is in LCSSA form. */
2670 if (MAY_HAVE_DEBUG_BIND_STMTS)
2672 gcc_assert (!adjust_vec.exists ());
2673 adjust_vec.create (32);
2675 initialize_original_copy_tables ();
2677 /* Record the anchor bb at which the guard should be placed if the scalar
2678 loop might be preferred. */
2679 basic_block anchor = loop_preheader_edge (loop)->src;
2681 /* Generate the number of iterations for the prolog loop. We do this here
2682 so that we can also get the upper bound on the number of iterations. */
2683 tree niters_prolog;
2684 int bound_prolog = 0;
2685 if (prolog_peeling)
2686 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2687 &bound_prolog);
2688 else
2689 niters_prolog = build_int_cst (type, 0);
2691 loop_vec_info epilogue_vinfo = NULL;
2692 if (vect_epilogues)
2694 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2695 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2698 tree niters_vector_mult_vf = NULL_TREE;
2699 /* Save NITERS before the loop, as it may be changed by the prologue. */
2700 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2701 edge update_e = NULL, skip_e = NULL;
2702 unsigned int lowest_vf = constant_lower_bound (vf);
2703 /* If we know the number of scalar iterations for the main loop we should
2704 check whether after the main loop there are enough iterations left over
2705 for the epilogue. */
2706 if (vect_epilogues
2707 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2708 && prolog_peeling >= 0
2709 && known_eq (vf, lowest_vf))
2711 unsigned HOST_WIDE_INT eiters
2712 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2713 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2715 eiters -= prolog_peeling;
2716 eiters
2717 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2719 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2721 delete epilogue_vinfo;
2722 epilogue_vinfo = NULL;
2723 if (loop_vinfo->epilogue_vinfos.length () == 0)
2725 vect_epilogues = false;
2726 break;
2728 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2729 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2731 vect_epilogues_updated_niters = true;
2733 /* Prolog loop may be skipped. */
2734 bool skip_prolog = (prolog_peeling != 0);
2735 /* Skip this loop and go straight to the epilog when there are not enough
2736 iterations to enter this vectorized loop. If true we should perform
2737 runtime checks on NITERS to decide whether to skip the current
2738 vectorized loop. If we know the number of scalar iterations, we may
2739 choose to add a runtime check when this number may potentially be
2740 smaller than the number of iterations required to enter this loop;
2741 for this we use the upper bounds on the prolog and epilog peeling.
2742 When we don't know the number of iterations and don't require
2743 versioning, it is because we have asserted that there are enough
2744 scalar iterations to enter the main loop, so this skip is not
2745 necessary. When we are versioning then we only add such a skip if
2746 we have chosen to vectorize the epilogue. */
2747 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2748 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2749 bound_prolog + bound_epilog)
2750 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2751 || vect_epilogues));
2752 /* Epilog loop must be executed if the number of iterations for epilog
2753 loop is known at compile time, otherwise we need to add a check at
2754 the end of vector loop and skip to the end of epilog loop. */
2755 bool skip_epilog = (prolog_peeling < 0
2756 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2757 || !vf.is_constant ());
2758 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2759 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2760 skip_epilog = false;
2762 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2763 auto_vec<profile_count> original_counts;
2764 basic_block *original_bbs = NULL;
2766 if (skip_vector)
2768 split_edge (loop_preheader_edge (loop));
2770 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2772 original_bbs = get_loop_body (loop);
2773 for (unsigned int i = 0; i < loop->num_nodes; i++)
2774 original_counts.safe_push(original_bbs[i]->count);
2777 /* Due to the order in which we peel prolog and epilog, we first
2778 propagate probability to the whole loop. The purpose is to
2779 avoid adjusting probabilities of both prolog and vector loops
2780 separately. Note in this case, the probability of epilog loop
2781 needs to be scaled back later. */
2782 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2783 if (prob_vector.initialized_p ())
2785 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2786 scale_loop_profile (loop, prob_vector, 0);
2790 dump_user_location_t loop_loc = find_loop_location (loop);
2791 if (vect_epilogues)
2792 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2793 use the original scalar loop as remaining epilogue if necessary. */
2794 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2795 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2797 if (prolog_peeling)
2799 e = loop_preheader_edge (loop);
2800 if (!slpeel_can_duplicate_loop_p (loop, e))
2802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2803 "loop can't be duplicated to preheader edge.\n");
2804 gcc_unreachable ();
2806 /* Peel prolog and put it on preheader edge of loop. */
2807 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2808 if (!prolog)
2810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2811 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2812 gcc_unreachable ();
2814 prolog->force_vectorize = false;
2815 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2816 first_loop = prolog;
2817 reset_original_copy_tables ();
2819 /* Update the number of iterations for prolog loop. */
2820 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2821 vect_set_loop_condition (prolog, NULL, niters_prolog,
2822 step_prolog, NULL_TREE, false);
2824 /* Skip the prolog loop. */
2825 if (skip_prolog)
2827 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2828 niters_prolog, build_int_cst (type, 0));
2829 guard_bb = loop_preheader_edge (prolog)->src;
2830 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2831 guard_to = split_edge (loop_preheader_edge (loop));
2832 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2833 guard_to, guard_bb,
2834 prob_prolog.invert (),
2835 irred_flag);
2836 e = EDGE_PRED (guard_to, 0);
2837 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2838 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2840 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2841 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2844 /* Update init address of DRs. */
2845 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2846 /* Update niters for vector loop. */
2847 LOOP_VINFO_NITERS (loop_vinfo)
2848 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2849 LOOP_VINFO_NITERSM1 (loop_vinfo)
2850 = fold_build2 (MINUS_EXPR, type,
2851 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2852 bool new_var_p = false;
2853 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2854 /* It's guaranteed that vector loop bound before vectorization is at
2855 least VF, so set range information for newly generated var. */
2856 if (new_var_p)
2858 value_range vr (type,
2859 wi::to_wide (build_int_cst (type, vf)),
2860 wi::to_wide (TYPE_MAX_VALUE (type)));
2861 set_range_info (niters, vr);
2864 /* Prolog iterates at most bound_prolog times, latch iterates at
2865 most bound_prolog - 1 times. */
2866 record_niter_bound (prolog, bound_prolog - 1, false, true);
2867 delete_update_ssa ();
2868 adjust_vec_debug_stmts ();
2869 scev_reset ();
2872 if (epilog_peeling)
2874 e = single_exit (loop);
2875 if (!slpeel_can_duplicate_loop_p (loop, e))
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2878 "loop can't be duplicated to exit edge.\n");
2879 gcc_unreachable ();
2881 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2882 said epilog then we should use a copy of the main loop as a starting
2883 point. This loop may have already had some preliminary transformations
2884 to allow for more optimal vectorization, for example if-conversion.
2885 If we are not vectorizing the epilog then we should use the scalar loop
2886 as the transformations mentioned above make little or no sense when not
2887 vectorizing. */
2888 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2889 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2890 if (!epilog)
2892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2893 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2894 gcc_unreachable ();
2896 epilog->force_vectorize = false;
2897 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2899 /* The scalar version of the loop may be preferred. In this case, add a guard
2900 and skip to epilog. Note this only happens when the number of
2901 iterations of loop is unknown at compile time, otherwise this
2902 won't be vectorized. */
2903 if (skip_vector)
2905 /* Additional epilogue iteration is peeled if gap exists. */
2906 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2907 bound_prolog, bound_epilog,
2908 th, &bound_scalar,
2909 check_profitability);
2910 /* Build guard against NITERSM1 since NITERS may overflow. */
2911 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2912 guard_bb = anchor;
2913 guard_to = split_edge (loop_preheader_edge (epilog));
2914 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2915 guard_to, guard_bb,
2916 prob_vector.invert (),
2917 irred_flag);
2918 skip_e = guard_e;
2919 e = EDGE_PRED (guard_to, 0);
2920 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2921 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2923 /* Simply propagate profile info from guard_bb to guard_to which is
2924 a merge point of control flow. */
2925 guard_to->count = guard_bb->count;
2927 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
2928 if (vect_epilogues || scalar_loop == NULL)
2930 gcc_assert(epilog->num_nodes == loop->num_nodes);
2931 basic_block *bbs = get_loop_body (epilog);
2932 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2934 gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
2935 bbs[i]->count = original_counts[i];
2937 free (bbs);
2938 free (original_bbs);
2942 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2943 /* If the loop is peeled a non-zero constant number of times, niters now
2944 refers to orig_niters - prolog_peeling; it won't overflow even if
2945 orig_niters overflows. */
2946 niters_no_overflow |= (prolog_peeling > 0);
2947 vect_gen_vector_loop_niters (loop_vinfo, niters,
2948 niters_vector, step_vector,
2949 niters_no_overflow);
2950 if (!integer_onep (*step_vector))
2952 /* On exit from the loop we will have an easy way of calculating
2953 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2954 until then. */
2955 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2956 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2957 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2959 else
2960 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2961 &niters_vector_mult_vf);
2962 /* Update IVs of original loop as if they were advanced by
2963 niters_vector_mult_vf steps. */
2964 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2965 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2966 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2967 update_e);
2969 if (skip_epilog)
2971 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2972 niters, niters_vector_mult_vf);
2973 guard_bb = single_exit (loop)->dest;
2974 guard_to = split_edge (single_exit (epilog));
2975 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2976 skip_vector ? anchor : guard_bb,
2977 prob_epilog.invert (),
2978 irred_flag);
2979 if (vect_epilogues)
2980 epilogue_vinfo->skip_this_loop_edge = guard_e;
2981 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2982 single_exit (epilog));
2983 /* Only need to handle basic block before epilog loop if it's not
2984 the guard_bb, which is the case when skip_vector is true. */
2985 if (guard_bb != bb_before_epilog)
2987 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2989 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2991 scale_loop_profile (epilog, prob_epilog, 0);
2993 else
2994 slpeel_update_phi_nodes_for_lcssa (epilog);
2996 unsigned HOST_WIDE_INT bound;
2997 if (bound_scalar.is_constant (&bound))
2999 gcc_assert (bound != 0);
3000 /* -1 to convert loop iterations to latch iterations. */
3001 record_niter_bound (epilog, bound - 1, false, true);
3004 delete_update_ssa ();
3005 adjust_vec_debug_stmts ();
3006 scev_reset ();
3009 if (vect_epilogues)
3011 epilog->aux = epilogue_vinfo;
3012 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3014 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3016 /* We must now calculate NITERS, the number of iterations performed by the
3017 previous loop, and EPILOGUE_NITERS, the number to be performed by the epilogue. */
3018 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3019 niters_prolog, niters_vector_mult_vf);
3021 /* If skip_vector we may skip the previous loop; we insert a phi-node to
3022 determine whether we are coming from the previous vectorized loop
3023 using the update_e edge or the skip_vector basic block using the
3024 skip_e edge. */
3025 if (skip_vector)
3027 gcc_assert (update_e != NULL
3028 && skip_e != NULL
3029 && !vect_epilogues_updated_niters);
3030 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3031 update_e->dest);
3032 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3033 gimple *stmt = gimple_build_assign (new_ssa, niters);
3034 gimple_stmt_iterator gsi;
3035 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3036 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3038 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3039 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3041 else
3043 gsi = gsi_last_bb (update_e->src);
3044 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3047 niters = new_ssa;
3048 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3049 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3050 UNKNOWN_LOCATION);
3051 niters = PHI_RESULT (new_phi);
3052 epilogue_vinfo->main_loop_edge = update_e;
3053 epilogue_vinfo->skip_main_loop_edge = skip_e;
3056 /* Set ADVANCE to the number of iterations performed by the previous
3057 loop and its prologue. */
3058 *advance = niters;
3060 if (!vect_epilogues_updated_niters)
3062 /* Subtract the number of iterations performed by the vectorized loop
3063 from the number of total iterations. */
3064 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3065 before_loop_niters,
3066 niters);
3068 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3069 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3070 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3071 epilogue_niters,
3072 build_one_cst (TREE_TYPE (epilogue_niters)));
3074 /* Decide what to do if the number of epilogue iterations is not
3075 a multiple of the epilogue loop's vectorization factor.
3076 We should have rejected the loop during the analysis phase
3077 if this fails. */
3078 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3079 true))
3080 gcc_unreachable ();
3084 adjust_vec.release ();
3085 free_original_copy_tables ();
3087 return vect_epilogues ? epilog : NULL;
3090 /* Function vect_create_cond_for_niters_checks.
3092 Create a conditional expression that represents the run-time checks for
3093 the loop's niter. The loop is guaranteed to terminate if the run-time
3094 checks hold.
3096 Input:
3097 COND_EXPR - input conditional expression. New conditions will be chained
3098 with logical AND operation. If it is NULL, the new
3099 condition is used on its own.
3100 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3101 assumptions to be checked.
3103 Output:
3104 COND_EXPR - conditional expression.
3106 The returned COND_EXPR is the conditional expression to be used in the
3107 if statement that controls which version of the loop gets executed at
3108 runtime. */
3110 static void
3111 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3113 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3115 if (*cond_expr)
3116 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3117 *cond_expr, part_cond_expr);
3118 else
3119 *cond_expr = part_cond_expr;
3122 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3123 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3125 static void
3126 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3128 if (*cond_expr)
3129 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3130 *cond_expr, part_cond_expr);
3131 else
3132 *cond_expr = part_cond_expr;
3135 /* Function vect_create_cond_for_align_checks.
3137 Create a conditional expression that represents the alignment checks for
3138 all of data references (array element references) whose alignment must be
3139 checked at runtime.
3141 Input:
3142 COND_EXPR - input conditional expression. New conditions will be chained
3143 with logical AND operation.
3144 LOOP_VINFO - two fields of the loop information are used.
3145 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3146 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3148 Output:
3149 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3150 expression.
3151 The returned value is the conditional expression to be used in the if
3152 statement that controls which version of the loop gets executed at runtime.
3154 The algorithm makes two assumptions:
3155 1) The number of bytes "n" in a vector is a power of 2.
3156 2) An address "a" is aligned if a%n is zero, and this
3157 test can be done as a&(n-1) == 0. For example, for 16
3158 byte vectors the test is a&0xf == 0. */
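/* For example (hypothetical data references): with two references whose
   first-vector addresses are a1 and a2 and with 16-byte vectors
   (LOOP_VINFO_PTR_MASK = 0xf), the emitted condition is
     ((a1 | a2) & 0xf) == 0
   which is true only when both addresses are 16-byte aligned.  */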
3160 static void
3161 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3162 tree *cond_expr,
3163 gimple_seq *cond_expr_stmt_list)
3165 const vec<stmt_vec_info> &may_misalign_stmts
3166 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3167 stmt_vec_info stmt_info;
3168 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3169 tree mask_cst;
3170 unsigned int i;
3171 tree int_ptrsize_type;
3172 char tmp_name[20];
3173 tree or_tmp_name = NULL_TREE;
3174 tree and_tmp_name;
3175 gimple *and_stmt;
3176 tree ptrsize_zero;
3177 tree part_cond_expr;
3179 /* Check that mask is one less than a power of 2, i.e., mask is
3180 all zeros followed by all ones. */
3181 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3183 int_ptrsize_type = signed_type_for (ptr_type_node);
3185 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3186 of the first vector of the i'th data reference. */
3188 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3190 gimple_seq new_stmt_list = NULL;
3191 tree addr_base;
3192 tree addr_tmp_name;
3193 tree new_or_tmp_name;
3194 gimple *addr_stmt, *or_stmt;
3195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3196 bool negative = tree_int_cst_compare
3197 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3198 tree offset = negative
3199 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3200 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3201 : size_zero_node;
3203 /* create: addr_tmp = (int)(address_of_first_vector) */
3204 addr_base =
3205 vect_create_addr_base_for_vector_ref (loop_vinfo,
3206 stmt_info, &new_stmt_list,
3207 offset);
3208 if (new_stmt_list != NULL)
3209 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3211 sprintf (tmp_name, "addr2int%d", i);
3212 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3213 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3214 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3216 /* The addresses are ORed together. */
3218 if (or_tmp_name != NULL_TREE)
3220 /* create: or_tmp = or_tmp | addr_tmp */
3221 sprintf (tmp_name, "orptrs%d", i);
3222 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3223 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3224 or_tmp_name, addr_tmp_name);
3225 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3226 or_tmp_name = new_or_tmp_name;
3228 else
3229 or_tmp_name = addr_tmp_name;
3231 } /* end for i */
3233 mask_cst = build_int_cst (int_ptrsize_type, mask);
3235 /* create: and_tmp = or_tmp & mask */
3236 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3238 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3239 or_tmp_name, mask_cst);
3240 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3242 /* Make and_tmp the left operand of the conditional test against zero.
3243 If and_tmp has a nonzero bit then some address is unaligned. */
3244 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3245 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3246 and_tmp_name, ptrsize_zero);
3247 chain_cond_expr (cond_expr, part_cond_expr);
3250 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3251 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3252 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3253 and this new condition are true. Treat a null *COND_EXPR as "true". */
3255 static void
3256 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3258 const vec<vec_object_pair> &pairs
3259 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3260 unsigned int i;
3261 vec_object_pair *pair;
3262 FOR_EACH_VEC_ELT (pairs, i, pair)
3264 tree addr1 = build_fold_addr_expr (pair->first);
3265 tree addr2 = build_fold_addr_expr (pair->second);
3266 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3267 addr1, addr2);
3268 chain_cond_expr (cond_expr, part_cond_expr);
3272 /* Create an expression that is true when all lower-bound conditions for
3273 the vectorized loop are met. Chain this condition with *COND_EXPR. */
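/* Worked example (hypothetical values): for a signed EXPR whose required
   minimum magnitude is 4 (min_value = 4, unsigned_p false), the emitted
   check is (unsigned) EXPR + 3 >= 7, which holds exactly when EXPR >= 4 or
   EXPR <= -4.  For an unsigned EXPR the check is simply EXPR >= 4.  */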
3275 static void
3276 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3278 const vec<vec_lower_bound> &lower_bounds
3279 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3280 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3282 tree expr = lower_bounds[i].expr;
3283 tree type = unsigned_type_for (TREE_TYPE (expr));
3284 expr = fold_convert (type, expr);
3285 poly_uint64 bound = lower_bounds[i].min_value;
3286 if (!lower_bounds[i].unsigned_p)
3288 expr = fold_build2 (PLUS_EXPR, type, expr,
3289 build_int_cstu (type, bound - 1));
3290 bound += bound - 1;
3292 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3293 build_int_cstu (type, bound));
3294 chain_cond_expr (cond_expr, part_cond_expr);
3298 /* Function vect_create_cond_for_alias_checks.
3300 Create a conditional expression that represents the run-time checks for
3301 overlapping of address ranges represented by a list of data references
3302 relations passed as input.
3304 Input:
3305 COND_EXPR - input conditional expression. New conditions will be chained
3306 with logical AND operation. If it is NULL, then the function
3307 is used to return the number of alias checks.
3308 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3309 to be checked.
3311 Output:
3312 COND_EXPR - conditional expression.
3314 The returned COND_EXPR is the conditional expression to be used in the if
3315 statement that controls which version of the loop gets executed at runtime.
3318 void
3319 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3321 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3322 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3324 if (comp_alias_ddrs.is_empty ())
3325 return;
3327 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3328 &comp_alias_ddrs, cond_expr);
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "created %u versioning for alias checks.\n",
3332 comp_alias_ddrs.length ());
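  /* The checks emitted by create_runtime_alias_checks are, in the simplest
     case, segment-overlap tests; a rough sketch for a pair of datarefs
     A and B with segment lengths LEN_A and LEN_B is

       &A + LEN_A <= &B  ||  &B + LEN_B <= &A

     though the exact form depends on the access sizes, steps and whether
     the bounds can be folded at compile time.  */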
3336 /* Function vect_loop_versioning.
3338 If the loop has data references that may or may not be aligned and/or
3339 has data reference relations whose independence was not proven, then
3340 two versions of the loop need to be generated, one which is vectorized
3341 and one which isn't. A test is then generated to control which of the
3342 loops is executed. The test checks for the alignment of all of the
3343 data references that may or may not be aligned. An additional
3344 sequence of runtime tests is generated for each pair of DDRs whose
3345 independence was not proven. The vectorized version of the loop is
3346 executed only if both the alias and alignment tests pass.
3348 The test generated to check which version of the loop is executed
3349 is modified to also check for profitability as indicated by the
3350 cost model threshold TH.
3352 The versioning precondition(s) are placed in *COND_EXPR and
3353 *COND_EXPR_STMT_LIST. */
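/* A rough sketch of the control flow this produces, with the versioning
   condition guarding the two copies:

     if (versioning_cond)   <-- cost-model, alignment, alias, niter checks
       <loop to be vectorized>
     else
       <scalar fallback loop>

   Both versions reach the same merge block after their exits.  */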
3355 class loop *
3356 vect_loop_versioning (loop_vec_info loop_vinfo,
3357 gimple *loop_vectorized_call)
3359 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3360 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3361 basic_block condition_bb;
3362 gphi_iterator gsi;
3363 gimple_stmt_iterator cond_exp_gsi;
3364 basic_block merge_bb;
3365 basic_block new_exit_bb;
3366 edge new_exit_e, e;
3367 gphi *orig_phi, *new_phi;
3368 tree cond_expr = NULL_TREE;
3369 gimple_seq cond_expr_stmt_list = NULL;
3370 tree arg;
3371 profile_probability prob = profile_probability::likely ();
3372 gimple_seq gimplify_stmt_list = NULL;
3373 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3374 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3375 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3376 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3377 poly_uint64 versioning_threshold
3378 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3379 tree version_simd_if_cond
3380 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3381 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3383 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3384 && !ordered_p (th, versioning_threshold))
3385 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3386 build_int_cst (TREE_TYPE (scalar_loop_iters),
3387 th - 1));
3388 if (maybe_ne (versioning_threshold, 0U))
3390 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3391 build_int_cst (TREE_TYPE (scalar_loop_iters),
3392 versioning_threshold - 1));
3393 if (cond_expr)
3394 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3395 expr, cond_expr);
3396 else
3397 cond_expr = expr;
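  /* Note that SCALAR_LOOP_ITERS is LOOP_VINFO_NITERSM1, i.e. the iteration
     count minus one, which is why both thresholds are reduced by one above:
     e.g. a cost-model threshold of 8 iterations becomes the runtime test
     scalar_loop_iters >= 7.  */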
3400 tree cost_name = NULL_TREE;
3401 profile_probability prob2 = profile_probability::uninitialized ();
3402 if (cond_expr
3403 && !integer_truep (cond_expr)
3404 && (version_niter
3405 || version_align
3406 || version_alias
3407 || version_simd_if_cond))
3409 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3410 &cond_expr_stmt_list,
3411 is_gimple_val, NULL_TREE);
3412 /* Split prob () into two so that the overall probability of passing
3413 both the cost-model and versioning checks is the original prob. */
3414 prob2 = prob.split (prob);
3417 if (version_niter)
3418 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3420 if (cond_expr)
3422 gimple_seq tem = NULL;
3423 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3424 &tem, is_gimple_condexpr_for_cond,
3425 NULL_TREE);
3426 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3429 if (version_align)
3430 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3431 &cond_expr_stmt_list);
3433 if (version_alias)
3435 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3436 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3437 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3440 if (version_simd_if_cond)
3442 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3443 if (flag_checking)
3444 if (basic_block bb
3445 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3446 gcc_assert (bb != loop->header
3447 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3448 && (scalar_loop == NULL
3449 || (bb != scalar_loop->header
3450 && dominated_by_p (CDI_DOMINATORS,
3451 scalar_loop->header, bb))));
3452 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3453 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3454 version_simd_if_cond, zero);
3455 if (cond_expr)
3456 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3457 c, cond_expr);
3458 else
3459 cond_expr = c;
3460 if (dump_enabled_p ())
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "created versioning for simd if condition check.\n");
3465 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3466 &gimplify_stmt_list,
3467 is_gimple_condexpr_for_cond, NULL_TREE);
3468 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3470 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3471 invariant in. */
3472 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3473 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3474 !gsi_end_p (gsi); gsi_next (&gsi))
3476 gimple *stmt = gsi_stmt (gsi);
3477 update_stmt (stmt);
3478 ssa_op_iter iter;
3479 use_operand_p use_p;
3480 basic_block def_bb;
3481 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3482 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3483 && flow_bb_inside_loop_p (outermost, def_bb))
3484 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3487 /* Search for the outermost loop we can version. Avoid versioning of
3488 non-perfect nests but allow if-conversion versioned loops inside. */
3489 class loop *loop_to_version = loop;
3490 if (flow_loop_nested_p (outermost, loop))
3492 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_NOTE, vect_location,
3494 "trying to apply versioning to outer loop %d\n",
3495 outermost->num);
3496 if (outermost->num == 0)
3497 outermost = superloop_at_depth (loop, 1);
3498 /* And avoid applying versioning on non-perfect nests. */
3499 while (loop_to_version != outermost
3500 && (e = single_exit (loop_outer (loop_to_version)))
3501 && !(e->flags & EDGE_COMPLEX)
3502 && (!loop_outer (loop_to_version)->inner->next
3503 || vect_loop_vectorized_call (loop_to_version))
3504 && (!loop_outer (loop_to_version)->inner->next
3505 || !loop_outer (loop_to_version)->inner->next->next))
3506 loop_to_version = loop_outer (loop_to_version);
3509 /* Apply versioning. If there is already a scalar version created by
3510 if-conversion, re-use that. Note we cannot re-use the copy of
3511 an if-converted outer-loop when vectorizing the inner loop only. */
3512 gcond *cond;
3513 if ((!loop_to_version->inner || loop == loop_to_version)
3514 && loop_vectorized_call)
3516 gcc_assert (scalar_loop);
3517 condition_bb = gimple_bb (loop_vectorized_call);
3518 cond = as_a <gcond *> (last_stmt (condition_bb));
3519 gimple_cond_set_condition_from_tree (cond, cond_expr);
3520 update_stmt (cond);
3522 if (cond_expr_stmt_list)
3524 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3525 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3526 GSI_SAME_STMT);
3529 /* if-conversion uses profile_probability::always () for both paths,
3530 reset the probabilities of both paths appropriately. */
3531 edge te, fe;
3532 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3533 te->probability = prob;
3534 fe->probability = prob.invert ();
3535 /* We can scale loop counts immediately but have to postpone
3536 scaling the scalar loop because we re-use it during peeling. */
3537 scale_loop_frequencies (loop_to_version, te->probability);
3538 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3540 nloop = scalar_loop;
3541 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "reusing %sloop version created by if conversion\n",
3544 loop_to_version != loop ? "outer " : "");
3546 else
3548 if (loop_to_version != loop
3549 && dump_enabled_p ())
3550 dump_printf_loc (MSG_NOTE, vect_location,
3551 "applying loop versioning to outer loop %d\n",
3552 loop_to_version->num);
3554 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3556 initialize_original_copy_tables ();
3557 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3558 prob, prob.invert (), prob, prob.invert (), true);
3559 gcc_assert (nloop);
3560 nloop = get_loop_copy (loop);
3562 /* For cycle vectorization with SLP we rely on the PHI arguments
3563 appearing in the same order as the SLP node operands, which for the
3564 loop PHI nodes means the preheader edge dest index needs to remain
3565 the same for the analyzed loop, which also becomes the vectorized one.
3566 If the state after versioning differs, make it so by redirecting
3567 the first edge into the header to the same destination, which moves
3568 it to the last position. */
3569 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3571 edge e = EDGE_PRED (loop->header, 0);
3572 ssa_redirect_edge (e, e->dest);
3573 flush_pending_stmts (e);
3575 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3577 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3578 reap those otherwise, and they also refer to the original
3579 loops. */
3580 class loop *l = loop;
3581 while (gimple *call = vect_loop_vectorized_call (l))
3583 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3584 fold_loop_internal_call (call, boolean_false_node);
3585 l = loop_outer (l);
3587 free_original_copy_tables ();
3589 if (cond_expr_stmt_list)
3591 cond_exp_gsi = gsi_last_bb (condition_bb);
3592 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3593 GSI_SAME_STMT);
3596 /* Loop versioning violates an assumption we try to maintain during
3597 vectorization - that the loop exit block has a single predecessor.
3598 After versioning, the exit block of both loop versions is the same
3599 basic block (i.e. it has two predecessors). To simplify the
3600 following transformations in the vectorizer, we fix this situation
3601 here by adding a new (empty) block on the exit edge of the loop,
3602 with the proper loop-exit phis to maintain loop-closed-form.
3603 If loop versioning wasn't done from LOOP but from SCALAR_LOOP instead,
3604 merge_bb will already have just a single predecessor. */
3606 merge_bb = single_exit (loop_to_version)->dest;
3607 if (EDGE_COUNT (merge_bb->preds) >= 2)
3609 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3610 new_exit_bb = split_edge (single_exit (loop_to_version));
3611 new_exit_e = single_exit (loop_to_version);
3612 e = EDGE_SUCC (new_exit_bb, 0);
3614 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3615 gsi_next (&gsi))
3617 tree new_res;
3618 orig_phi = gsi.phi ();
3619 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3620 new_phi = create_phi_node (new_res, new_exit_bb);
3621 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3622 add_phi_arg (new_phi, arg, new_exit_e,
3623 gimple_phi_arg_location_from_edge (orig_phi, e));
3624 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3628 update_ssa (TODO_update_ssa_no_phi);
3631 /* Split the cost model check off to a separate BB. Costing assumes
3632 this is the only thing we perform when we enter the scalar loop
3633 from a failed cost decision. */
3634 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3636 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3637 /* All uses of the cost check are 'true' after the check we
3638 are going to insert. */
3639 replace_uses_by (cost_name, boolean_true_node);
3640 /* And we're going to build the new single use of it. */
3641 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3642 NULL_TREE, NULL_TREE);
3643 edge e = split_block (gimple_bb (def), def);
3644 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3645 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3646 edge true_e, false_e;
3647 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3648 e->flags &= ~EDGE_FALLTHRU;
3649 e->flags |= EDGE_TRUE_VALUE;
3650 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3651 e->probability = prob2;
3652 e2->probability = prob2.invert ();
3653 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3654 auto_vec<basic_block, 3> adj;
3655 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3656 son;
3657 son = next_dom_son (CDI_DOMINATORS, son))
3658 if (EDGE_COUNT (son->preds) > 1)
3659 adj.safe_push (son);
3660 for (auto son : adj)
3661 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
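  /* A rough sketch of the result: the cost check now ends its own block, so
     a failed cost decision branches straight to the scalar version without
     executing the remaining versioning-check statements; only when the cost
     check passes are the alignment/alias/niter checks evaluated to choose
     between the scalar and the to-be-vectorized copy.  */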
3664 if (version_niter)
3666 /* The versioned loop could be infinite; we need to clear the existing
3667 niter information, which is copied from the original loop. */
3668 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3669 vect_free_loop_info_assumptions (nloop);
3672 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3673 && dump_enabled_p ())
3675 if (version_alias)
3676 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3677 vect_location,
3678 "loop versioned for vectorization because of "
3679 "possible aliasing\n");
3680 if (version_align)
3681 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3682 vect_location,
3683 "loop versioned for vectorization to enhance "
3684 "alignment\n");
3688 return nloop;