gcc/tree-vect-loop-manip.cc

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53
  54 /*************************************************************************
  55   Simple Loop Peeling Utilities
  56
  57   Utilities to support loop peeling for vectorization purposes.
  58  *************************************************************************/
  59
  60
  61 /* Replace the SSA name used at *OP_P with its current definition, as
  62    reported by get_current_def.  Uses that are not SSA names, and names
  63    for which no current definition is recorded, are left untouched.  */
  64
  65 static void
  66 rename_use_op (use_operand_p op_p)
  67 {
  68   tree old_name = USE_FROM_PTR (op_p);
  69
  70   /* Only SSA names can be renamed.  */
  71   if (TREE_CODE (old_name) != SSA_NAME)
  72     return;
  73
  74   /* A null result means there is nothing to rename to, e.g. because the
  75      name is defined outside of the loop; otherwise this is an ordinary
  76      SSA name defined in the loop.  */
  77   tree replacement = get_current_def (old_name);
  78   if (replacement)
  79     SET_USE (op_p, replacement);
  80 }
  81
  82
  83 /* Rename the SSA uses in basic block BB: the use operands of each of its
  84    statements and the PHI arguments on its incoming edges.  PHI arguments
  85    on edges coming from outside BB's loop are renamed only if
  86    RENAME_FROM_OUTER_LOOP is true, subject to the edge check below.  */
  87
  88 static void
  89 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  90 {
  91   class loop *loop = bb->loop_father;
  92   class loop *outer_loop = NULL;
  93   if (rename_from_outer_loop)
  94     {
  95       gcc_assert (loop);
  96       outer_loop = loop_outer (loop);
  97     }
  98
  99   /* First the use operands of the ordinary statements.  */
 100   gimple_stmt_iterator stmt_gsi;
 101   for (stmt_gsi = gsi_start_bb (bb); !gsi_end_p (stmt_gsi);
 102        gsi_next (&stmt_gsi))
 103     {
 104       gimple *stmt = gsi_stmt (stmt_gsi);
 105       use_operand_p use_p;
 106       ssa_op_iter op_iter;
 107       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, op_iter, SSA_OP_ALL_USES)
 108         rename_use_op (use_p);
 109     }
 110
 111   /* Then the PHI arguments, one incoming edge at a time.  */
 112   edge pred;
 113   edge_iterator pred_iter;
 114   FOR_EACH_EDGE (pred, pred_iter, bb->preds)
 115     {
 116       /* Edges from outside LOOP are skipped unless renaming from the
 117          outer loop was requested.  Even then, if outer_loop has more than
 118          one inner loop, a source other than the outer header is accepted
 119          only when it is the extra basic block, with the header as its
 120          single predecessor, which decides which of the two loops to use
 121          using LOOP_VECTORIZED.  */
 122       bool skip_edge_p
 123         = (!flow_bb_inside_loop_p (loop, pred->src)
 124            && (!rename_from_outer_loop
 125                || (pred->src != outer_loop->header
 126                    && outer_loop->inner->next
 127                    && !(single_pred_p (pred->src)
 128                         && single_pred (pred->src) == outer_loop->header))));
 129       if (skip_edge_p)
 130         continue;
 131       for (gphi_iterator phi_gsi = gsi_start_phis (bb);
 132            !gsi_end_p (phi_gsi); gsi_next (&phi_gsi))
 133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi_gsi.phi (), pred));
 134     }
 135 }
 136
 137
 138 struct adjust_info  /* Consumed by adjust_debug_stmts_now.  */
 139 {
 140   tree from, to;    /* Debug uses of FROM are redirected to TO...  */
 141   basic_block bb;   /* ...where the use is dominated by BB.  */
 142 };
 143
 144 /* A stack of values to be adjusted in debug stmts.  We have to
 145    process them LIFO, so that the closest substitution applies.  If we
 146    processed them FIFO, without the stack, we might substitute uses
 147    with a PHI DEF that would soon become non-dominant, and when we got
 148    to the suitable one, it wouldn't have anything to substitute any
 149    more.  Entries are pushed by adjust_debug_stmts while the vector
        exists, and are drained by adjust_vec_debug_stmts.  */
 150 static vec<adjust_info, va_heap> adjust_vec;
 151
 152 /* Carry out the adjustment AI now.  Debug bind stmts that use AI->from
 153    from a block that is AI->bb or dominated by it, but that is neither the
 154    block defining AI->from nor dominated by it, are made to use AI->to;
 155    if AI->to is null, their bound value is reset instead.  */
 156
 157 static void
 158 adjust_debug_stmts_now (adjust_info *ai)
 159 {
 160   tree old_name = ai->from;
 161   tree new_name = ai->to;
 162   basic_block phi_bb = ai->bb;
 163   basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (old_name));
 164
 165   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 166
 167   /* Visit the debug stmts that held onto non-loop-closed references.  */
 168   imm_use_iterator use_iter;
 169   gimple *use_stmt;
 170   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, old_name)
 171     {
 172       if (!is_gimple_debug (use_stmt))
 173         continue;
 174
 175       gcc_assert (gimple_debug_bind_p (use_stmt));
 176
 177       basic_block use_bb = gimple_bb (use_stmt);
 178       /* Ignore uses outside the region dominated by PHI_BB.  */
 179       if (use_bb != phi_bb
 180           && !dominated_by_p (CDI_DOMINATORS, use_bb, phi_bb))
 181         continue;
 182       /* Uses still dominated by the original definition stay as they are.  */
 183       if (use_bb == def_bb
 184           || dominated_by_p (CDI_DOMINATORS, use_bb, def_bb))
 185         continue;
 186
 187       if (!new_name)
 188         {
 189           gimple_debug_bind_reset_value (use_stmt);
 190           update_stmt (use_stmt);
 191           continue;
 192         }
 193       use_operand_p use_p;
 194       FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 195         SET_USE (use_p, new_name);
 196     }
 197 }
 198
 199 /* Apply every adjustment queued on adjust_vec, taking the most recently
 200    pushed entry first, until the stack is empty.  Does nothing when
 201    MAY_HAVE_DEBUG_BIND_STMTS is false.  */
 202
 203 static void
 204 adjust_vec_debug_stmts (void)
 205 {
 206   if (MAY_HAVE_DEBUG_BIND_STMTS)
 207     {
 208       /* The stack must exist by the time adjustments are flushed.  */
 209       gcc_assert (adjust_vec.exists ());
 210
 211       for (; !adjust_vec.is_empty (); adjust_vec.pop ())
 212         adjust_debug_stmts_now (&adjust_vec.last ());
 213     }
 214 }
 215
 216 /* Request that debug stmts referencing FROM be changed to use the
 217    loop-closed TO wherever the reference is dominated by BB and not by
 218    the definition of FROM.  Nothing happens unless FROM is a non-virtual
 219    SSA name that is not a default definition.  While adjust_vec exists,
 220    requests are queued until adjust_vec_debug_stmts is called.  */
 221
 222 static void
 223 adjust_debug_stmts (tree from, tree to, basic_block bb)
 224 {
 225   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 226     return;
 227   if (TREE_CODE (from) != SSA_NAME || SSA_NAME_IS_DEFAULT_DEF (from)
 228       || virtual_operand_p (from))
 229     return;
 230
 231   adjust_info request;
 232   request.from = from;
 233   request.to = to;
 234   request.bb = bb;
 235   /* Queue the request if the stack exists, otherwise act on it now.  */
 236   if (adjust_vec.exists ())
 237     adjust_vec.safe_push (request);
 238   else
 239     adjust_debug_stmts_now (&request);
 240 }
 241
 242 /* Make NEW_DEF the argument of UPDATE_PHI on edge E.  The argument it
 243    replaces is read first and handed to adjust_debug_stmts, so that debug
 244    stmts still referring to it (presumably non-loop-closed references left
 245    over from other transformations) can use UPDATE_PHI's result instead.  */
 246
 247 static void
 248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 249 {
 250   tree old_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 251   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 252
 253   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 254     return;
 255   tree phi_res = PHI_RESULT (update_phi);
 256   adjust_debug_stmts (old_arg, phi_res, gimple_bb (update_phi));
 257 }
 258
 259 /* Define rgroup control CTRL as a PHI node in the header of LOOP.  The
 260    PHI receives INIT_CTRL on the preheader edge, i.e. for the first
 261    iteration, and NEXT_CTRL on the latch edge, i.e. for later ones.  */
 262
 263 static void
 264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 265                        tree next_ctrl)
 266 {
 267   gphi *node = create_phi_node (ctrl, loop->header);
 268   add_phi_arg (node, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 269   add_phi_arg (node, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 270 }
 271
 272 /* Append SEQ, if nonnull, to the end of LOOP's preheader block by
 273    inserting it on the preheader edge; the result is asserted to be null.  */
 274
 275 static void
 276 add_preheader_seq (class loop *loop, gimple_seq seq)
 277 {
 278   if (!seq)
 279     return;
 280   edge entry = loop_preheader_edge (loop);
 281   basic_block split_bb = gsi_insert_seq_on_edge_immediate (entry, seq);
 282   gcc_assert (split_bb == NULL);
 283 }
 284
 285 /* Insert SEQ, if nonnull, at the start of LOOP's header block, after
 286    any labels.  */
 287
 288 static void
 289 add_header_seq (class loop *loop, gimple_seq seq)
 290 {
 291   if (!seq)
 292     return;
 293   gimple_stmt_iterator insert_pos = gsi_after_labels (loop->header);
 294   gsi_insert_seq_before (&insert_pos, seq, GSI_SAME_STMT);
 295 }
 296
 297 /* Return true if the target can interleave the elements of two vectors of
 298    type VECTYPE, taking the first half of each if OFFSET is 0 and the
 299    second half if it is 1.  The permutation tested is left in INDICES.  */
 300
 301 static bool
 302 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 303                         unsigned int offset)
 304 {
 305   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 306   poly_uint64 first = exact_div (nelts, 2) * offset;
 307   /* The selector is { FIRST, FIRST + NELTS, FIRST + 1, ... }.  */
 308   vec_perm_builder builder (nelts, 2, 3);
 309   for (unsigned int elt = 0; elt < 3; ++elt)
 310     {
 311       builder.quick_push (first + elt);
 312       builder.quick_push (first + elt + nelts);
 313     }
 314   indices->new_vector (builder, 2, nelts);
 315   machine_mode mode = TYPE_MODE (vectype);
 316   return can_vec_perm_const_p (mode, mode, *indices);
 317 }
 318
 319 /* Try to use permutes to define the masks in DEST_RGM using the masks
 320    in SRC_RGM, given that the former has twice as many masks as the
 321    latter.  Return true on success, adding any new statements to SEQ.
        Two approaches are tried in turn: unpacking the low and high halves
        of each source mask, then interleaving each source mask with
        itself.  If neither applies, return false without adding anything
        to SEQ.  */
 322
 323 static bool
 324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 325                                rgroup_controls *src_rgm)
 326 {
 327   tree src_masktype = src_rgm->type;
 328   tree dest_masktype = dest_rgm->type;
 329   machine_mode src_mode = TYPE_MODE (src_masktype);
 330   insn_code icode1, icode2;
       /* First approach: unpacking.  ICODE1 and ICODE2 are assigned as a
          side effect of evaluating the condition, and are only used when
          both optab queries succeed.  */
 331   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 332       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 333                                   src_mode)) != CODE_FOR_nothing
 334       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 335                                   src_mode)) != CODE_FOR_nothing)
 336     {
 337       /* Unpacking the source masks gives at least as many mask bits as
 338          we need.  We can then VIEW_CONVERT any excess bits away.  */
 339       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 340       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 341       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 342       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 343         {
               /* Destination masks 2 * K and 2 * K + 1 both come from source
                  mask K.  */
 344           tree src = src_rgm->controls[i / 2];
 345           tree dest = dest_rgm->controls[i];
               /* Odd-numbered destinations take the high half on
                  little-endian targets and the low half on big-endian ones;
                  even-numbered destinations take the other half.  */
 346           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 347                             ? VEC_UNPACK_HI_EXPR
 348                             : VEC_UNPACK_LO_EXPR);
 349           gassign *stmt;
 350           if (dest_masktype == unpack_masktype)
 351             stmt = gimple_build_assign (dest, code, src);
 352           else
 353             {
                   /* Unpack into a temporary of the unpacked type, then
                      reinterpret it as the destination mask type.  */
 354               tree temp = make_ssa_name (unpack_masktype);
 355               stmt = gimple_build_assign (temp, code, src);
 356               gimple_seq_add_stmt (seq, stmt);
 357               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 358                                           build1 (VIEW_CONVERT_EXPR,
 359                                                   dest_masktype, temp));
 360             }
 361           gimple_seq_add_stmt (seq, stmt);
 362         }
 363       return true;
 364     }
       /* Second approach: interleaving, which requires both groups to use
          the same mask type.  INDICES[0] describes the first-half
          interleave and INDICES[1] the second-half one.  */
 365   vec_perm_indices indices[2];
 366   if (dest_masktype == src_masktype
 367       && interleave_supported_p (&indices[0], src_masktype, 0)
 368       && interleave_supported_p (&indices[1], src_masktype, 1))
 369     {
 370       /* The destination requires twice as many mask bits as the source, so
 371          we can use interleaving permutes to double up the number of bits.  */
 372       tree masks[2];
 373       for (unsigned int i = 0; i < 2; ++i)
 374         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 375       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 376         {
               /* Source mask I / 2 is permuted with itself; even-numbered
                  destinations use MASKS[0] and odd-numbered ones MASKS[1].  */
 377           tree src = src_rgm->controls[i / 2];
 378           tree dest = dest_rgm->controls[i];
 379           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 380                                               src, src, masks[i & 1]);
 381           gimple_seq_add_stmt (seq, stmt);
 382         }
 383       return true;
 384     }
 385   return false;
 386 }
 387
 388 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 389    for all the rgroup controls in RGC and return a control that is nonzero
 390    when the loop needs to iterate.  Add any new preheader statements to
 391    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
        If the loop uses a nonzero partial load/store bias, the statement
        that defines RGC->bias_adjusted_ctrl is added to HEADER_SEQ.
 392
 393    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 394    times and has been vectorized according to LOOP_VINFO.
 395
 396    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 397    starts with NITERS_SKIP dummy iterations of the scalar loop before
 398    the real work starts.  The mask elements for these dummy iterations
 399    must be 0, to ensure that the extra iterations do not have an effect.
 400
 401    It is known that:
 402
 403      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 404
 405    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 406    variable that starts at 0 and has step:
 407
 408      VF * RGC->max_nscalars_per_iter * RGC->factor
 409
 410    might overflow before hitting a value above:
 411
 412      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 413
 414    This means that we cannot guarantee that such an induction variable
 415    would ever hit a value that produces a set of all-false masks or zero
 416    lengths for RGC.
 417
 418    Note: the cost of the code generated by this function is modeled
 419    by vect_estimate_min_profitable_iters, so changes here may need
 420    corresponding changes there.  */
 421
 422 static tree
 423 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 424                                  gimple_seq *preheader_seq,
 425                                  gimple_seq *header_seq,
 426                                  gimple_stmt_iterator loop_cond_gsi,
 427                                  rgroup_controls *rgc, tree niters,
 428                                  tree niters_skip, bool might_wrap_p)
 429 {
 430   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 431   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 432   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 433
 434   tree ctrl_type = rgc->type;
 435   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 436   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 437   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 438   tree length_limit = NULL_TREE;
 439   /* For length, we need length_limit to ensure length in range.  */
 440   if (!use_masks_p)
 441     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 442
 443   /* Calculate the maximum number of item values that the rgroup
 444      handles in total, the number that it handles for each iteration
 445      of the vector loop, and the number that it should skip during the
 446      first iteration of the vector loop.  */
 447   tree nitems_total = niters;
 448   tree nitems_step = build_int_cst (iv_type, vf);
 449   tree nitems_skip = niters_skip;
 450   if (nitems_per_iter != 1)
 451     {
 452       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 453          these multiplications don't overflow.  */
 454       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 455       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 456       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 457                                    nitems_total, compare_factor);
 458       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 459                                   nitems_step, iv_factor);
 460       if (nitems_skip)
 461         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 462                                     nitems_skip, compare_factor);
 463     }
 464
 465   /* Create an induction variable that counts the number of items
 466      processed.  */
 467   tree index_before_incr, index_after_incr;
 468   gimple_stmt_iterator incr_gsi;
 469   bool insert_after;
 470   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 471   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
 472              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 473
       /* Choose which IV value is tested (TEST_INDEX), the limit it is tested
          against on the latch (TEST_LIMIT), the limit that applies to the
          first iteration (FIRST_LIMIT) and where the test statements are
          inserted (TEST_GSI).  */
 474   tree zero_index = build_int_cst (compare_type, 0);
 475   tree test_index, test_limit, first_limit;
 476   gimple_stmt_iterator *test_gsi;
 477   if (might_wrap_p)
 478     {
 479       /* In principle the loop should stop iterating once the incremented
 480          IV reaches a value greater than or equal to:
 481
 482            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 483
 484          However, there's no guarantee that this addition doesn't overflow
 485          the comparison type, or that the IV hits a value above it before
 486          wrapping around.  We therefore adjust the limit down by one
 487          IV step:
 488
 489            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 490            -[infinite-prec] NITEMS_STEP
 491
 492          and compare the IV against this limit _before_ incrementing it.
 493          Since the comparison type is unsigned, we actually want the
 494          subtraction to saturate at zero:
 495
 496            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 497            -[sat] NITEMS_STEP
 498
 499          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 500
 501            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 502
 503          where the rightmost subtraction can be done directly in
 504          COMPARE_TYPE.  */
 505       test_index = index_before_incr;
 506       tree adjust = gimple_convert (preheader_seq, compare_type,
 507                                     nitems_step);
 508       if (nitems_skip)
 509         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 510                                adjust, nitems_skip);
           /* The saturating subtraction is open-coded as
              MAX (NITEMS_TOTAL, ADJUST) - ADJUST.  */
 511       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 512                                  nitems_total, adjust);
 513       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 514                                  test_limit, adjust);
 515       test_gsi = &incr_gsi;
 516
 517       /* Get a safe limit for the first iteration.  */
 518       if (nitems_skip)
 519         {
 520           /* The first vector iteration can handle at most NITEMS_STEP
 521              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 522              NITEMS_SKIP to that cannot overflow.  */
 523           tree const_limit = build_int_cst (compare_type,
 524                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 525                                             * nitems_per_iter);
 526           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 527                                       nitems_total, const_limit);
 528           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 529                                       first_limit, nitems_skip);
 530         }
 531       else
 532         /* For the first iteration it doesn't matter whether the IV hits
 533            a value above NITEMS_TOTAL.  That only matters for the latch
 534            condition.  */
 535         first_limit = nitems_total;
 536     }
 537   else
 538     {
 539       /* Test the incremented IV, which will always hit a value above
 540          the bound before wrapping.  */
 541       test_index = index_after_incr;
 542       test_limit = nitems_total;
 543       if (nitems_skip)
 544         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 545                                    test_limit, nitems_skip);
 546       test_gsi = &loop_cond_gsi;
 547
 548       first_limit = test_limit;
 549     }
 550
 551   /* Convert the IV value to the comparison type (either a no-op or
 552      a demotion).  */
 553   gimple_seq test_seq = NULL;
 554   test_index = gimple_convert (&test_seq, compare_type, test_index);
 555   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 556
 557   /* Provide a definition of each control in the group.  */
       /* The controls are visited in reverse order, so the NEXT_CTRL that is
          left over and returned at the end is the one computed for the
          control visited last.  */
 558   tree next_ctrl = NULL_TREE;
 559   tree ctrl;
 560   unsigned int i;
 561   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 562     {
 563       /* Previous controls will cover BIAS items.  This control covers the
 564          next batch.  */
 565       poly_uint64 bias = nitems_per_ctrl * i;
 566       tree bias_tree = build_int_cst (compare_type, bias);
 567
 568       /* See whether the first iteration of the vector loop is known
 569          to have a full control.  */
 570       poly_uint64 const_limit;
 571       bool first_iteration_full
 572         = (poly_int_tree_p (first_limit, &const_limit)
 573            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 574
 575       /* Rather than have a new IV that starts at BIAS and goes up to
 576          TEST_LIMIT, prefer to use the same 0-based IV for each control
 577          and adjust the bound down by BIAS.  */
 578       tree this_test_limit = test_limit;
 579       if (i != 0)
 580         {
               /* MAX (LIMIT, BIAS) - BIAS, so that the adjusted bound does
                  not go below zero.  */
 581           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 582                                           compare_type, this_test_limit,
 583                                           bias_tree);
 584           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 585                                           compare_type, this_test_limit,
 586                                           bias_tree);
 587         }
 588
 589       /* Create the initial control.  First include all items that
 590          are within the loop limit.  */
 591       tree init_ctrl = NULL_TREE;
 592       if (!first_iteration_full)
 593         {
 594           tree start, end;
 595           if (first_limit == test_limit)
 596             {
 597               /* Use a natural test between zero (the initial IV value)
 598                  and the loop limit.  The "else" block would be valid too,
 599                  but this choice can avoid the need to load BIAS_TREE into
 600                  a register.  */
 601               start = zero_index;
 602               end = this_test_limit;
 603             }
 604           else
 605             {
 606               /* FIRST_LIMIT is the maximum number of items handled by the
 607                  first iteration of the vector loop.  Test the portion
 608                  associated with this control.  */
 609               start = bias_tree;
 610               end = first_limit;
 611             }
 612
 613           if (use_masks_p)
 614             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 615                                         start, end, "max_mask");
 616           else
 617             {
 618               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 619               gimple_seq seq = vect_gen_len (init_ctrl, start,
 620                                              end, length_limit);
 621               gimple_seq_add_seq (preheader_seq, seq);
 622             }
 623         }
 624
 625       /* Now AND out the bits that are within the number of skipped
 626          items.  */
           /* No masking is needed when NITEMS_SKIP is a constant that is
              known not to exceed BIAS.  Skipping is only supported for
              masks, hence the assertion.  */
 627       poly_uint64 const_skip;
 628       if (nitems_skip
 629           && !(poly_int_tree_p (nitems_skip, &const_skip)
 630                && known_le (const_skip, bias)))
 631         {
 632           gcc_assert (use_masks_p);
 633           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 634                                                     bias_tree, nitems_skip);
 635           if (init_ctrl)
 636             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 637                                       init_ctrl, unskipped_mask);
 638           else
 639             init_ctrl = unskipped_mask;
 640         }
 641
 642       if (!init_ctrl)
 643         {
 644           /* First iteration is full.  */
 645           if (use_masks_p)
 646             init_ctrl = build_minus_one_cst (ctrl_type);
 647           else
 648             init_ctrl = length_limit;
 649         }
 650
 651       /* Get the control value for the next iteration of the loop.  */
 652       if (use_masks_p)
 653         {
 654           gimple_seq stmts = NULL;
 655           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 656                                       this_test_limit, "next_mask");
 657           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 658         }
 659       else
 660         {
 661           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 662           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 663                                          length_limit);
 664           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 665         }
 666
 667       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 668     }
 669
       /* With a nonzero partial load/store bias, define
          RGC->bias_adjusted_ctrl in the loop header as the first control
          plus the bias.  Note that the statement is an addition even though
          the local that holds it is called MINUS.  */
 670   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 671   if (partial_load_bias != 0)
 672     {
 673       tree adjusted_len = rgc->bias_adjusted_ctrl;
 674       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
 675                                             rgc->controls[0],
 676                                             build_int_cst
 677                                             (TREE_TYPE (rgc->controls[0]),
 678                                              partial_load_bias));
 679       gimple_seq_add_stmt (header_seq, minus);
 680     }
 681
 682   return next_ctrl;
 683 }
 684
 685 /* Set up the iteration condition and rgroup controls for LOOP, given
 686    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 687    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 688    the number of iterations of the original scalar loop that should be
 689    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 690    for vect_set_loop_condition.
 691
 692    Insert the branch-back condition before LOOP_COND_GSI and return the
 693    final gcond.  As a side effect, set LOOP->nb_iterations to the latch
        count and, if FINAL_IV is nonnull, assign the unconverted NITERS to
        it on the loop's single exit edge.  */
 694
 695 static gcond *
 696 vect_set_loop_condition_partial_vectors (class loop *loop,
 697                                          loop_vec_info loop_vinfo, tree niters,
 698                                          tree final_iv, bool niters_maybe_zero,
 699                                          gimple_stmt_iterator loop_cond_gsi)
 700 {
 701   gimple_seq preheader_seq = NULL;
 702   gimple_seq header_seq = NULL;
 703
 704   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 705   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 706   unsigned int compare_precision = TYPE_PRECISION (compare_type);
       /* Remember the caller's NITERS; it is what FINAL_IV is set to.  */
 707   tree orig_niters = niters;
 708
 709   /* Type of the initial value of NITERS.  */
 710   tree ni_actual_type = TREE_TYPE (niters);
 711   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 712   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 713
 714   /* Convert NITERS to the same size as the compare.  */
 715   if (compare_precision > ni_actual_precision
 716       && niters_maybe_zero)
 717     {
 718       /* We know that there is always at least one iteration, so if the
 719          count is zero then it must have wrapped.  Cope with this by
 720          subtracting 1 before the conversion and adding 1 to the result.  */
 721       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 722       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 723                              niters, build_minus_one_cst (ni_actual_type));
 724       niters = gimple_convert (&preheader_seq, compare_type, niters);
 725       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 726                              niters, build_one_cst (compare_type));
 727     }
 728   else
 729     niters = gimple_convert (&preheader_seq, compare_type, niters);
 730
 731   /* Iterate over all the rgroups and fill in their controls.  We could use
 732      the first control from any rgroup for the loop condition; here we
 733      arbitrarily pick the last.  */
       /* TEST_CTRL is only updated by rgroups that are set up directly;
          rgroups that are defined by permutes leave it unchanged.  */
 734   tree test_ctrl = NULL_TREE;
 735   rgroup_controls *rgc;
 736   unsigned int i;
 737   auto_vec<rgroup_controls> *controls = use_masks_p
 738                                           ? &LOOP_VINFO_MASKS (loop_vinfo)
 739                                           : &LOOP_VINFO_LENS (loop_vinfo);
 740   FOR_EACH_VEC_ELT (*controls, i, rgc)
 741     if (!rgc->controls.is_empty ())
 742       {
 743         /* First try using permutes.  This adds a single vector
 744            instruction to the loop for each mask, but needs no extra
 745            loop invariants or IVs.  */
             /* NOTE(review): this presumably relies on the rgroup at index I
                holding I + 1 controls, so that index NMASKS / 2 - 1 is the
                rgroup with half as many; confirm against the layout of
                LOOP_VINFO_MASKS.  */
 746         unsigned int nmasks = i + 1;
 747         if (use_masks_p && (nmasks & 1) == 0)
 748           {
 749             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 750             if (!half_rgc->controls.is_empty ()
 751                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 752               continue;
 753           }
 754
 755         /* See whether zero-based IV would ever generate all-false masks
 756            or zero length before wrapping around.  */
 757         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 758
 759         /* Set up all controls for this group.  */
 760         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 761                                                      &preheader_seq,
 762                                                      &header_seq,
 763                                                      loop_cond_gsi, rgc,
 764                                                      niters, niters_skip,
 765                                                      might_wrap_p);
 766       }
 767
 768   /* Emit all accumulated statements.  */
 769   add_preheader_seq (loop, preheader_seq);
 770   add_header_seq (loop, header_seq);
 771
 772   /* Get a boolean result that tells us whether to iterate.  */
       /* The exit is taken when TEST_CTRL compares equal to zero, so the
          condition is EQ if the exit edge is the true edge and NE otherwise.
          NOTE(review): TEST_CTRL is dereferenced here without a null check,
          which relies on at least one rgroup having been set up directly.  */
 773   edge exit_edge = single_exit (loop);
 774   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 775   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 776   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
 777                                         NULL_TREE, NULL_TREE);
 778   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 779
 780   /* The loop iterates (NITERS - 1) / VF + 1 times.
 781      Subtract one from this to get the latch count.  */
 782   tree step = build_int_cst (compare_type,
 783                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 784   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 785                                        build_minus_one_cst (compare_type));
 786   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 787                                      niters_minus_one, step);
 788
 789   if (final_iv)
 790     {
           /* FINAL_IV receives the caller's original NITERS, not the value
              converted to COMPARE_TYPE.  */
 791       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 792       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 793     }
 794
 795   return cond_stmt;
 796 }
 797
 798 /* Like vect_set_loop_condition, but handle the case in which the vector
 799    loop handles exactly VF scalars per iteration.  */
 800
 801 static gcond *
 802 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
 803                                 tree final_iv, bool niters_maybe_zero,
 804                                 gimple_stmt_iterator loop_cond_gsi)
 805 {
 806   tree indx_before_incr, indx_after_incr;
 807   gcond *cond_stmt;
 808   gcond *orig_cond;
 809   edge pe = loop_preheader_edge (loop);
 810   edge exit_edge = single_exit (loop);
 811   gimple_stmt_iterator incr_gsi;
 812   bool insert_after;
 813   enum tree_code code;
 814   tree niters_type = TREE_TYPE (niters);
 815
 816   orig_cond = get_loop_exit_condition (loop);
 817   gcc_assert (orig_cond);
 818   loop_cond_gsi = gsi_for_stmt (orig_cond);
 819
 820   tree init, limit;
 821   if (!niters_maybe_zero && integer_onep (step))
 822     {
 823       /* In this case we can use a simple 0-based IV:
 824
 825          A:
 826            x = 0;
 827            do
 828              {
 829                ...
 830                x += 1;
 831              }
 832            while (x < NITERS);  */
 833       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 834       init = build_zero_cst (niters_type);
 835       limit = niters;
 836     }
 837   else
 838     {
 839       /* The following works for all values of NITERS except 0:
 840
 841          B:
 842            x = 0;
 843            do
 844              {
 845                ...
 846                x += STEP;
 847              }
 848            while (x <= NITERS - STEP);
 849
 850          so that the loop continues to iterate if x + STEP - 1 < NITERS
 851          but stops if x + STEP - 1 >= NITERS.
 852
 853          However, if NITERS is zero, x never hits a value above NITERS - STEP
 854          before wrapping around.  There are two obvious ways of dealing with
 855          this:
 856
 857          - start at STEP - 1 and compare x before incrementing it
 858          - start at -1 and compare x after incrementing it
 859
 860          The latter is simpler and is what we use.  The loop in this case
 861          looks like:
 862
 863          C:
 864            x = -1;
 865            do
 866              {
 867                ...
 868                x += STEP;
 869              }
 870            while (x < NITERS - STEP);
 871
 872          In both cases the loop limit is NITERS - STEP.  */
 873       gimple_seq seq = NULL;
 874       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 875       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 876       if (seq)
 877         {
 878           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 879           gcc_assert (!new_bb);
 880         }
 881       if (niters_maybe_zero)
 882         {
 883           /* Case C.  */
 884           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 885           init = build_all_ones_cst (niters_type);
 886         }
 887       else
 888         {
 889           /* Case B.  */
 890           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 891           init = build_zero_cst (niters_type);
 892         }
 893     }
 894
 895   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 896   create_iv (init, step, NULL_TREE, loop,
 897              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 898   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 899                                               true, NULL_TREE, true,
 900                                               GSI_SAME_STMT);
 901   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 902                                      true, GSI_SAME_STMT);
 903
 904   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 905                                  NULL_TREE);
 906
 907   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 908
 909   /* Record the number of latch iterations.  */
 910   if (limit == niters)
 911     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 912        latch count.  */
 913     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 914                                        build_int_cst (niters_type, 1));
 915   else
 916     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 917        Subtract one from this to get the latch count.  */
 918     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 919                                        limit, step);
 920
 921   if (final_iv)
 922     {
 923       gassign *assign;
 924       edge exit = single_exit (loop);
 925       gcc_assert (single_pred_p (exit->dest));
 926       tree phi_dest
 927         = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
 928       /* Make sure to maintain LC SSA form here and elide the subtraction
 929          if the value is zero.  */
 930       gphi *phi = create_phi_node (phi_dest, exit->dest);
 931       add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
 932       if (!integer_zerop (init))
 933         {
 934           assign = gimple_build_assign (final_iv, MINUS_EXPR,
 935                                         phi_dest, init);
 936           gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
 937           gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
 938         }
 939     }
 940
 941   return cond_stmt;
 942 }
 943
 944 /* If we're using fully-masked loops, make LOOP iterate:
 945
 946       N == (NITERS - 1) / STEP + 1
 947
 948    times.  When NITERS is zero, this is equivalent to making the loop
 949    execute (1 << M) / STEP times, where M is the precision of NITERS.
 950    NITERS_MAYBE_ZERO is true if this last case might occur.
 951
 952    If we're not using fully-masked loops, make LOOP iterate:
 953
 954       N == (NITERS - STEP) / STEP + 1
 955
 956    times, where NITERS is known to be outside the range [1, STEP - 1].
 957    This is equivalent to making the loop execute NITERS / STEP times
 958    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 959    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 960
 961    If FINAL_IV is nonnull, it is an SSA name that should be set to
 962    N * STEP on exit from the loop.
 963
 964    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 965
 966 void
 967 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 968                          tree niters, tree step, tree final_iv,
 969                          bool niters_maybe_zero)
 970 {
 971   gcond *cond_stmt;
 972   gcond *orig_cond = get_loop_exit_condition (loop);
 973   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 974
 975   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
 976     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
 977                                                          niters, final_iv,
 978                                                          niters_maybe_zero,
 979                                                          loop_cond_gsi);
 980   else
 981     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
 982                                                 niters_maybe_zero,
 983                                                 loop_cond_gsi);
 984
 985   /* Remove old loop exit test.  */
 986   stmt_vec_info orig_cond_info;
 987   if (loop_vinfo
 988       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 989     loop_vinfo->remove_stmt (orig_cond_info);
 990   else
 991     gsi_remove (&loop_cond_gsi, true);
 992
 993   if (dump_enabled_p ())
 994     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 995                      (gimple *) cond_stmt);
 996 }
 997
 998 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 999    For all PHI arguments in FROM->dest and TO->dest from those
1000    edges ensure that TO->dest PHI arguments have current_def
1001    to that in from.  */
1002
1003 static void
1004 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1005 {
1006   gimple_stmt_iterator gsi_from, gsi_to;
1007
1008   for (gsi_from = gsi_start_phis (from->dest),
1009        gsi_to = gsi_start_phis (to->dest);
1010        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1011     {
1012       gimple *from_phi = gsi_stmt (gsi_from);
1013       gimple *to_phi = gsi_stmt (gsi_to);
1014       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1015       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1016       if (virtual_operand_p (from_arg))
1017         {
1018           gsi_next (&gsi_from);
1019           continue;
1020         }
1021       if (virtual_operand_p (to_arg))
1022         {
1023           gsi_next (&gsi_to);
1024           continue;
1025         }
1026       if (TREE_CODE (from_arg) != SSA_NAME)
1027         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1028       else if (TREE_CODE (to_arg) == SSA_NAME
1029                && from_arg != to_arg)
1030         {
1031           if (get_current_def (to_arg) == NULL_TREE)
1032             {
1033               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1034                                               TREE_TYPE (get_current_def
1035                                                            (from_arg))));
1036               set_current_def (to_arg, get_current_def (from_arg));
1037             }
1038         }
1039       gsi_next (&gsi_from);
1040       gsi_next (&gsi_to);
1041     }
1042
1043   gphi *from_phi = get_virtual_phi (from->dest);
1044   gphi *to_phi = get_virtual_phi (to->dest);
1045   if (from_phi)
1046     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1047                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1048 }
1049
1050
1051 /* Given LOOP this function generates a new copy of it and puts it
1052    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1053    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1054    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1055    entry or exit of LOOP.  */
1056
1057 class loop *
1058 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1059                                         class loop *scalar_loop, edge e)
1060 {
1061   class loop *new_loop;
1062   basic_block *new_bbs, *bbs, *pbbs;
1063   bool at_exit;
1064   bool was_imm_dom;
1065   basic_block exit_dest;
1066   edge exit, new_exit;
1067   bool duplicate_outer_loop = false;
1068
1069   exit = single_exit (loop);
1070   at_exit = (e == exit);
1071   if (!at_exit && e != loop_preheader_edge (loop))
1072     return NULL;
1073
1074   if (scalar_loop == NULL)
1075     scalar_loop = loop;
1076
1077   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078   pbbs = bbs + 1;
1079   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1080   /* Allow duplication of outer loops.  */
1081   if (scalar_loop->inner)
1082     duplicate_outer_loop = true;
1083   /* Check whether duplication is possible.  */
1084   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1085     {
1086       free (bbs);
1087       return NULL;
1088     }
1089
1090   /* Generate new loop structure.  */
1091   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1092   duplicate_subloops (scalar_loop, new_loop);
1093
1094   exit_dest = exit->dest;
1095   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1096                                           exit_dest) == loop->header ?
1097                  true : false);
1098
1099   /* Also copy the pre-header, this avoids jumping through hoops to
1100      duplicate the loop entry PHI arguments.  Create an empty
1101      pre-header unconditionally for this.  */
1102   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1103   edge entry_e = single_pred_edge (preheader);
1104   bbs[0] = preheader;
1105   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1106
1107   exit = single_exit (scalar_loop);
1108   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1109             &exit, 1, &new_exit, NULL,
1110             at_exit ? loop->latch : e->src, true);
1111   exit = single_exit (loop);
1112   basic_block new_preheader = new_bbs[0];
1113
1114   /* Before installing PHI arguments make sure that the edges
1115      into them match that of the scalar loop we analyzed.  This
1116      makes sure the SLP tree matches up between the main vectorized
1117      loop and the epilogue vectorized copies.  */
1118   if (single_succ_edge (preheader)->dest_idx
1119       != single_succ_edge (new_bbs[0])->dest_idx)
1120     {
1121       basic_block swap_bb = new_bbs[1];
1122       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1123       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1124       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1125       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1126     }
1127   if (duplicate_outer_loop)
1128     {
1129       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1130       if (loop_preheader_edge (scalar_loop)->dest_idx
1131           != loop_preheader_edge (new_inner_loop)->dest_idx)
1132         {
1133           basic_block swap_bb = new_inner_loop->header;
1134           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1135           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1136           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1137           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1138         }
1139     }
1140
1141   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1142
1143   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1144   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1145     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1146
1147   if (scalar_loop != loop)
1148     {
1149       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1150          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1151          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1152          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1153          header) to have current_def set, so copy them over.  */
1154       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1155                                                 exit);
1156       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1157                                                            0),
1158                                                 EDGE_SUCC (loop->latch, 0));
1159     }
1160
1161   if (at_exit) /* Add the loop copy at exit.  */
1162     {
1163       if (scalar_loop != loop)
1164         {
1165           gphi_iterator gsi;
1166           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1167
1168           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1169                gsi_next (&gsi))
1170             {
1171               gphi *phi = gsi.phi ();
1172               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1173               location_t orig_locus
1174                 = gimple_phi_arg_location_from_edge (phi, e);
1175
1176               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1177             }
1178         }
1179       redirect_edge_and_branch_force (e, new_preheader);
1180       flush_pending_stmts (e);
1181       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1182       if (was_imm_dom || duplicate_outer_loop)
1183         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1184
1185       /* And remove the non-necessary forwarder again.  Keep the other
1186          one so we have a proper pre-header for the loop at the exit edge.  */
1187       redirect_edge_pred (single_succ_edge (preheader),
1188                           single_pred (preheader));
1189       delete_basic_block (preheader);
1190       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1191                                loop_preheader_edge (scalar_loop)->src);
1192     }
1193   else /* Add the copy at entry.  */
1194     {
1195       if (scalar_loop != loop)
1196         {
1197           /* Remove the non-necessary forwarder of scalar_loop again.  */
1198           redirect_edge_pred (single_succ_edge (preheader),
1199                               single_pred (preheader));
1200           delete_basic_block (preheader);
1201           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1202                                    loop_preheader_edge (scalar_loop)->src);
1203           preheader = split_edge (loop_preheader_edge (loop));
1204           entry_e = single_pred_edge (preheader);
1205         }
1206
1207       redirect_edge_and_branch_force (entry_e, new_preheader);
1208       flush_pending_stmts (entry_e);
1209       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1210
1211       redirect_edge_and_branch_force (new_exit, preheader);
1212       flush_pending_stmts (new_exit);
1213       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1214
1215       /* And remove the non-necessary forwarder again.  Keep the other
1216          one so we have a proper pre-header for the loop at the exit edge.  */
1217       redirect_edge_pred (single_succ_edge (new_preheader),
1218                           single_pred (new_preheader));
1219       delete_basic_block (new_preheader);
1220       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1221                                loop_preheader_edge (new_loop)->src);
1222     }
1223
1224   if (scalar_loop != loop)
1225     {
1226       /* Update new_loop->header PHIs, so that on the preheader
1227          edge they are the ones from loop rather than scalar_loop.  */
1228       gphi_iterator gsi_orig, gsi_new;
1229       edge orig_e = loop_preheader_edge (loop);
1230       edge new_e = loop_preheader_edge (new_loop);
1231
1232       for (gsi_orig = gsi_start_phis (loop->header),
1233            gsi_new = gsi_start_phis (new_loop->header);
1234            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1235            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1236         {
1237           gphi *orig_phi = gsi_orig.phi ();
1238           gphi *new_phi = gsi_new.phi ();
1239           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1240           location_t orig_locus
1241             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1242
1243           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1244         }
1245     }
1246
1247   free (new_bbs);
1248   free (bbs);
1249
1250   checking_verify_dominators (CDI_DOMINATORS);
1251
1252   return new_loop;
1253 }
1254
1255
1256 /* Given the condition expression COND, put it as the last statement of
1257    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1258    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1259    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1260    new edge as irreducible if IRREDUCIBLE_P is true.  */
1261
1262 static edge
1263 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1264                        basic_block guard_to, basic_block dom_bb,
1265                        profile_probability probability, bool irreducible_p)
1266 {
1267   gimple_stmt_iterator gsi;
1268   edge new_e, enter_e;
1269   gcond *cond_stmt;
1270   gimple_seq gimplify_stmt_list = NULL;
1271
1272   enter_e = EDGE_SUCC (guard_bb, 0);
1273   enter_e->flags &= ~EDGE_FALLTHRU;
1274   enter_e->flags |= EDGE_FALSE_VALUE;
1275   gsi = gsi_last_bb (guard_bb);
1276
1277   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1278                                  is_gimple_condexpr_for_cond, NULL_TREE);
1279   if (gimplify_stmt_list)
1280     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1281
1282   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1283   gsi = gsi_last_bb (guard_bb);
1284   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1285
1286   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1287   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1288
1289   new_e->probability = probability;
1290   if (irreducible_p)
1291     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1292
1293   enter_e->probability = probability.invert ();
1294   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1295
1296   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1297   if (enter_e->dest->loop_father->header == enter_e->dest)
1298     split_edge (enter_e);
1299
1300   return new_e;
1301 }
1302
1303
1304 /* This function verifies that the following restrictions apply to LOOP:
1305    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1306        for innermost loop and 5 basic blocks for outer-loop.
1307    (2) it is single entry, single exit
1308    (3) its exit condition is the last stmt in the header
1309    (4) E is the entry/exit edge of LOOP.
1310  */
1311
1312 bool
1313 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1314 {
1315   edge exit_e = single_exit (loop);
1316   edge entry_e = loop_preheader_edge (loop);
1317   gcond *orig_cond = get_loop_exit_condition (loop);
1318   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1319   unsigned int num_bb = loop->inner? 5 : 2;
1320
1321   /* All loops have an outer scope; the only case loop->outer is NULL is for
1322      the function itself.  */
1323   if (!loop_outer (loop)
1324       || loop->num_nodes != num_bb
1325       || !empty_block_p (loop->latch)
1326       || !single_exit (loop)
1327       /* Verify that new loop exit condition can be trivially modified.  */
1328       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1329       || (e != exit_e && e != entry_e))
1330     return false;
1331
1332   return true;
1333 }
1334
1335 /* Function vect_get_loop_location.
1336
1337    Extract the location of the loop in the source code.
1338    If the loop is not well formed for vectorization, an estimated
1339    location is calculated.
1340    Return the loop location if succeed and NULL if not.  */
1341
1342 dump_user_location_t
1343 find_loop_location (class loop *loop)
1344 {
1345   gimple *stmt = NULL;
1346   basic_block bb;
1347   gimple_stmt_iterator si;
1348
1349   if (!loop)
1350     return dump_user_location_t ();
1351
1352   stmt = get_loop_exit_condition (loop);
1353
1354   if (stmt
1355       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1356     return stmt;
1357
1358   /* If we got here the loop is probably not "well formed",
1359      try to estimate the loop location */
1360
1361   if (!loop->header)
1362     return dump_user_location_t ();
1363
1364   bb = loop->header;
1365
1366   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1367     {
1368       stmt = gsi_stmt (si);
1369       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1370         return stmt;
1371     }
1372
1373   return dump_user_location_t ();
1374 }
1375
1376 /* Return true if the phi described by STMT_INFO defines an IV of the
1377    loop to be vectorized.  */
1378
1379 static bool
1380 iv_phi_p (stmt_vec_info stmt_info)
1381 {
1382   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1383   if (virtual_operand_p (PHI_RESULT (phi)))
1384     return false;
1385
1386   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1387       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1388     return false;
1389
1390   return true;
1391 }
1392
1393 /* Function vect_can_advance_ivs_p
1394
1395    In case the number of iterations that LOOP iterates is unknown at compile
1396    time, an epilog loop will be generated, and the loop induction variables
1397    (IVs) will be "advanced" to the value they are supposed to take just before
1398    the epilog loop.  Here we check that the access function of the loop IVs
1399    and the expression that represents the loop bound are simple enough.
1400    These restrictions will be relaxed in the future.  */
1401
1402 bool
1403 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1404 {
1405   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1406   basic_block bb = loop->header;
1407   gphi_iterator gsi;
1408
1409   /* Analyze phi functions of the loop header.  */
1410
1411   if (dump_enabled_p ())
1412     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1413   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1414     {
1415       tree evolution_part;
1416       enum vect_induction_op_type induction_type;
1417
1418       gphi *phi = gsi.phi ();
1419       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1420       if (dump_enabled_p ())
1421         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1422                          phi_info->stmt);
1423
1424       /* Skip virtual phi's. The data dependences that are associated with
1425          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1426
1427          Skip reduction phis.  */
1428       if (!iv_phi_p (phi_info))
1429         {
1430           if (dump_enabled_p ())
1431             dump_printf_loc (MSG_NOTE, vect_location,
1432                              "reduc or virtual phi. skip.\n");
1433           continue;
1434         }
1435
1436       induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1437       if (induction_type != vect_step_op_add)
1438         {
1439           if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
1440             return false;
1441
1442           continue;
1443         }
1444
1445       /* Analyze the evolution function.  */
1446
1447       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1448       if (evolution_part == NULL_TREE)
1449         {
1450           if (dump_enabled_p ())
1451             dump_printf (MSG_MISSED_OPTIMIZATION,
1452                          "No access function or evolution.\n");
1453           return false;
1454         }
1455
1456       /* FORNOW: We do not transform initial conditions of IVs
1457          which evolution functions are not invariants in the loop.  */
1458
1459       if (!expr_invariant_in_loop_p (loop, evolution_part))
1460         {
1461           if (dump_enabled_p ())
1462             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463                              "evolution not invariant in loop.\n");
1464           return false;
1465         }
1466
1467       /* FORNOW: We do not transform initial conditions of IVs
1468          which evolution functions are a polynomial of degree >= 2.  */
1469
1470       if (tree_is_chrec (evolution_part))
1471         {
1472           if (dump_enabled_p ())
1473             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474                              "evolution is chrec.\n");
1475           return false;
1476         }
1477     }
1478
1479   return true;
1480 }
1481
1482
1483 /*   Function vect_update_ivs_after_vectorizer.
1484
1485      "Advance" the induction variables of LOOP to the value they should take
1486      after the execution of LOOP.  This is currently necessary because the
1487      vectorizer does not handle induction variables that are used after the
1488      loop.  Such a situation occurs when the last iterations of LOOP are
1489      peeled, because:
1490      1. We introduced new uses after LOOP for IVs that were not originally used
1491         after LOOP: the IVs of LOOP are now used by an epilog loop.
1492      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1493         times, whereas the loop IVs should be bumped N times.
1494
1495      Input:
1496      - LOOP - a loop that is going to be vectorized. The last few iterations
1497               of LOOP were peeled.
1498      - NITERS - the number of iterations that LOOP executes (before it is
1499                 vectorized). i.e, the number of times the ivs should be bumped.
1500      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1501                   coming out from LOOP on which there are uses of the LOOP ivs
1502                   (this is the path from LOOP->exit to epilog_loop->preheader).
1503
1504                   The new definitions of the ivs are placed in LOOP->exit.
1505                   The phi args associated with the edge UPDATE_E in the bb
1506                   UPDATE_E->dest are updated accordingly.
1507
1508      Assumption 1: Like the rest of the vectorizer, this function assumes
1509      a single loop exit that has a single predecessor.
1510
1511      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1512      organized in the same order.
1513
1514      Assumption 3: The access function of the ivs is simple enough (see
1515      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1516
1517      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1518      coming out of LOOP on which the ivs of LOOP are used (this is the path
1519      that leads to the epilog loop; other paths skip the epilog loop).  This
1520      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1521      needs to have its phis updated.
1522  */
1523
1524 static void
1525 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1526                                   tree niters, edge update_e)
1527 {
1528   gphi_iterator gsi, gsi1;
1529   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1530   basic_block update_bb = update_e->dest;
1531   basic_block exit_bb = single_exit (loop)->dest;
1532
1533   /* Make sure there exists a single-predecessor exit bb:  */
1534   gcc_assert (single_pred_p (exit_bb));
1535   gcc_assert (single_succ_edge (exit_bb) == update_e);
1536
1537   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1538        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1539        gsi_next (&gsi), gsi_next (&gsi1))
1540     {
1541       tree init_expr;
1542       tree step_expr, off;
1543       tree type;
1544       tree var, ni, ni_name;
1545       gimple_stmt_iterator last_gsi;
1546
1547       gphi *phi = gsi.phi ();
1548       gphi *phi1 = gsi1.phi ();
1549       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1550       if (dump_enabled_p ())
1551         dump_printf_loc (MSG_NOTE, vect_location,
1552                          "vect_update_ivs_after_vectorizer: phi: %G",
1553                          (gimple *) phi);
1554
1555       /* Skip reduction and virtual phis.  */
1556       if (!iv_phi_p (phi_info))
1557         {
1558           if (dump_enabled_p ())
1559             dump_printf_loc (MSG_NOTE, vect_location,
1560                              "reduc or virtual phi. skip.\n");
1561           continue;
1562         }
1563
1564       type = TREE_TYPE (gimple_phi_result (phi));
1565       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1566       step_expr = unshare_expr (step_expr);
1567
1568       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1569          of degree >= 2 or exponential.  */
1570       gcc_assert (!tree_is_chrec (step_expr));
1571
1572       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1573       gimple_seq stmts = NULL;
1574       enum vect_induction_op_type induction_type
1575         = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1576
1577       if (induction_type == vect_step_op_add)
1578         {
1579           off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1580                              fold_convert (TREE_TYPE (step_expr), niters),
1581                              step_expr);
1582           if (POINTER_TYPE_P (type))
1583             ni = fold_build_pointer_plus (init_expr, off);
1584           else
1585             ni = fold_build2 (PLUS_EXPR, type,
1586                               init_expr, fold_convert (type, off));
1587         }
1588       /* Don't bother call vect_peel_nonlinear_iv_init.  */
1589       else if (induction_type == vect_step_op_neg)
1590         ni = init_expr;
1591       else
1592         ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
1593                                           niters, step_expr,
1594                                           induction_type);
1595
1596       var = create_tmp_var (type, "tmp");
1597
1598       last_gsi = gsi_last_bb (exit_bb);
1599       gimple_seq new_stmts = NULL;
1600       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1601       /* Exit_bb shouldn't be empty.  */
1602       if (!gsi_end_p (last_gsi))
1603         {
1604           gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
1605           gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1606         }
1607       else
1608         {
1609           gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
1610           gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1611         }
1612
1613       /* Fix phi expressions in the successor bb.  */
1614       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1615     }
1616 }
1617
1618 /* Return a gimple value containing the misalignment (measured in vector
1619    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1620    it is away from a perfectly aligned address.  Add any new statements
1621    to SEQ.  */
1622
1623 static tree
1624 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1625 {
1626   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1627   stmt_vec_info stmt_info = dr_info->stmt;
1628   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1629
1630   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1631   unsigned HOST_WIDE_INT target_align_c;
1632   tree target_align_minus_1;
1633
1634   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1635                                         size_zero_node) < 0;
1636   tree offset = (negative
1637                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1638                              * TREE_INT_CST_LOW
1639                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1640                  : size_zero_node);
1641   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1642                                                           stmt_info, seq,
1643                                                           offset);
1644   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1645   if (target_align.is_constant (&target_align_c))
1646     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1647   else
1648     {
1649       tree vla = build_int_cst (type, target_align);
1650       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1651                                     fold_build2 (MINUS_EXPR, type,
1652                                                  build_int_cst (type, 0), vla));
1653       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1654                                           build_int_cst (type, 1));
1655     }
1656
1657   HOST_WIDE_INT elem_size
1658     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1659   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1660
1661   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1662   tree int_start_addr = fold_convert (type, start_addr);
1663   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1664                                         target_align_minus_1);
1665
1666   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1667   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1668                                         elem_size_log);
1669
1670   return misalign_in_elems;
1671 }
1672
1673 /* Function vect_gen_prolog_loop_niters
1674
1675    Generate the number of iterations which should be peeled as prolog for the
1676    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1677    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1678    As a result, after the execution of this loop, the data reference DR will
1679    refer to an aligned location.  The following computation is generated:
1680
1681    If the misalignment of DR is known at compile time:
1682      addr_mis = int mis = DR_MISALIGNMENT (dr);
1683    Else, compute address misalignment in bytes:
1684      addr_mis = addr & (target_align - 1)
1685
1686    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1687
1688    (elem_size = element type size; an element is the scalar element whose type
1689    is the inner type of the vectype)
1690
1691    The computations will be emitted at the end of BB.  We also compute and
1692    store upper bound (included) of the result in BOUND.
1693
1694    When the step of the data-ref in the loop is not 1 (as in interleaved data
1695    and SLP), the number of iterations of the prolog must be divided by the step
1696    (which is equal to the size of interleaved group).
1697
1698    The above formulas assume that VF == number of elements in the vector. This
1699    may not hold when there are multiple-types in the loop.
1700    In this case, for some data-references in the loop the VF does not represent
1701    the number of elements that fit in the vector.  Therefore, instead of VF we
1702    use TYPE_VECTOR_SUBPARTS.  */
1703
1704 static tree
1705 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1706                              basic_block bb, int *bound)
1707 {
1708   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1709   tree var;
1710   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1711   gimple_seq stmts = NULL, new_stmts = NULL;
1712   tree iters, iters_name;
1713   stmt_vec_info stmt_info = dr_info->stmt;
1714   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1715   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1716
1717   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1718     {
1719       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1720
1721       if (dump_enabled_p ())
1722         dump_printf_loc (MSG_NOTE, vect_location,
1723                          "known peeling = %d.\n", npeel);
1724
1725       iters = build_int_cst (niters_type, npeel);
1726       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1727     }
1728   else
1729     {
1730       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1731       tree type = TREE_TYPE (misalign_in_elems);
1732       HOST_WIDE_INT elem_size
1733         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1734       /* We only do prolog peeling if the target alignment is known at compile
1735          time.  */
1736       poly_uint64 align_in_elems =
1737         exact_div (target_align, elem_size);
1738       tree align_in_elems_minus_1 =
1739         build_int_cst (type, align_in_elems - 1);
1740       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1741
1742       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1743                                  & (align_in_elems - 1)).  */
1744       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1745                                             size_zero_node) < 0;
1746       if (negative)
1747         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1748                              align_in_elems_tree);
1749       else
1750         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1751                              misalign_in_elems);
1752       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1753       iters = fold_convert (niters_type, iters);
1754       unsigned HOST_WIDE_INT align_in_elems_c;
1755       if (align_in_elems.is_constant (&align_in_elems_c))
1756         *bound = align_in_elems_c - 1;
1757       else
1758         *bound = -1;
1759     }
1760
1761   if (dump_enabled_p ())
1762     dump_printf_loc (MSG_NOTE, vect_location,
1763                      "niters for prolog loop: %T\n", iters);
1764
1765   var = create_tmp_var (niters_type, "prolog_loop_niters");
1766   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1767
1768   if (new_stmts)
1769     gimple_seq_add_seq (&stmts, new_stmts);
1770   if (stmts)
1771     {
1772       gcc_assert (single_succ_p (bb));
1773       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1774       if (gsi_end_p (gsi))
1775         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1776       else
1777         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1778     }
1779   return iters_name;
1780 }
1781
1782
1783 /* Function vect_update_init_of_dr
1784
1785    If CODE is PLUS, the vector loop starts NITERS iterations after the
1786    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1787    iterations before the scalar one (using masking to skip inactive
1788    elements).  This function updates the information recorded in DR to
1789    account for the difference.  Specifically, it updates the OFFSET
1790    field of DR_INFO.  */
1791
1792 static void
1793 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1794 {
1795   struct data_reference *dr = dr_info->dr;
1796   tree offset = dr_info->offset;
1797   if (!offset)
1798     offset = build_zero_cst (sizetype);
1799
1800   niters = fold_build2 (MULT_EXPR, sizetype,
1801                         fold_convert (sizetype, niters),
1802                         fold_convert (sizetype, DR_STEP (dr)));
1803   offset = fold_build2 (code, sizetype,
1804                         fold_convert (sizetype, offset), niters);
1805   dr_info->offset = offset;
1806 }
1807
1808
1809 /* Function vect_update_inits_of_drs
1810
1811    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1812    CODE and NITERS are as for vect_update_inits_of_dr.  */
1813
1814 void
1815 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1816                           tree_code code)
1817 {
1818   unsigned int i;
1819   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1820   struct data_reference *dr;
1821
1822   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1823
1824   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1825      here, but since we might use these niters to update the epilogues niters
1826      and data references we can't insert them here as this definition might not
1827      always dominate its uses.  */
1828   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1829     niters = fold_convert (sizetype, niters);
1830
1831   FOR_EACH_VEC_ELT (datarefs, i, dr)
1832     {
1833       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1834       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1835           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1836         vect_update_init_of_dr (dr_info, niters, code);
1837     }
1838 }
1839
1840 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1841    by masking.  This involves calculating the number of iterations to
1842    be peeled and then aligning all memory references appropriately.  */
1843
1844 void
1845 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1846 {
1847   tree misalign_in_elems;
1848   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1849
1850   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1851
1852   /* From the information recorded in LOOP_VINFO get the number of iterations
1853      that need to be skipped via masking.  */
1854   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1855     {
1856       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1857                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1858       misalign_in_elems = build_int_cst (type, misalign);
1859     }
1860   else
1861     {
1862       gimple_seq seq1 = NULL, seq2 = NULL;
1863       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1864       misalign_in_elems = fold_convert (type, misalign_in_elems);
1865       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1866                                                 &seq2, true, NULL_TREE);
1867       gimple_seq_add_seq (&seq1, seq2);
1868       if (seq1)
1869         {
1870           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1871           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1872           gcc_assert (!new_bb);
1873         }
1874     }
1875
1876   if (dump_enabled_p ())
1877     dump_printf_loc (MSG_NOTE, vect_location,
1878                      "misalignment for fully-masked loop: %T\n",
1879                      misalign_in_elems);
1880
1881   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1882
1883   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1884 }
1885
1886 /* This function builds ni_name = number of iterations.  Statements
1887    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1888    it to TRUE if new ssa_var is generated.  */
1889
1890 tree
1891 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1892 {
1893   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1894   if (TREE_CODE (ni) == INTEGER_CST)
1895     return ni;
1896   else
1897     {
1898       tree ni_name, var;
1899       gimple_seq stmts = NULL;
1900       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1901
1902       var = create_tmp_var (TREE_TYPE (ni), "niters");
1903       ni_name = force_gimple_operand (ni, &stmts, false, var);
1904       if (stmts)
1905         {
1906           gsi_insert_seq_on_edge_immediate (pe, stmts);
1907           if (new_var_p != NULL)
1908             *new_var_p = true;
1909         }
1910
1911       return ni_name;
1912     }
1913 }
1914
1915 /* Calculate the number of iterations above which vectorized loop will be
1916    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1917    of prolog loop.  If it's integer const, the integer number is also passed
1918    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1919    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1920    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1921    threshold below which the scalar (rather than vectorized) loop will be
1922    executed.  This function stores the upper bound (inclusive) of the result
1923    in BOUND_SCALAR.  */
1924
1925 static tree
1926 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1927                              int bound_prolog, poly_int64 bound_epilog, int th,
1928                              poly_uint64 *bound_scalar,
1929                              bool check_profitability)
1930 {
1931   tree type = TREE_TYPE (niters_prolog);
1932   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1933                              build_int_cst (type, bound_epilog));
1934
1935   *bound_scalar = bound_prolog + bound_epilog;
1936   if (check_profitability)
1937     {
1938       /* TH indicates the minimum niters of vectorized loop, while we
1939          compute the maximum niters of scalar loop.  */
1940       th--;
1941       /* Peeling for constant times.  */
1942       if (int_niters_prolog >= 0)
1943         {
1944           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1945           return build_int_cst (type, *bound_scalar);
1946         }
1947       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1948          and BOUND_EPILOG are inclusive upper bounds.  */
1949       if (known_ge (th, bound_prolog + bound_epilog))
1950         {
1951           *bound_scalar = th;
1952           return build_int_cst (type, th);
1953         }
1954       /* Need to do runtime comparison.  */
1955       else if (maybe_gt (th, bound_epilog))
1956         {
1957           *bound_scalar = upper_bound (*bound_scalar, th);
1958           return fold_build2 (MAX_EXPR, type,
1959                               build_int_cst (type, th), niters);
1960         }
1961     }
1962   return niters;
1963 }
1964
1965 /* NITERS is the number of times that the original scalar loop executes
1966    after peeling.  Work out the maximum number of iterations N that can
1967    be handled by the vectorized form of the loop and then either:
1968
1969    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1970
1971         niters_vector = N
1972
1973    b) set *STEP_VECTOR_PTR to one and generate:
1974
1975         niters_vector = N / vf
1976
1977    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1978    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1979    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1980
1981 void
1982 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1983                              tree *niters_vector_ptr, tree *step_vector_ptr,
1984                              bool niters_no_overflow)
1985 {
1986   tree ni_minus_gap, var;
1987   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1988   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1990   tree log_vf = NULL_TREE;
1991
1992   /* If epilogue loop is required because of data accesses with gaps, we
1993      subtract one iteration from the total number of iterations here for
1994      correct calculation of RATIO.  */
1995   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1996     {
1997       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1998                                   build_one_cst (type));
1999       if (!is_gimple_val (ni_minus_gap))
2000         {
2001           var = create_tmp_var (type, "ni_gap");
2002           gimple *stmts = NULL;
2003           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2004                                                true, var);
2005           gsi_insert_seq_on_edge_immediate (pe, stmts);
2006         }
2007     }
2008   else
2009     ni_minus_gap = niters;
2010
2011   /* To silence some unexpected warnings, simply initialize to 0. */
2012   unsigned HOST_WIDE_INT const_vf = 0;
2013   if (vf.is_constant (&const_vf)
2014       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2015     {
2016       /* Create: niters >> log2(vf) */
2017       /* If it's known that niters == number of latch executions + 1 doesn't
2018          overflow, we can generate niters >> log2(vf); otherwise we generate
2019          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2020          will be at least one.  */
2021       log_vf = build_int_cst (type, exact_log2 (const_vf));
2022       if (niters_no_overflow)
2023         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2024       else
2025         niters_vector
2026           = fold_build2 (PLUS_EXPR, type,
2027                          fold_build2 (RSHIFT_EXPR, type,
2028                                       fold_build2 (MINUS_EXPR, type,
2029                                                    ni_minus_gap,
2030                                                    build_int_cst (type, vf)),
2031                                       log_vf),
2032                          build_int_cst (type, 1));
2033       step_vector = build_one_cst (type);
2034     }
2035   else
2036     {
2037       niters_vector = ni_minus_gap;
2038       step_vector = build_int_cst (type, vf);
2039     }
2040
2041   if (!is_gimple_val (niters_vector))
2042     {
2043       var = create_tmp_var (type, "bnd");
2044       gimple_seq stmts = NULL;
2045       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2046       gsi_insert_seq_on_edge_immediate (pe, stmts);
2047       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2048          we set range information to make niters analyzer's life easier.
2049          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2050          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2051       if (stmts != NULL && log_vf)
2052         {
2053           if (niters_no_overflow)
2054             {
2055               value_range vr (type,
2056                               wi::one (TYPE_PRECISION (type)),
2057                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2058                                                          TYPE_SIGN (type)),
2059                                           exact_log2 (const_vf),
2060                                           TYPE_SIGN (type)));
2061               set_range_info (niters_vector, vr);
2062             }
2063           /* For VF == 1 the vector IV might also overflow so we cannot
2064              assert a minimum value of 1.  */
2065           else if (const_vf > 1)
2066             {
2067               value_range vr (type,
2068                               wi::one (TYPE_PRECISION (type)),
2069                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2070                                                          TYPE_SIGN (type))
2071                                           - (const_vf - 1),
2072                                           exact_log2 (const_vf), TYPE_SIGN (type))
2073                               + 1);
2074               set_range_info (niters_vector, vr);
2075             }
2076         }
2077     }
2078   *niters_vector_ptr = niters_vector;
2079   *step_vector_ptr = step_vector;
2080
2081   return;
2082 }
2083
2084 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2085    loop specified by LOOP_VINFO after vectorization, compute the number
2086    of iterations before vectorization (niters_vector * vf) and store it
2087    to NITERS_VECTOR_MULT_VF_PTR.  */
2088
2089 static void
2090 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2091                                      tree niters_vector,
2092                                      tree *niters_vector_mult_vf_ptr)
2093 {
2094   /* We should be using a step_vector of VF if VF is variable.  */
2095   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2096   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2097   tree type = TREE_TYPE (niters_vector);
2098   tree log_vf = build_int_cst (type, exact_log2 (vf));
2099   basic_block exit_bb = single_exit (loop)->dest;
2100
2101   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2102   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2103                                             niters_vector, log_vf);
2104   if (!is_gimple_val (niters_vector_mult_vf))
2105     {
2106       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2107       gimple_seq stmts = NULL;
2108       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2109                                                     &stmts, true, var);
2110       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2111       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2112     }
2113   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2114 }
2115
2116 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2117    this function searches for the corresponding lcssa phi node in exit
2118    bb of LOOP.  If it is found, return the phi result; otherwise return
2119    NULL.  */
2120
2121 static tree
2122 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2123                 gphi *lcssa_phi)
2124 {
2125   gphi_iterator gsi;
2126   edge e = single_exit (loop);
2127
2128   gcc_assert (single_pred_p (e->dest));
2129   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2130     {
2131       gphi *phi = gsi.phi ();
2132       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2133                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2134         return PHI_RESULT (phi);
2135     }
2136   return NULL_TREE;
2137 }
2138
2139 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2140    from SECOND/FIRST and puts it at the original loop's preheader/exit
2141    edge, the two loops are arranged as below:
2142
2143        preheader_a:
2144      first_loop:
2145        header_a:
2146          i_1 = PHI<i_0, i_2>;
2147          ...
2148          i_2 = i_1 + 1;
2149          if (cond_a)
2150            goto latch_a;
2151          else
2152            goto between_bb;
2153        latch_a:
2154          goto header_a;
2155
2156        between_bb:
2157          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2158
2159      second_loop:
2160        header_b:
2161          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2162                                  or with i_2 if no LCSSA phi is created
2163                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2164          ...
2165          i_4 = i_3 + 1;
2166          if (cond_b)
2167            goto latch_b;
2168          else
2169            goto exit_bb;
2170        latch_b:
2171          goto header_b;
2172
2173        exit_bb:
2174
2175    This function creates loop closed SSA for the first loop; update the
2176    second loop's PHI nodes by replacing argument on incoming edge with the
2177    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2178    is false, Loop closed ssa phis will only be created for non-iv phis for
2179    the first loop.
2180
2181    This function assumes exit bb of the first loop is preheader bb of the
2182    second loop, i.e, between_bb in the example code.  With PHIs updated,
2183    the second loop will execute rest iterations of the first.  */
2184
2185 static void
2186 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2187                                    class loop *first, class loop *second,
2188                                    bool create_lcssa_for_iv_phis)
2189 {
2190   gphi_iterator gsi_update, gsi_orig;
2191   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2192
2193   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2194   edge second_preheader_e = loop_preheader_edge (second);
2195   basic_block between_bb = single_exit (first)->dest;
2196
2197   gcc_assert (between_bb == second_preheader_e->src);
2198   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2199   /* Either the first loop or the second is the loop to be vectorized.  */
2200   gcc_assert (loop == first || loop == second);
2201
2202   for (gsi_orig = gsi_start_phis (first->header),
2203        gsi_update = gsi_start_phis (second->header);
2204        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2205        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2206     {
2207       gphi *orig_phi = gsi_orig.phi ();
2208       gphi *update_phi = gsi_update.phi ();
2209
2210       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2211       /* Generate lcssa PHI node for the first loop.  */
2212       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2213       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2214       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2215         {
2216           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2217           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2218           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2219           arg = new_res;
2220         }
2221
2222       /* Update PHI node in the second loop by replacing arg on the loop's
2223          incoming edge.  */
2224       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2225     }
2226
2227   /* For epilogue peeling we have to make sure to copy all LC PHIs
2228      for correct vectorization of live stmts.  */
2229   if (loop == first)
2230     {
2231       basic_block orig_exit = single_exit (second)->dest;
2232       for (gsi_orig = gsi_start_phis (orig_exit);
2233            !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2234         {
2235           gphi *orig_phi = gsi_orig.phi ();
2236           tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2237           if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2238             continue;
2239
2240           /* Already created in the above loop.   */
2241           if (find_guard_arg (first, second, orig_phi))
2242             continue;
2243
2244           tree new_res = copy_ssa_name (orig_arg);
2245           gphi *lcphi = create_phi_node (new_res, between_bb);
2246           add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2247         }
2248     }
2249 }
2250
2251 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2252    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2253    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2254    appear like below:
2255
2256        guard_bb:
2257          if (cond)
2258            goto merge_bb;
2259          else
2260            goto skip_loop;
2261
2262      skip_loop:
2263        header_a:
2264          i_1 = PHI<i_0, i_2>;
2265          ...
2266          i_2 = i_1 + 1;
2267          if (cond_a)
2268            goto latch_a;
2269          else
2270            goto exit_a;
2271        latch_a:
2272          goto header_a;
2273
2274        exit_a:
2275          i_5 = PHI<i_2>;
2276
2277        merge_bb:
2278          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2279
2280      update_loop:
2281        header_b:
2282          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2283          ...
2284          i_4 = i_3 + 1;
2285          if (cond_b)
2286            goto latch_b;
2287          else
2288            goto exit_bb;
2289        latch_b:
2290          goto header_b;
2291
2292        exit_bb:
2293
2294    This function creates PHI nodes at merge_bb and replaces the use of i_5
2295    in the update_loop's PHI node with the result of the new PHI.

        The header PHIs of SKIP_LOOP and UPDATE_LOOP are walked in parallel and
        paired purely by position; the walk ends as soon as either header runs
        out of PHIs.  For each pair one PHI is created in the destination block
        of GUARD_EDGE (merge_bb above), with these arguments:

          - on MERGE_EDGE, the value the UPDATE_LOOP PHI currently receives on
            UPDATE_LOOP's preheader edge (i_5 above);
          - on GUARD_EDGE, the value the SKIP_LOOP PHI receives on SKIP_LOOP's
            preheader edge (i_0 above).

        NOTE: nothing here checks that the paired PHIs really describe the same
        variable; presumably one loop is a copy of the other so that their
        header PHIs line up -- confirm at the call sites.  */
2296
2297 static void
2298 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2299                                     class loop *update_loop,
2300                                     edge guard_edge, edge merge_edge)
2301 {
2302   location_t merge_loc, guard_loc;
       /* The PHI arguments on the preheader edges are the values each loop's
          header PHIs receive on entry to the loop.  */
2303   edge orig_e = loop_preheader_edge (skip_loop);
2304   edge update_e = loop_preheader_edge (update_loop);
2305   gphi_iterator gsi_orig, gsi_update;
2306
       /* Step through the header PHIs of both loops together, stopping when
          either sequence is exhausted.  */
2307   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2308         gsi_update = gsi_start_phis (update_loop->header));
2309        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2310        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2311     {
2312       gphi *orig_phi = gsi_orig.phi ();
2313       gphi *update_phi = gsi_update.phi ();
2314
2315       /* Generate new phi node at merge bb of the guard.  Its result is a
                fresh copy of the SSA name defined by ORIG_PHI.  */
2316       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2317       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2318
2319       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2320          args in NEW_PHI for these edges.  The MERGE_EDGE argument is what
                UPDATE_PHI currently takes on UPDATE_LOOP's preheader edge, the
                GUARD_EDGE argument is what ORIG_PHI takes on SKIP_LOOP's
                preheader edge.  Each argument keeps the source location recorded
                for it on the PHI it was read from.  */
2321       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2322       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2323       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2324       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2325       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2326       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2327
2328       /* Update phi in UPDATE_PHI: its argument on UPDATE_LOOP's preheader
                edge becomes NEW_RES.  adjust_phi_and_debug_stmts is defined
                outside this block; judging by its name it also takes care of
                affected debug statements -- confirm there.  */
2329       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2330     }
2331 }
2332
2333 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2334    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2335    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2336    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2337    The CFG looks like:
2338
2339      loop:
2340        header_a:
2341          i_1 = PHI<i_0, i_2>;
2342          ...
2343          i_2 = i_1 + 1;
2344          if (cond_a)
2345            goto latch_a;
2346          else
2347            goto exit_a;
2348        latch_a:
2349          goto header_a;
2350
2351        exit_a:
2352
2353        guard_bb:
2354          if (cond)
2355            goto merge_bb;
2356          else
2357            goto epilog_loop;
2358
2359        ;; fall_through_bb
2360
2361      epilog_loop:
2362        header_b:
2363          i_3 = PHI<i_2, i_4>;
2364          ...
2365          i_4 = i_3 + 1;
2366          if (cond_b)
2367            goto latch_b;
2368          else
2369            goto merge_bb;
2370        latch_b:
2371          goto header_b;
2372
2373        merge_bb:
2374          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2375
2376        exit_bb:
2377          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2378
2379    For each name used outside EPILOG (i.e. for each name that has a lcssa
2380    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2381    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2382    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2383    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2384    in exit_bb will also be updated.

        More precisely, in the code below the MERGE_EDGE argument is the
        current definition (get_current_def) of the original PHI's argument
        when that argument is an SSA name that has one, and the original
        argument itself otherwise.  The GUARD_EDGE argument is looked up with
        find_guard_arg; when that finds nothing the original argument is used
        there as well.  Both arguments are added with UNKNOWN_LOCATION.

        merge_bb is taken to be the destination of GUARD_EDGE.  It must have a
        single successor (exit_bb), which in turn must have merge_bb as its
        only predecessor, and merge_bb must be the block EPILOG's single exit
        edge leads to; all of this is asserted.  */
2385
2386 static void
2387 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2388                                     edge guard_edge, edge merge_edge)
2389 {
2390   gphi_iterator gsi;
2391   basic_block merge_bb = guard_edge->dest;
2392
       /* Check the CFG shape shown above: MERGE_BB falls through to EXIT_BB
          over the single edge E, EXIT_BB has no other predecessor, and that
          predecessor is where EPILOG's single exit edge ends.  */
2393   gcc_assert (single_succ_p (merge_bb));
2394   edge e = single_succ_edge (merge_bb);
2395   basic_block exit_bb = e->dest;
2396   gcc_assert (single_pred_p (exit_bb));
2397   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2398
2399   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2400     {
2401       gphi *update_phi = gsi.phi ();
           /* EXIT_BB has a single predecessor (asserted above), so argument 0
              is the PHI's only argument.  */
2402       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2403
2404       tree merge_arg = NULL_TREE;
2405
2406       /* If the old argument is a SSA_NAME use its current_def.  */
2407       if (TREE_CODE (old_arg) == SSA_NAME)
2408         merge_arg = get_current_def (old_arg);
2409       /* If it's a constant or doesn't have a current_def, just use the old
2410          argument.  */
2411       if (!merge_arg)
2412         merge_arg = old_arg;
2413
2414       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2415       /* If the var is live after loop but not a reduction, we simply
2416          use the old arg.  */
2417       if (!guard_arg)
2418         guard_arg = old_arg;
2419
2420       /* Create new phi node in MERGE_BB:  */
2421       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2422       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2423
2424       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2425          the two PHI args in merge_phi for these edges.  */
2426       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2427       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2428
2429       /* Update the original phi in exit_bb so that its argument on edge E
                (the edge from MERGE_BB) is the result of the new PHI.  */
2430       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2431     }
2432 }
2433
2434 /* EPILOG loop is duplicated from the original loop for vectorizing,
2435    the arg of its loop closed ssa PHI needs to be updated.

        Every PHI in the block that EPILOG's single exit edge leads to has its
        argument on that block's only incoming edge passed to rename_use_op.
        The block is required (asserted) to have exactly one predecessor.
        rename_use_op is defined outside this block; presumably it replaces the
        use with the name's current definition -- confirm there.  */
2436
2437 static void
2438 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2439 {
2440   gphi_iterator gsi;
2441   basic_block exit_bb = single_exit (epilog)->dest;
2442
       /* With a single predecessor, E below is the only edge into EXIT_BB and
          each PHI there has exactly one argument.  */
2443   gcc_assert (single_pred_p (exit_bb));
2444   edge e = EDGE_PRED (exit_bb, 0);
2445   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
         /* The use operand is passed by pointer so it can be rewritten in
            place.  */
2446     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2447 }
2448
2449 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2450    iterate exactly CONST_NITERS times.  Make a final decision about
2451    whether the epilogue loop should be used, returning true if so.

        False is returned straight away, leaving EPILOGUE_VINFO untouched, when
        CONST_NITERS does not exceed LOOP_VINFO_PEELING_FOR_GAPS + 1.
        Otherwise LOOP_VINFO_NITERS and LOOP_VINFO_NITERSM1 of EPILOGUE_VINFO
        are overwritten with the constants CONST_NITERS and CONST_NITERS - 1
        (built in the type of the previous NITERS) and the final answer is
        whatever vect_determine_partial_vectors_and_peeling returns.  The two
        fields stay overwritten even if that answer is false.  */
2452
2453 static bool
2454 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2455                              unsigned HOST_WIDE_INT const_niters)
2456 {
2457   /* Avoid wrap-around when computing const_niters - 1.  Also reject
2458      using an epilogue loop for a single scalar iteration, even if
2459      we could in principle implement that using partial vectors.  */
2460   unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2461   if (const_niters <= gap_niters + 1)
2462     return false;
2463
2464   /* Install the number of iterations.  Past the test above CONST_NITERS
          is nonzero, so CONST_NITERS - 1 cannot wrap.  */
2465   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2466   tree niters_tree = build_int_cst (niters_type, const_niters);
2467   tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2468
2469   LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2470   LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2471
2472   /* Decide what to do if the number of epilogue iterations is not
2473      a multiple of the epilogue loop's vectorization factor.  The meaning
          of the second argument (true) is defined by
          vect_determine_partial_vectors_and_peeling, which is declared
          elsewhere; presumably it says that EPILOGUE_VINFO describes an
          epilogue loop -- confirm against its declaration.  */
2474   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2475 }
2476
2477 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2478    Return a value that equals:
2479
2480    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2481    - SKIP_VALUE when the main loop is skipped.

        The value is the result of a new PHI node created in the destination
        block of LOOP_VINFO's main_loop_edge.  The PHI result is a fresh SSA
        name of MAIN_LOOP_VALUE's type; MAIN_LOOP_VALUE is its argument on
        main_loop_edge and SKIP_VALUE its argument on skip_main_loop_edge.
        Both arguments get UNKNOWN_LOCATION.  main_loop_edge must be set
        (asserted); skip_main_loop_edge is used without a check, so it is
        presumably required to be set and to enter the same block -- confirm
        at the callers.  */
2482
2483 tree
2484 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2485                            tree skip_value)
2486 {
2487   gcc_assert (loop_vinfo->main_loop_edge);
2488
       /* Create the merging PHI in the block MAIN_LOOP_EDGE leads to.  */
2489   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2490   basic_block bb = loop_vinfo->main_loop_edge->dest;
2491   gphi *new_phi = create_phi_node (phi_result, bb);
       /* One argument per way of reaching BB: through the main loop or
          around it.  */
2492   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2493                UNKNOWN_LOCATION);
2494   add_phi_arg (new_phi, skip_value,
2495                loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2496   return phi_result;
2497 }
2498
2499 /* Function vect_do_peeling.
2500
2501    Input:
2502    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2503
2504        preheader:
2505      LOOP:
2506        header_bb:
2507          loop_body
2508          if (exit_loop_cond) goto exit_bb
2509          else                goto header_bb
2510        exit_bb:
2511
2512    - NITERS: The number of iterations of the loop.
2513    - NITERSM1: The number of iterations of the loop's latch.
2514    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2515    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2516                               CHECK_PROFITABILITY is true.
2517    Output:
2518    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2519      iterate after vectorization; see vect_set_loop_condition for details.
2520    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2521      should be set to the number of scalar iterations handled by the
2522      vector loop.  The SSA name is only used on exit from the loop.
2523
2524    This function peels prolog and epilog from the loop, adds guards skipping
2525    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2526    would look like:
2527
2528        guard_bb_1:
2529          if (prefer_scalar_loop) goto merge_bb_1
2530          else                    goto guard_bb_2
2531
2532        guard_bb_2:
2533          if (skip_prolog) goto merge_bb_2
2534          else             goto prolog_preheader
2535
2536        prolog_preheader:
2537      PROLOG:
2538        prolog_header_bb:
2539          prolog_body
2540          if (exit_prolog_cond) goto prolog_exit_bb
2541          else                  goto prolog_header_bb
2542        prolog_exit_bb:
2543
2544        merge_bb_2:
2545
2546        vector_preheader:
2547      VECTOR LOOP:
2548        vector_header_bb:
2549          vector_body
2550          if (exit_vector_cond) goto vector_exit_bb
2551          else                  goto vector_header_bb
2552        vector_exit_bb:
2553
2554        guard_bb_3:
2555          if (skip_epilog) goto merge_bb_3
2556          else             goto epilog_preheader
2557
2558        merge_bb_1:
2559
2560        epilog_preheader:
2561      EPILOG:
2562        epilog_header_bb:
2563          epilog_body
2564          if (exit_epilog_cond) goto merge_bb_3
2565          else                  goto epilog_header_bb
2566
2567        merge_bb_3:
2568
2569    Note this function peels prolog and epilog only if it's necessary,
2570    as well as guards.
2571    This function returns the epilogue loop if a decision was made to vectorize
2572    it, otherwise NULL.
2573
2574    The analysis resulting in this epilogue loop's loop_vec_info was performed
2575    in the same vect_analyze_loop call as the main loop's.  At that time
2576    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2577    vectorization factors than the main loop.  This list is stored in the main
2578    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2579    vectorize the epilogue loop for a lower vectorization factor, the
2580    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2581    updated and linked to the epilogue loop.  This is later used to vectorize
2582    the epilogue.  The reason the loop_vec_info needs updating is that it was
2583    constructed based on the original main loop, and the epilogue loop is a
2584    copy of this loop, so all links pointing to statements in the original loop
2585    need updating.  Furthermore, these loop_vec_infos share the
2586    data_reference's records, which will also need to be updated.
2587
2588    TODO: Guard for prefer_scalar_loop should be emitted along with
2589    versioning conditions if loop versioning is needed.  */
2590
2591
2592 class loop *
2593 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2594                  tree *niters_vector, tree *step_vector,
2595                  tree *niters_vector_mult_vf_var, int th,
2596                  bool check_profitability, bool niters_no_overflow,
2597                  tree *advance)
2598 {
2599   edge e, guard_e;
2600   tree type = TREE_TYPE (niters), guard_cond;
2601   basic_block guard_bb, guard_to;
2602   profile_probability prob_prolog, prob_vector, prob_epilog;
2603   int estimated_vf;
2604   int prolog_peeling = 0;
2605   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2606   bool vect_epilogues_updated_niters = false;
2607   /* We currently do not support prolog peeling if the target alignment is not
2608      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2609      target alignment being constant.  */
2610   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2611   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2612     return NULL;
2613
2614   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2615     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2616
2617   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2618   poly_uint64 bound_epilog = 0;
2619   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2620       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2621     bound_epilog += vf - 1;
2622   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2623     bound_epilog += 1;
2624   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2625   poly_uint64 bound_scalar = bound_epilog;
2626
2627   if (!prolog_peeling && !epilog_peeling)
2628     return NULL;
2629
2630   /* Before doing any peeling make sure to reset debug binds outside of
2631      the loop referring to defs not in LC SSA.  */
2632   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2633   for (unsigned i = 0; i < loop->num_nodes; ++i)
2634     {
2635       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2636       imm_use_iterator ui;
2637       gimple *use_stmt;
2638       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2639            gsi_next (&gsi))
2640         {
2641           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2642             if (gimple_debug_bind_p (use_stmt)
2643                 && loop != gimple_bb (use_stmt)->loop_father
2644                 && !flow_loop_nested_p (loop,
2645                                         gimple_bb (use_stmt)->loop_father))
2646               {
2647                 gimple_debug_bind_reset_value (use_stmt);
2648                 update_stmt (use_stmt);
2649               }
2650         }
2651       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2652            gsi_next (&gsi))
2653         {
2654           ssa_op_iter op_iter;
2655           def_operand_p def_p;
2656           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2657             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2658               if (gimple_debug_bind_p (use_stmt)
2659                   && loop != gimple_bb (use_stmt)->loop_father
2660                   && !flow_loop_nested_p (loop,
2661                                           gimple_bb (use_stmt)->loop_father))
2662                 {
2663                   gimple_debug_bind_reset_value (use_stmt);
2664                   update_stmt (use_stmt);
2665                 }
2666         }
2667     }
2668
2669   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2670   estimated_vf = vect_vf_for_cost (loop_vinfo);
2671   if (estimated_vf == 2)
2672     estimated_vf = 3;
2673   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2674                         .apply_scale (estimated_vf - 1, estimated_vf);
2675
2676   class loop *prolog, *epilog = NULL;
2677   class loop *first_loop = loop;
2678   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2679
2680   /* SSA form needs to be up-to-date since we are going to manually
2681      update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2682      update SSA state after that, so we have to make sure to not lose any
2683      pending update needs.  */
2684   gcc_assert (!need_ssa_update_p (cfun));
2685
2686   /* If we're vectorizing an epilogue loop, we have ensured that the
2687      virtual operand is in SSA form throughout the vectorized main loop.
2688      Normally it is possible to trace the updated
2689      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2690      back to scalar-stmt vuses, meaning that the effect of the SSA update
2691      remains local to the main loop.  However, there are rare cases in
2692      which the vectorized loop should have vdefs even when the original scalar
2693      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2694      introduces clobbers of the temporary vector array, which in turn
2695      needs new vdefs.  If the scalar loop doesn't write to memory, these
2696      new vdefs will be the only ones in the vector loop.
2697      We are currently deferring updating virtual SSA form and creating
2698      of a virtual PHI for this case so we do not have to make sure the
2699      newly introduced virtual def is in LCSSA form.  */
2700
2701   if (MAY_HAVE_DEBUG_BIND_STMTS)
2702     {
2703       gcc_assert (!adjust_vec.exists ());
2704       adjust_vec.create (32);
2705     }
2706   initialize_original_copy_tables ();
2707
2708   /* Record the anchor bb at which the guard should be placed if the scalar
2709      loop might be preferred.  */
2710   basic_block anchor = loop_preheader_edge (loop)->src;
2711
2712   /* Generate the number of iterations for the prolog loop.  We do this here
2713      so that we can also get the upper bound on the number of iterations.  */
2714   tree niters_prolog;
2715   int bound_prolog = 0;
2716   if (prolog_peeling)
2717     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2718                                                   &bound_prolog);
2719   else
2720     niters_prolog = build_int_cst (type, 0);
2721
2722   loop_vec_info epilogue_vinfo = NULL;
2723   if (vect_epilogues)
2724     {
2725       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2726       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2727     }
2728
2729   tree niters_vector_mult_vf = NULL_TREE;
2730   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2731   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2732   edge update_e = NULL, skip_e = NULL;
2733   unsigned int lowest_vf = constant_lower_bound (vf);
2734   /* If we know the number of scalar iterations for the main loop we should
2735      check whether after the main loop there are enough iterations left over
2736      for the epilogue.  */
2737   if (vect_epilogues
2738       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2739       && prolog_peeling >= 0
2740       && known_eq (vf, lowest_vf))
2741     {
2742       unsigned HOST_WIDE_INT eiters
2743         = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2744            - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2745
2746       eiters -= prolog_peeling;
2747       eiters
2748         = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2749
2750       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2751         {
2752           delete epilogue_vinfo;
2753           epilogue_vinfo = NULL;
2754           if (loop_vinfo->epilogue_vinfos.length () == 0)
2755             {
2756               vect_epilogues = false;
2757               break;
2758             }
2759           epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2760           loop_vinfo->epilogue_vinfos.ordered_remove (0);
2761         }
2762       vect_epilogues_updated_niters = true;
2763     }
2764   /* Prolog loop may be skipped.  */
2765   bool skip_prolog = (prolog_peeling != 0);
2766   /* Skip this loop to epilog when there are not enough iterations to enter this
2767      vectorized loop.  If true we should perform runtime checks on the NITERS
2768      to check whether we should skip the current vectorized loop.  If we know
2769      the number of scalar iterations we may choose to add a runtime check if
2770      this number "maybe" smaller than the number of iterations required to
2771      enter this loop, i.e. when it cannot be shown at compile time to be at
2772      least that large; for this we use the sum of the upper bounds on the
2773      prolog and epilog peeling.  When we
2774      don't know the number of iterations and don't require versioning it is
2775      because we have asserted that there are enough scalar iterations to enter
2776      the main loop, so this skip is not necessary.  When we are versioning then
2777      we only add such a skip if we have chosen to vectorize the epilogue.  */
2778   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2779                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2780                                   bound_prolog + bound_epilog)
2781                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2782                          || vect_epilogues));
2783   /* Epilog loop must be executed if the number of iterations for epilog
2784      loop is known at compile time, otherwise we need to add a check at
2785      the end of vector loop and skip to the end of epilog loop.  */
2786   bool skip_epilog = (prolog_peeling < 0
2787                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2788                       || !vf.is_constant ());
2789   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2790   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2791     skip_epilog = false;
2792
2793   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2794   auto_vec<profile_count> original_counts;
2795   basic_block *original_bbs = NULL;
2796
2797   if (skip_vector)
2798     {
2799       split_edge (loop_preheader_edge (loop));
2800
2801       if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2802         {
2803           original_bbs = get_loop_body (loop);
2804           for (unsigned int i = 0; i < loop->num_nodes; i++)
2805             original_counts.safe_push(original_bbs[i]->count);
2806         }
2807
2808       /* Due to the order in which we peel prolog and epilog, we first
2809          propagate probability to the whole loop.  The purpose is to
2810          avoid adjusting probabilities of both prolog and vector loops
2811          separately.  Note in this case, the probability of epilog loop
2812          needs to be scaled back later.  */
2813       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2814       if (prob_vector.initialized_p ())
2815         {
2816           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2817           scale_loop_profile (loop, prob_vector, 0);
2818         }
2819     }
2820
2821   dump_user_location_t loop_loc = find_loop_location (loop);
2822   if (vect_epilogues)
2823     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2824        use the original scalar loop as remaining epilogue if necessary.  */
2825     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2826       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2827
2828   if (prolog_peeling)
2829     {
2830       e = loop_preheader_edge (loop);
2831       if (!slpeel_can_duplicate_loop_p (loop, e))
2832         {
2833           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2834                            "loop can't be duplicated to preheader edge.\n");
2835           gcc_unreachable ();
2836         }
2837       /* Peel prolog and put it on preheader edge of loop.  */
2838       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2839       if (!prolog)
2840         {
2841           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2842                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2843           gcc_unreachable ();
2844         }
2845       prolog->force_vectorize = false;
2846       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2847       first_loop = prolog;
2848       reset_original_copy_tables ();
2849
2850       /* Update the number of iterations for prolog loop.  */
2851       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2852       vect_set_loop_condition (prolog, NULL, niters_prolog,
2853                                step_prolog, NULL_TREE, false);
2854
2855       /* Skip the prolog loop.  */
2856       if (skip_prolog)
2857         {
2858           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2859                                     niters_prolog, build_int_cst (type, 0));
2860           guard_bb = loop_preheader_edge (prolog)->src;
2861           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2862           guard_to = split_edge (loop_preheader_edge (loop));
2863           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2864                                            guard_to, guard_bb,
2865                                            prob_prolog.invert (),
2866                                            irred_flag);
2867           e = EDGE_PRED (guard_to, 0);
2868           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2869           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2870
2871           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2872           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2873         }
2874
2875       /* Update init address of DRs.  */
2876       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2877       /* Update niters for vector loop.  */
2878       LOOP_VINFO_NITERS (loop_vinfo)
2879         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2880       LOOP_VINFO_NITERSM1 (loop_vinfo)
2881         = fold_build2 (MINUS_EXPR, type,
2882                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2883       bool new_var_p = false;
2884       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2885       /* It's guaranteed that vector loop bound before vectorization is at
2886          least VF, so set range information for newly generated var.  */
2887       if (new_var_p)
2888         {
2889           value_range vr (type,
2890                           wi::to_wide (build_int_cst (type, vf)),
2891                           wi::to_wide (TYPE_MAX_VALUE (type)));
2892           set_range_info (niters, vr);
2893         }
2894
2895       /* Prolog iterates at most bound_prolog times, latch iterates at
2896          most bound_prolog - 1 times.  */
2897       record_niter_bound (prolog, bound_prolog - 1, false, true);
2898       delete_update_ssa ();
2899       adjust_vec_debug_stmts ();
2900       scev_reset ();
2901     }
2902
2903   if (epilog_peeling)
2904     {
2905       e = single_exit (loop);
2906       if (!slpeel_can_duplicate_loop_p (loop, e))
2907         {
2908           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2909                            "loop can't be duplicated to exit edge.\n");
2910           gcc_unreachable ();
2911         }
2912       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2913          said epilog then we should use a copy of the main loop as a starting
2914          point.  This loop may have already had some preliminary transformations
2915          to allow for more optimal vectorization, for example if-conversion.
2916          If we are not vectorizing the epilog then we should use the scalar loop
2917          as the transformations mentioned above make less or no sense when not
2918          vectorizing.  */
2919       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2920       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2921       if (!epilog)
2922         {
2923           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2924                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2925           gcc_unreachable ();
2926         }
2927       epilog->force_vectorize = false;
2928       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2929
2930       /* Scalar version loop may be preferred.  In this case, add guard
2931          and skip to epilog.  Note this only happens when the number of
2932          iterations of loop is unknown at compile time, otherwise this
2933          won't be vectorized.  */
2934       if (skip_vector)
2935         {
2936           /* Additional epilogue iteration is peeled if gap exists.  */
2937           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2938                                                 bound_prolog, bound_epilog,
2939                                                 th, &bound_scalar,
2940                                                 check_profitability);
2941           /* Build guard against NITERSM1 since NITERS may overflow.  */
2942           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2943           guard_bb = anchor;
2944           guard_to = split_edge (loop_preheader_edge (epilog));
2945           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2946                                            guard_to, guard_bb,
2947                                            prob_vector.invert (),
2948                                            irred_flag);
2949           skip_e = guard_e;
2950           e = EDGE_PRED (guard_to, 0);
2951           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2952           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2953
2954           /* Simply propagate profile info from guard_bb to guard_to which is
2955              a merge point of control flow.  */
2956           guard_to->count = guard_bb->count;
2957
2958           /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
2959           if (vect_epilogues || scalar_loop == NULL)
2960             {
2961               gcc_assert(epilog->num_nodes == loop->num_nodes);
2962               basic_block *bbs = get_loop_body (epilog);
2963               for (unsigned int i = 0; i < epilog->num_nodes; i++)
2964                 {
2965                   gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
2966                   bbs[i]->count = original_counts[i];
2967                 }
2968               free (bbs);
2969               free (original_bbs);
2970             }
2971         }
2972
2973       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2974       /* If loop is peeled for non-zero constant times, now niters refers to
2975          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2976          overflows.  */
2977       niters_no_overflow |= (prolog_peeling > 0);
2978       vect_gen_vector_loop_niters (loop_vinfo, niters,
2979                                    niters_vector, step_vector,
2980                                    niters_no_overflow);
2981       if (!integer_onep (*step_vector))
2982         {
2983           /* On exit from the loop we will have an easy way of calculating
2984              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2985              until then.  */
2986           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2987           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2988           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2989         }
2990       else
2991         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2992                                              &niters_vector_mult_vf);
2993       /* Update IVs of original loop as if they were advanced by
2994          niters_vector_mult_vf steps.  */
2995       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2996       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2997       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2998                                         update_e);
2999
3000       if (skip_epilog)
3001         {
3002           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3003                                     niters, niters_vector_mult_vf);
3004           guard_bb = single_exit (loop)->dest;
3005           guard_to = split_edge (single_exit (epilog));
3006           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3007                                            skip_vector ? anchor : guard_bb,
3008                                            prob_epilog.invert (),
3009                                            irred_flag);
3010           if (vect_epilogues)
3011             epilogue_vinfo->skip_this_loop_edge = guard_e;
3012           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3013                                               single_exit (epilog));
3014           /* Only need to handle basic block before epilog loop if it's not
3015              the guard_bb, which is the case when skip_vector is true.  */
3016           if (guard_bb != bb_before_epilog)
3017             {
3018               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3019
3020               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3021             }
3022           scale_loop_profile (epilog, prob_epilog, 0);
3023         }
3024       else
3025         slpeel_update_phi_nodes_for_lcssa (epilog);
3026
3027       unsigned HOST_WIDE_INT bound;
3028       if (bound_scalar.is_constant (&bound))
3029         {
3030           gcc_assert (bound != 0);
3031           /* -1 to convert loop iterations to latch iterations.  */
3032           record_niter_bound (epilog, bound - 1, false, true);
3033         }
3034
3035       delete_update_ssa ();
3036       adjust_vec_debug_stmts ();
3037       scev_reset ();
3038     }
3039
3040   if (vect_epilogues)
3041     {
3042       epilog->aux = epilogue_vinfo;
3043       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3044
3045       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3046
3047       /* We now must calculate the number of NITERS performed by the previous
3048          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3049       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3050                                  niters_prolog, niters_vector_mult_vf);
3051
3052       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3053          determine whether we are coming from the previous vectorized loop
3054          using the update_e edge or the skip_vector basic block using the
3055          skip_e edge.  */
3056       if (skip_vector)
3057         {
3058           gcc_assert (update_e != NULL
3059                       && skip_e != NULL
3060                       && !vect_epilogues_updated_niters);
3061           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3062                                            update_e->dest);
3063           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3064           gimple *stmt = gimple_build_assign (new_ssa, niters);
3065           gimple_stmt_iterator gsi;
3066           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3067               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3068             {
3069               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3070               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3071             }
3072           else
3073             {
3074               gsi = gsi_last_bb (update_e->src);
3075               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3076             }
3077
3078           niters = new_ssa;
3079           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3080           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3081                        UNKNOWN_LOCATION);
3082           niters = PHI_RESULT (new_phi);
3083           epilogue_vinfo->main_loop_edge = update_e;
3084           epilogue_vinfo->skip_main_loop_edge = skip_e;
3085         }
3086
3087       /* Set ADVANCE to the number of iterations performed by the previous
3088          loop and its prologue.  */
3089       *advance = niters;
3090
3091       if (!vect_epilogues_updated_niters)
3092         {
3093           /* Subtract the number of iterations performed by the vectorized loop
3094              from the number of total iterations.  */
3095           tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3096                                               before_loop_niters,
3097                                               niters);
3098
3099           LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3100           LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3101             = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3102                            epilogue_niters,
3103                            build_one_cst (TREE_TYPE (epilogue_niters)));
3104
3105           /* Decide what to do if the number of epilogue iterations is not
3106              a multiple of the epilogue loop's vectorization factor.
3107              We should have rejected the loop during the analysis phase
3108              if this fails.  */
3109           if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3110                                                            true))
3111             gcc_unreachable ();
3112         }
3113     }
3114
3115   adjust_vec.release ();
3116   free_original_copy_tables ();
3117
3118   return vect_epilogues ? epilog : NULL;
3119 }
3120
3121 /* Function vect_create_cond_for_niters_checks.
3122
3123    Create a conditional expression that represents the run-time checks for
3124    loop's niter.  The loop is guaranteed to terminate if the run-time
3125    checks hold.
3126
3127    Input:
3128    COND_EXPR  - input conditional expression.  New conditions will be chained
3129                 with logical AND operation.  If it is NULL, then the function
3130                 is used to return the number of alias checks.
3131    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3132                 to be checked.
3133
3134    Output:
3135    COND_EXPR - conditional expression.
3136
3137    The returned COND_EXPR is the conditional expression to be used in the
3138    if statement that controls which version of the loop gets executed at
3139    runtime.  */
3140
3141 static void
3142 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3143 {
3144   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3145
3146   if (*cond_expr)
3147     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3148                               *cond_expr, part_cond_expr);
3149   else
3150     *cond_expr = part_cond_expr;
3151 }
3152
3153 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3154    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3155
3156 static void
3157 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3158 {
3159   if (*cond_expr)
3160     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3161                               *cond_expr, part_cond_expr);
3162   else
3163     *cond_expr = part_cond_expr;
3164 }
3165
3166 /* Function vect_create_cond_for_align_checks.
3167
3168    Create a conditional expression that represents the alignment checks for
3169    all of data references (array element references) whose alignment must be
3170    checked at runtime.
3171
3172    Input:
3173    COND_EXPR  - input conditional expression.  New conditions will be chained
3174                 with logical AND operation.
3175    LOOP_VINFO - two fields of the loop information are used.
3176                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3177                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3178
3179    Output:
3180    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3181                          expression.
3182    The returned value is the conditional expression to be used in the if
3183    statement that controls which version of the loop gets executed at runtime.
3184
3185    The algorithm makes two assumptions:
3186      1) The number of bytes "n" in a vector is a power of 2.
3187      2) An address "a" is aligned if a%n is zero and that this
3188         test can be done as a&(n-1) == 0.  For example, for 16
3189         byte vectors the test is a&0xf == 0.  */
3190
3191 static void
3192 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3193                                    tree *cond_expr,
3194                                    gimple_seq *cond_expr_stmt_list)
3195 {
3196   const vec<stmt_vec_info> &may_misalign_stmts
3197     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3198   stmt_vec_info stmt_info;
3199   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3200   tree mask_cst;
3201   unsigned int i;
3202   tree int_ptrsize_type;
3203   char tmp_name[20];
3204   tree or_tmp_name = NULL_TREE;
3205   tree and_tmp_name;
3206   gimple *and_stmt;
3207   tree ptrsize_zero;
3208   tree part_cond_expr;
3209
3210   /* Check that mask is one less than a power of 2, i.e., mask is
3211      all zeros followed by all ones.  */
3212   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3213
3214   int_ptrsize_type = signed_type_for (ptr_type_node);
3215
3216   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3217      of the first vector of the i'th data reference. */
3218
3219   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3220     {
3221       gimple_seq new_stmt_list = NULL;
3222       tree addr_base;
3223       tree addr_tmp_name;
3224       tree new_or_tmp_name;
3225       gimple *addr_stmt, *or_stmt;
3226       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3227       bool negative = tree_int_cst_compare
3228         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3229       tree offset = negative
3230         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3231                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3232         : size_zero_node;
3233
3234       /* create: addr_tmp = (int)(address_of_first_vector) */
3235       addr_base =
3236         vect_create_addr_base_for_vector_ref (loop_vinfo,
3237                                               stmt_info, &new_stmt_list,
3238                                               offset);
3239       if (new_stmt_list != NULL)
3240         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3241
3242       sprintf (tmp_name, "addr2int%d", i);
3243       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3244       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3245       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3246
3247       /* The addresses are OR together.  */
3248
3249       if (or_tmp_name != NULL_TREE)
3250         {
3251           /* create: or_tmp = or_tmp | addr_tmp */
3252           sprintf (tmp_name, "orptrs%d", i);
3253           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3254           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3255                                          or_tmp_name, addr_tmp_name);
3256           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3257           or_tmp_name = new_or_tmp_name;
3258         }
3259       else
3260         or_tmp_name = addr_tmp_name;
3261
3262     } /* end for i */
3263
3264   mask_cst = build_int_cst (int_ptrsize_type, mask);
3265
3266   /* create: and_tmp = or_tmp & mask  */
3267   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3268
3269   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3270                                   or_tmp_name, mask_cst);
3271   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3272
3273   /* Make and_tmp the left operand of the conditional test against zero.
3274      if and_tmp has a nonzero bit then some address is unaligned.  */
3275   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3276   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3277                                 and_tmp_name, ptrsize_zero);
3278   chain_cond_expr (cond_expr, part_cond_expr);
3279 }
3280
3281 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3282    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3283    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3284    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3285
3286 static void
3287 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3288 {
3289   const vec<vec_object_pair> &pairs
3290     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3291   unsigned int i;
3292   vec_object_pair *pair;
3293   FOR_EACH_VEC_ELT (pairs, i, pair)
3294     {
3295       tree addr1 = build_fold_addr_expr (pair->first);
3296       tree addr2 = build_fold_addr_expr (pair->second);
3297       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3298                                          addr1, addr2);
3299       chain_cond_expr (cond_expr, part_cond_expr);
3300     }
3301 }
3302
3303 /* Create an expression that is true when all lower-bound conditions for
3304    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3305
3306 static void
3307 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3308 {
3309   const vec<vec_lower_bound> &lower_bounds
3310     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3311   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3312     {
3313       tree expr = lower_bounds[i].expr;
3314       tree type = unsigned_type_for (TREE_TYPE (expr));
3315       expr = fold_convert (type, expr);
3316       poly_uint64 bound = lower_bounds[i].min_value;
3317       if (!lower_bounds[i].unsigned_p)
3318         {
3319           expr = fold_build2 (PLUS_EXPR, type, expr,
3320                               build_int_cstu (type, bound - 1));
3321           bound += bound - 1;
3322         }
3323       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3324                                          build_int_cstu (type, bound));
3325       chain_cond_expr (cond_expr, part_cond_expr);
3326     }
3327 }
3328
3329 /* Function vect_create_cond_for_alias_checks.
3330
3331    Create a conditional expression that represents the run-time checks for
3332    overlapping of address ranges represented by a list of data references
3333    relations passed as input.
3334
3335    Input:
3336    COND_EXPR  - input conditional expression.  New conditions will be chained
3337                 with logical AND operation.  If it is NULL, then the function
3338                 is used to return the number of alias checks.
3339    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3340                 to be checked.
3341
3342    Output:
3343    COND_EXPR - conditional expression.
3344
3345    The returned COND_EXPR is the conditional expression to be used in the if
3346    statement that controls which version of the loop gets executed at runtime.
3347 */
3348
3349 void
3350 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3351 {
3352   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3353     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3354
3355   if (comp_alias_ddrs.is_empty ())
3356     return;
3357
3358   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3359                                &comp_alias_ddrs, cond_expr);
3360   if (dump_enabled_p ())
3361     dump_printf_loc (MSG_NOTE, vect_location,
3362                      "created %u versioning for alias checks.\n",
3363                      comp_alias_ddrs.length ());
3364 }
3365
3366
3367 /* Function vect_loop_versioning.
3368
3369    If the loop has data references that may or may not be aligned or/and
3370    has data reference relations whose independence was not proven then
3371    two versions of the loop need to be generated, one which is vectorized
3372    and one which isn't.  A test is then generated to control which of the
3373    loops is executed.  The test checks for the alignment of all of the
3374    data references that may or may not be aligned.  An additional
3375    sequence of runtime tests is generated for each pairs of DDRs whose
3376    independence was not proven.  The vectorized version of loop is
3377    executed only if both alias and alignment tests are passed.
3378
3379    The test generated to check which version of loop is executed
3380    is modified to also check for profitability as indicated by the
3381    cost model threshold TH.
3382
3383    The versioning precondition(s) are placed in *COND_EXPR and
3384    *COND_EXPR_STMT_LIST.  */
3385
3386 class loop *
3387 vect_loop_versioning (loop_vec_info loop_vinfo,
3388                       gimple *loop_vectorized_call)
3389 {
3390   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3391   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3392   basic_block condition_bb;
3393   gphi_iterator gsi;
3394   gimple_stmt_iterator cond_exp_gsi;
3395   basic_block merge_bb;
3396   basic_block new_exit_bb;
3397   edge new_exit_e, e;
3398   gphi *orig_phi, *new_phi;
3399   tree cond_expr = NULL_TREE;
3400   gimple_seq cond_expr_stmt_list = NULL;
3401   tree arg;
3402   profile_probability prob = profile_probability::likely ();
3403   gimple_seq gimplify_stmt_list = NULL;
3404   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3405   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3406   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3407   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3408   poly_uint64 versioning_threshold
3409     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3410   tree version_simd_if_cond
3411     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3412   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3413
3414   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3415       && !ordered_p (th, versioning_threshold))
3416     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3417                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3418                                             th - 1));
3419   if (maybe_ne (versioning_threshold, 0U))
3420     {
3421       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3422                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3423                                               versioning_threshold - 1));
3424       if (cond_expr)
3425         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3426                                  expr, cond_expr);
3427       else
3428         cond_expr = expr;
3429     }
3430
3431   tree cost_name = NULL_TREE;
3432   profile_probability prob2 = profile_probability::uninitialized ();
3433   if (cond_expr
3434       && !integer_truep (cond_expr)
3435       && (version_niter
3436           || version_align
3437           || version_alias
3438           || version_simd_if_cond))
3439     {
3440       cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3441                                                       &cond_expr_stmt_list,
3442                                                       is_gimple_val, NULL_TREE);
3443       /* Split prob () into two so that the overall probability of passing
3444          both the cost-model and versioning checks is the orig prob.  */
3445       prob2 = prob.split (prob);
3446     }
3447
3448   if (version_niter)
3449     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3450
3451   if (cond_expr)
3452     {
3453       gimple_seq tem = NULL;
3454       cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3455                                           &tem, is_gimple_condexpr_for_cond,
3456                                           NULL_TREE);
3457       gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3458     }
3459
3460   if (version_align)
3461     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3462                                        &cond_expr_stmt_list);
3463
3464   if (version_alias)
3465     {
3466       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3467       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3468       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3469     }
3470
3471   if (version_simd_if_cond)
3472     {
3473       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3474       if (flag_checking)
3475         if (basic_block bb
3476             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3477           gcc_assert (bb != loop->header
3478                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3479                       && (scalar_loop == NULL
3480                           || (bb != scalar_loop->header
3481                               && dominated_by_p (CDI_DOMINATORS,
3482                                                  scalar_loop->header, bb))));
3483       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3484       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3485                             version_simd_if_cond, zero);
3486       if (cond_expr)
3487         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3488                                  c, cond_expr);
3489       else
3490         cond_expr = c;
3491       if (dump_enabled_p ())
3492         dump_printf_loc (MSG_NOTE, vect_location,
3493                          "created versioning for simd if condition check.\n");
3494     }
3495
3496   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3497                                       &gimplify_stmt_list,
3498                                       is_gimple_condexpr_for_cond, NULL_TREE);
3499   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3500
3501   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3502      invariant in.  */
3503   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3504   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3505        !gsi_end_p (gsi); gsi_next (&gsi))
3506     {
3507       gimple *stmt = gsi_stmt (gsi);
3508       update_stmt (stmt);
3509       ssa_op_iter iter;
3510       use_operand_p use_p;
3511       basic_block def_bb;
3512       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3513         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3514             && flow_bb_inside_loop_p (outermost, def_bb))
3515           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3516     }
3517
3518   /* Search for the outermost loop we can version.  Avoid versioning of
3519      non-perfect nests but allow if-conversion versioned loops inside.  */
3520   class loop *loop_to_version = loop;
3521   if (flow_loop_nested_p (outermost, loop))
3522     {
3523       if (dump_enabled_p ())
3524         dump_printf_loc (MSG_NOTE, vect_location,
3525                          "trying to apply versioning to outer loop %d\n",
3526                          outermost->num);
3527       if (outermost->num == 0)
3528         outermost = superloop_at_depth (loop, 1);
3529       /* And avoid applying versioning on non-perfect nests.  */
3530       while (loop_to_version != outermost
3531              && (e = single_exit (loop_outer (loop_to_version)))
3532              && !(e->flags & EDGE_COMPLEX)
3533              && (!loop_outer (loop_to_version)->inner->next
3534                  || vect_loop_vectorized_call (loop_to_version))
3535              && (!loop_outer (loop_to_version)->inner->next
3536                  || !loop_outer (loop_to_version)->inner->next->next))
3537         loop_to_version = loop_outer (loop_to_version);
3538     }
3539
3540   /* Apply versioning.  If there is already a scalar version created by
3541      if-conversion re-use that.  Note we cannot re-use the copy of
3542      an if-converted outer-loop when vectorizing the inner loop only.  */
3543   gcond *cond;
3544   if ((!loop_to_version->inner || loop == loop_to_version)
3545       && loop_vectorized_call)
3546     {
3547       gcc_assert (scalar_loop);
3548       condition_bb = gimple_bb (loop_vectorized_call);
3549       cond = as_a <gcond *> (last_stmt (condition_bb));
3550       gimple_cond_set_condition_from_tree (cond, cond_expr);
3551       update_stmt (cond);
3552
3553       if (cond_expr_stmt_list)
3554         {
3555           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3556           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3557                                  GSI_SAME_STMT);
3558         }
3559
3560       /* if-conversion uses profile_probability::always () for both paths,
3561          reset the paths probabilities appropriately.  */
3562       edge te, fe;
3563       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3564       te->probability = prob;
3565       fe->probability = prob.invert ();
3566       /* We can scale loops counts immediately but have to postpone
3567          scaling the scalar loop because we re-use it during peeling.  */
3568       scale_loop_frequencies (loop_to_version, te->probability);
3569       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3570
3571       nloop = scalar_loop;
3572       if (dump_enabled_p ())
3573         dump_printf_loc (MSG_NOTE, vect_location,
3574                          "reusing %sloop version created by if conversion\n",
3575                          loop_to_version != loop ? "outer " : "");
3576     }
3577   else
3578     {
3579       if (loop_to_version != loop
3580           && dump_enabled_p ())
3581         dump_printf_loc (MSG_NOTE, vect_location,
3582                          "applying loop versioning to outer loop %d\n",
3583                          loop_to_version->num);
3584
3585       unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3586
3587       initialize_original_copy_tables ();
3588       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3589                             prob, prob.invert (), prob, prob.invert (), true);
3590       gcc_assert (nloop);
3591       nloop = get_loop_copy (loop);
3592
3593       /* For cycle vectorization with SLP we rely on the PHI arguments
3594          appearing in the same order as the SLP node operands which for the
3595          loop PHI nodes means the preheader edge dest index needs to remain
3596          the same for the analyzed loop which also becomes the vectorized one.
3597          Make it so in case the state after versioning differs by redirecting
3598          the first edge into the header to the same destination which moves
3599          it last.  */
3600       if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3601         {
3602           edge e = EDGE_PRED (loop->header, 0);
3603           ssa_redirect_edge (e, e->dest);
3604           flush_pending_stmts (e);
3605         }
3606       gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3607
3608       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3609          reap those otherwise;  they also refer to the original
3610          loops.  */
3611       class loop *l = loop;
3612       while (gimple *call = vect_loop_vectorized_call (l))
3613         {
3614           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3615           fold_loop_internal_call (call, boolean_false_node);
3616           l = loop_outer (l);
3617         }
3618       free_original_copy_tables ();
3619
3620       if (cond_expr_stmt_list)
3621         {
3622           cond_exp_gsi = gsi_last_bb (condition_bb);
3623           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3624                                  GSI_SAME_STMT);
3625         }
3626
3627       /* Loop versioning violates an assumption we try to maintain during
3628          vectorization - that the loop exit block has a single predecessor.
3629          After versioning, the exit block of both loop versions is the same
3630          basic block (i.e. it has two predecessors). Just in order to simplify
3631          following transformations in the vectorizer, we fix this situation
3632          here by adding a new (empty) block on the exit-edge of the loop,
3633          with the proper loop-exit phis to maintain loop-closed-form.
3634          If loop versioning wasn't done from loop, but scalar_loop instead,
3635          merge_bb will have already just a single successor.  */
3636
3637       merge_bb = single_exit (loop_to_version)->dest;
3638       if (EDGE_COUNT (merge_bb->preds) >= 2)
3639         {
3640           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3641           new_exit_bb = split_edge (single_exit (loop_to_version));
3642           new_exit_e = single_exit (loop_to_version);
3643           e = EDGE_SUCC (new_exit_bb, 0);
3644
3645           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3646                gsi_next (&gsi))
3647             {
3648               tree new_res;
3649               orig_phi = gsi.phi ();
3650               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3651               new_phi = create_phi_node (new_res, new_exit_bb);
3652               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3653               add_phi_arg (new_phi, arg, new_exit_e,
3654                            gimple_phi_arg_location_from_edge (orig_phi, e));
3655               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3656             }
3657         }
3658
3659       update_ssa (TODO_update_ssa_no_phi);
3660     }
3661
3662   /* Split the cost model check off to a separate BB.  Costing assumes
3663      this is the only thing we perform when we enter the scalar loop
3664      from a failed cost decision.  */
3665   if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3666     {
3667       gimple *def = SSA_NAME_DEF_STMT (cost_name);
3668       /* All uses of the cost check are 'true' after the check we
3669          are going to insert.  */
3670       replace_uses_by (cost_name, boolean_true_node);
3671       /* And we're going to build the new single use of it.  */
3672       gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3673                                        NULL_TREE, NULL_TREE);
3674       edge e = split_block (gimple_bb (def), def);
3675       gimple_stmt_iterator gsi = gsi_for_stmt (def);
3676       gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3677       edge true_e, false_e;
3678       extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3679       e->flags &= ~EDGE_FALLTHRU;
3680       e->flags |= EDGE_TRUE_VALUE;
3681       edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3682       e->probability = prob2;
3683       e2->probability = prob2.invert ();
3684       set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3685       auto_vec<basic_block, 3> adj;
3686       for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3687            son;
3688            son = next_dom_son (CDI_DOMINATORS, son))
3689         if (EDGE_COUNT (son->preds) > 1)
3690           adj.safe_push (son);
3691       for (auto son : adj)
3692         set_immediate_dominator (CDI_DOMINATORS, son, e->src);
3693     }
3694
3695   if (version_niter)
3696     {
3697       /* The versioned loop could be infinite, we need to clear existing
3698          niter information which is copied from the original loop.  */
3699       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3700       vect_free_loop_info_assumptions (nloop);
3701     }
3702
3703   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3704       && dump_enabled_p ())
3705     {
3706       if (version_alias)
3707         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3708                          vect_location,
3709                          "loop versioned for vectorization because of "
3710                          "possible aliasing\n");
3711       if (version_align)
3712         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3713                          vect_location,
3714                          "loop versioned for vectorization to enhance "
3715                          "alignment\n");
3716
3717     }
3718
3719   return nloop;
3720 }