gcc/tree-vect-loop-manip.cc

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53
  54 /*************************************************************************
  55   Simple Loop Peeling Utilities
  56
  57   Utilities to support loop peeling for vectorization purposes.
  58  *************************************************************************/
  59
  60
  61 /* Renames the use *OP_P.  */
  62
  63 static void
  64 rename_use_op (use_operand_p op_p)
  65 {
  66   tree new_name;
  67
  68   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  69     return;
  70
  71   new_name = get_current_def (USE_FROM_PTR (op_p));
  72
  73   /* Something defined outside of the loop.  */
  74   if (!new_name)
  75     return;
  76
  77   /* An ordinary ssa name defined in the loop.  */
  78
  79   SET_USE (op_p, new_name);
  80 }
  81
  82
  83 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  84    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  85    true.  */
  86
  87 static void
  88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  89 {
  90   gimple *stmt;
  91   use_operand_p use_p;
  92   ssa_op_iter iter;
  93   edge e;
  94   edge_iterator ei;
  95   class loop *loop = bb->loop_father;
  96   class loop *outer_loop = NULL;
  97
  98   if (rename_from_outer_loop)
  99     {
 100       gcc_assert (loop);
 101       outer_loop = loop_outer (loop);
 102     }
 103
 104   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 105        gsi_next (&gsi))
 106     {
 107       stmt = gsi_stmt (gsi);
 108       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 109         rename_use_op (use_p);
 110     }
 111
 112   FOR_EACH_EDGE (e, ei, bb->preds)
 113     {
 114       if (!flow_bb_inside_loop_p (loop, e->src))
 115         {
 116           if (!rename_from_outer_loop)
 117             continue;
 118           if (e->src != outer_loop->header)
 119             {
 120               if (outer_loop->inner->next)
 121                 {
 122                   /* If outer_loop has 2 inner loops, allow there to
 123                      be an extra basic block which decides which of the
 124                      two loops to use using LOOP_VECTORIZED.  */
 125                   if (!single_pred_p (e->src)
 126                       || single_pred (e->src) != outer_loop->header)
 127                     continue;
 128                 }
 129             }
 130         }
 131       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 132            gsi_next (&gsi))
 133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 134     }
 135 }
 136
 137
 138 struct adjust_info
 139 {
 140   tree from, to;
 141   basic_block bb;
 142 };
 143
 144 /* A stack of values to be adjusted in debug stmts.  We have to
 145    process them LIFO, so that the closest substitution applies.  If we
 146    processed them FIFO, without the stack, we might substitute uses
 147    with a PHI DEF that would soon become non-dominant, and when we got
 148    to the suitable one, it wouldn't have anything to substitute any
 149    more.  */
 150 static vec<adjust_info, va_heap> adjust_vec;
 151
 152 /* Adjust any debug stmts that referenced AI->from values to use the
 153    loop-closed AI->to, if the references are dominated by AI->bb and
 154    not by the definition of AI->from.  */
 155
 156 static void
 157 adjust_debug_stmts_now (adjust_info *ai)
 158 {
 159   basic_block bbphi = ai->bb;
 160   tree orig_def = ai->from;
 161   tree new_def = ai->to;
 162   imm_use_iterator imm_iter;
 163   gimple *stmt;
 164   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 165
 166   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 167
 168   /* Adjust any debug stmts that held onto non-loop-closed
 169      references.  */
 170   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 171     {
 172       use_operand_p use_p;
 173       basic_block bbuse;
 174
 175       if (!is_gimple_debug (stmt))
 176         continue;
 177
 178       gcc_assert (gimple_debug_bind_p (stmt));
 179
 180       bbuse = gimple_bb (stmt);
 181
 182       if ((bbuse == bbphi
 183            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 184           && !(bbuse == bbdef
 185                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 186         {
 187           if (new_def)
 188             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 189               SET_USE (use_p, new_def);
 190           else
 191             {
 192               gimple_debug_bind_reset_value (stmt);
 193               update_stmt (stmt);
 194             }
 195         }
 196     }
 197 }
 198
 199 /* Adjust debug stmts as scheduled before.  */
 200
 201 static void
 202 adjust_vec_debug_stmts (void)
 203 {
 204   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 205     return;
 206
 207   gcc_assert (adjust_vec.exists ());
 208
 209   while (!adjust_vec.is_empty ())
 210     {
 211       adjust_debug_stmts_now (&adjust_vec.last ());
 212       adjust_vec.pop ();
 213     }
 214 }
 215
 216 /* Adjust any debug stmts that referenced FROM values to use the
 217    loop-closed TO, if the references are dominated by BB and not by
 218    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 219    will be postponed until adjust_vec_debug_stmts is called.  */
 220
 221 static void
 222 adjust_debug_stmts (tree from, tree to, basic_block bb)
 223 {
 224   adjust_info ai;
 225
 226   if (MAY_HAVE_DEBUG_BIND_STMTS
 227       && TREE_CODE (from) == SSA_NAME
 228       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 229       && ! virtual_operand_p (from))
 230     {
 231       ai.from = from;
 232       ai.to = to;
 233       ai.bb = bb;
 234
 235       if (adjust_vec.exists ())
 236         adjust_vec.safe_push (ai);
 237       else
 238         adjust_debug_stmts_now (&ai);
 239     }
 240 }
 241
 242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 243    to adjust any debug stmts that referenced the old phi arg,
 244    presumably non-loop-closed references left over from other
 245    transformations.  */
 246
 247 static void
 248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 249 {
 250   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 251
 252   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 253
 254   if (MAY_HAVE_DEBUG_BIND_STMTS)
 255     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 256                         gimple_bb (update_phi));
 257 }
 258
 259 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 260    that the control should have during the first iteration and NEXT_CTRL is the
 261    value that it should have on subsequent iterations.  */
 262
 263 static void
 264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 265                        tree next_ctrl)
 266 {
 267   gphi *phi = create_phi_node (ctrl, loop->header);
 268   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 269   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 270 }
 271
 272 /* Add SEQ to the end of LOOP's preheader block.  */
 273
 274 static void
 275 add_preheader_seq (class loop *loop, gimple_seq seq)
 276 {
 277   if (seq)
 278     {
 279       edge pe = loop_preheader_edge (loop);
 280       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 281       gcc_assert (!new_bb);
 282     }
 283 }
 284
 285 /* Add SEQ to the beginning of LOOP's header block.  */
 286
 287 static void
 288 add_header_seq (class loop *loop, gimple_seq seq)
 289 {
 290   if (seq)
 291     {
 292       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 293       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 294     }
 295 }
 296
 297 /* Return true if the target can interleave elements of two vectors.
 298    OFFSET is 0 if the first half of the vectors should be interleaved
 299    or 1 if the second half should.  When returning true, store the
 300    associated permutation in INDICES.  */
 301
 302 static bool
 303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 304                         unsigned int offset)
 305 {
 306   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 307   poly_uint64 base = exact_div (nelts, 2) * offset;
 308   vec_perm_builder sel (nelts, 2, 3);
 309   for (unsigned int i = 0; i < 3; ++i)
 310     {
 311       sel.quick_push (base + i);
 312       sel.quick_push (base + i + nelts);
 313     }
 314   indices->new_vector (sel, 2, nelts);
 315   return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
 316                                *indices);
 317 }
 318
 319 /* Try to use permutes to define the masks in DEST_RGM using the masks
 320    in SRC_RGM, given that the former has twice as many masks as the
 321    latter.  Return true on success, adding any new statements to SEQ.  */
 322
 323 static bool
 324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 325                                rgroup_controls *src_rgm)
 326 {
 327   tree src_masktype = src_rgm->type;
 328   tree dest_masktype = dest_rgm->type;
 329   machine_mode src_mode = TYPE_MODE (src_masktype);
 330   insn_code icode1, icode2;
 331   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 332       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 333                                   src_mode)) != CODE_FOR_nothing
 334       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 335                                   src_mode)) != CODE_FOR_nothing)
 336     {
 337       /* Unpacking the source masks gives at least as many mask bits as
 338          we need.  We can then VIEW_CONVERT any excess bits away.  */
 339       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 340       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 341       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 342       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 343         {
 344           tree src = src_rgm->controls[i / 2];
 345           tree dest = dest_rgm->controls[i];
 346           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 347                             ? VEC_UNPACK_HI_EXPR
 348                             : VEC_UNPACK_LO_EXPR);
 349           gassign *stmt;
 350           if (dest_masktype == unpack_masktype)
 351             stmt = gimple_build_assign (dest, code, src);
 352           else
 353             {
 354               tree temp = make_ssa_name (unpack_masktype);
 355               stmt = gimple_build_assign (temp, code, src);
 356               gimple_seq_add_stmt (seq, stmt);
 357               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 358                                           build1 (VIEW_CONVERT_EXPR,
 359                                                   dest_masktype, temp));
 360             }
 361           gimple_seq_add_stmt (seq, stmt);
 362         }
 363       return true;
 364     }
 365   vec_perm_indices indices[2];
 366   if (dest_masktype == src_masktype
 367       && interleave_supported_p (&indices[0], src_masktype, 0)
 368       && interleave_supported_p (&indices[1], src_masktype, 1))
 369     {
 370       /* The destination requires twice as many mask bits as the source, so
 371          we can use interleaving permutes to double up the number of bits.  */
 372       tree masks[2];
 373       for (unsigned int i = 0; i < 2; ++i)
 374         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 375       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 376         {
 377           tree src = src_rgm->controls[i / 2];
 378           tree dest = dest_rgm->controls[i];
 379           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 380                                               src, src, masks[i & 1]);
 381           gimple_seq_add_stmt (seq, stmt);
 382         }
 383       return true;
 384     }
 385   return false;
 386 }
 387
 388 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 389    for all the rgroup controls in RGC and return a control that is nonzero
 390    when the loop needs to iterate.  Add any new preheader statements to
 391    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
 392
 393    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 394    times and has been vectorized according to LOOP_VINFO.
 395
 396    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 397    starts with NITERS_SKIP dummy iterations of the scalar loop before
 398    the real work starts.  The mask elements for these dummy iterations
 399    must be 0, to ensure that the extra iterations do not have an effect.
 400
 401    It is known that:
 402
 403      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 404
 405    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 406    variable that starts at 0 and has step:
 407
 408      VF * RGC->max_nscalars_per_iter * RGC->factor
 409
 410    might overflow before hitting a value above:
 411
 412      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 413
 414    This means that we cannot guarantee that such an induction variable
 415    would ever hit a value that produces a set of all-false masks or zero
 416    lengths for RGC.
 417
 418    Note: the cost of the code generated by this function is modeled
 419    by vect_estimate_min_profitable_iters, so changes here may need
 420    corresponding changes there.  */
 421
 422 static tree
 423 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 424                                  gimple_seq *preheader_seq,
 425                                  gimple_seq *header_seq,
 426                                  gimple_stmt_iterator loop_cond_gsi,
 427                                  rgroup_controls *rgc, tree niters,
 428                                  tree niters_skip, bool might_wrap_p)
 429 {
 430   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 431   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 432   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 433
 434   tree ctrl_type = rgc->type;
 435   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 436   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 437   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 438   tree length_limit = NULL_TREE;
 439   /* For length, we need length_limit to ensure length in range.  */
 440   if (!use_masks_p)
 441     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 442
 443   /* Calculate the maximum number of item values that the rgroup
 444      handles in total, the number that it handles for each iteration
 445      of the vector loop, and the number that it should skip during the
 446      first iteration of the vector loop.  */
 447   tree nitems_total = niters;
 448   tree nitems_step = build_int_cst (iv_type, vf);
 449   tree nitems_skip = niters_skip;
 450   if (nitems_per_iter != 1)
 451     {
 452       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 453          these multiplications don't overflow.  */
 454       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 455       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 456       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 457                                    nitems_total, compare_factor);
 458       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 459                                   nitems_step, iv_factor);
 460       if (nitems_skip)
 461         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 462                                     nitems_skip, compare_factor);
 463     }
 464
 465   /* Create an induction variable that counts the number of items
 466      processed.  */
 467   tree index_before_incr, index_after_incr;
 468   gimple_stmt_iterator incr_gsi;
 469   bool insert_after;
 470   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 471   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
 472              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 473
 474   tree zero_index = build_int_cst (compare_type, 0);
 475   tree test_index, test_limit, first_limit;
 476   gimple_stmt_iterator *test_gsi;
 477   if (might_wrap_p)
 478     {
 479       /* In principle the loop should stop iterating once the incremented
 480          IV reaches a value greater than or equal to:
 481
 482            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 483
 484          However, there's no guarantee that this addition doesn't overflow
 485          the comparison type, or that the IV hits a value above it before
 486          wrapping around.  We therefore adjust the limit down by one
 487          IV step:
 488
 489            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 490            -[infinite-prec] NITEMS_STEP
 491
 492          and compare the IV against this limit _before_ incrementing it.
 493          Since the comparison type is unsigned, we actually want the
 494          subtraction to saturate at zero:
 495
 496            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 497            -[sat] NITEMS_STEP
 498
 499          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 500
 501            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 502
 503          where the rightmost subtraction can be done directly in
 504          COMPARE_TYPE.  */
 505       test_index = index_before_incr;
 506       tree adjust = gimple_convert (preheader_seq, compare_type,
 507                                     nitems_step);
 508       if (nitems_skip)
 509         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 510                                adjust, nitems_skip);
 511       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 512                                  nitems_total, adjust);
 513       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 514                                  test_limit, adjust);
 515       test_gsi = &incr_gsi;
 516
 517       /* Get a safe limit for the first iteration.  */
 518       if (nitems_skip)
 519         {
 520           /* The first vector iteration can handle at most NITEMS_STEP
 521              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 522              NITEMS_SKIP to that cannot overflow.  */
 523           tree const_limit = build_int_cst (compare_type,
 524                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 525                                             * nitems_per_iter);
 526           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 527                                       nitems_total, const_limit);
 528           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 529                                       first_limit, nitems_skip);
 530         }
 531       else
 532         /* For the first iteration it doesn't matter whether the IV hits
 533            a value above NITEMS_TOTAL.  That only matters for the latch
 534            condition.  */
 535         first_limit = nitems_total;
 536     }
 537   else
 538     {
 539       /* Test the incremented IV, which will always hit a value above
 540          the bound before wrapping.  */
 541       test_index = index_after_incr;
 542       test_limit = nitems_total;
 543       if (nitems_skip)
 544         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 545                                    test_limit, nitems_skip);
 546       test_gsi = &loop_cond_gsi;
 547
 548       first_limit = test_limit;
 549     }
 550
 551   /* Convert the IV value to the comparison type (either a no-op or
 552      a demotion).  */
 553   gimple_seq test_seq = NULL;
 554   test_index = gimple_convert (&test_seq, compare_type, test_index);
 555   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 556
 557   /* Provide a definition of each control in the group.  */
 558   tree next_ctrl = NULL_TREE;
 559   tree ctrl;
 560   unsigned int i;
 561   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 562     {
 563       /* Previous controls will cover BIAS items.  This control covers the
 564          next batch.  */
 565       poly_uint64 bias = nitems_per_ctrl * i;
 566       tree bias_tree = build_int_cst (compare_type, bias);
 567
 568       /* See whether the first iteration of the vector loop is known
 569          to have a full control.  */
 570       poly_uint64 const_limit;
 571       bool first_iteration_full
 572         = (poly_int_tree_p (first_limit, &const_limit)
 573            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 574
 575       /* Rather than have a new IV that starts at BIAS and goes up to
 576          TEST_LIMIT, prefer to use the same 0-based IV for each control
 577          and adjust the bound down by BIAS.  */
 578       tree this_test_limit = test_limit;
 579       if (i != 0)
 580         {
 581           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 582                                           compare_type, this_test_limit,
 583                                           bias_tree);
 584           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 585                                           compare_type, this_test_limit,
 586                                           bias_tree);
 587         }
 588
 589       /* Create the initial control.  First include all items that
 590          are within the loop limit.  */
 591       tree init_ctrl = NULL_TREE;
 592       if (!first_iteration_full)
 593         {
 594           tree start, end;
 595           if (first_limit == test_limit)
 596             {
 597               /* Use a natural test between zero (the initial IV value)
 598                  and the loop limit.  The "else" block would be valid too,
 599                  but this choice can avoid the need to load BIAS_TREE into
 600                  a register.  */
 601               start = zero_index;
 602               end = this_test_limit;
 603             }
 604           else
 605             {
 606               /* FIRST_LIMIT is the maximum number of items handled by the
 607                  first iteration of the vector loop.  Test the portion
 608                  associated with this control.  */
 609               start = bias_tree;
 610               end = first_limit;
 611             }
 612
 613           if (use_masks_p)
 614             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 615                                         start, end, "max_mask");
 616           else
 617             {
 618               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 619               gimple_seq seq = vect_gen_len (init_ctrl, start,
 620                                              end, length_limit);
 621               gimple_seq_add_seq (preheader_seq, seq);
 622             }
 623         }
 624
 625       /* Now AND out the bits that are within the number of skipped
 626          items.  */
 627       poly_uint64 const_skip;
 628       if (nitems_skip
 629           && !(poly_int_tree_p (nitems_skip, &const_skip)
 630                && known_le (const_skip, bias)))
 631         {
 632           gcc_assert (use_masks_p);
 633           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 634                                                     bias_tree, nitems_skip);
 635           if (init_ctrl)
 636             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 637                                       init_ctrl, unskipped_mask);
 638           else
 639             init_ctrl = unskipped_mask;
 640         }
 641
 642       if (!init_ctrl)
 643         {
 644           /* First iteration is full.  */
 645           if (use_masks_p)
 646             init_ctrl = build_minus_one_cst (ctrl_type);
 647           else
 648             init_ctrl = length_limit;
 649         }
 650
 651       /* Get the control value for the next iteration of the loop.  */
 652       if (use_masks_p)
 653         {
 654           gimple_seq stmts = NULL;
 655           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 656                                       this_test_limit, "next_mask");
 657           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 658         }
 659       else
 660         {
 661           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 662           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 663                                          length_limit);
 664           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 665         }
 666
 667       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 668     }
 669
 670   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 671   if (partial_load_bias != 0)
 672     {
 673       tree adjusted_len = rgc->bias_adjusted_ctrl;
 674       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
 675                                             rgc->controls[0],
 676                                             build_int_cst
 677                                             (TREE_TYPE (rgc->controls[0]),
 678                                              partial_load_bias));
 679       gimple_seq_add_stmt (header_seq, minus);
 680     }
 681
 682   return next_ctrl;
 683 }
 684
 685 /* Set up the iteration condition and rgroup controls for LOOP, given
 686    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 687    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 688    the number of iterations of the original scalar loop that should be
 689    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 690    for vect_set_loop_condition.
 691
 692    Insert the branch-back condition before LOOP_COND_GSI and return the
 693    final gcond.  */
 694
 695 static gcond *
 696 vect_set_loop_condition_partial_vectors (class loop *loop,
 697                                          loop_vec_info loop_vinfo, tree niters,
 698                                          tree final_iv, bool niters_maybe_zero,
 699                                          gimple_stmt_iterator loop_cond_gsi)
 700 {
 701   gimple_seq preheader_seq = NULL;
 702   gimple_seq header_seq = NULL;
 703
 704   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 705   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 706   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 707   tree orig_niters = niters;
 708
 709   /* Type of the initial value of NITERS.  */
 710   tree ni_actual_type = TREE_TYPE (niters);
 711   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 712   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 713
 714   /* Convert NITERS to the same size as the compare.  */
 715   if (compare_precision > ni_actual_precision
 716       && niters_maybe_zero)
 717     {
 718       /* We know that there is always at least one iteration, so if the
 719          count is zero then it must have wrapped.  Cope with this by
 720          subtracting 1 before the conversion and adding 1 to the result.  */
 721       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 722       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 723                              niters, build_minus_one_cst (ni_actual_type));
 724       niters = gimple_convert (&preheader_seq, compare_type, niters);
 725       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 726                              niters, build_one_cst (compare_type));
 727     }
 728   else
 729     niters = gimple_convert (&preheader_seq, compare_type, niters);
 730
 731   /* Iterate over all the rgroups and fill in their controls.  We could use
 732      the first control from any rgroup for the loop condition; here we
 733      arbitrarily pick the last.  */
 734   tree test_ctrl = NULL_TREE;
 735   rgroup_controls *rgc;
 736   unsigned int i;
 737   auto_vec<rgroup_controls> *controls = use_masks_p
 738                                           ? &LOOP_VINFO_MASKS (loop_vinfo)
 739                                           : &LOOP_VINFO_LENS (loop_vinfo);
 740   FOR_EACH_VEC_ELT (*controls, i, rgc)
 741     if (!rgc->controls.is_empty ())
 742       {
 743         /* First try using permutes.  This adds a single vector
 744            instruction to the loop for each mask, but needs no extra
 745            loop invariants or IVs.  */
 746         unsigned int nmasks = i + 1;
 747         if (use_masks_p && (nmasks & 1) == 0)
 748           {
 749             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 750             if (!half_rgc->controls.is_empty ()
 751                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 752               continue;
 753           }
 754
 755         /* See whether zero-based IV would ever generate all-false masks
 756            or zero length before wrapping around.  */
 757         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 758
 759         /* Set up all controls for this group.  */
 760         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 761                                                      &preheader_seq,
 762                                                      &header_seq,
 763                                                      loop_cond_gsi, rgc,
 764                                                      niters, niters_skip,
 765                                                      might_wrap_p);
 766       }
 767
 768   /* Emit all accumulated statements.  */
 769   add_preheader_seq (loop, preheader_seq);
 770   add_header_seq (loop, header_seq);
 771
 772   /* Get a boolean result that tells us whether to iterate.  */
 773   edge exit_edge = single_exit (loop);
 774   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 775   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 776   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
 777                                         NULL_TREE, NULL_TREE);
 778   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 779
 780   /* The loop iterates (NITERS - 1) / VF + 1 times.
 781      Subtract one from this to get the latch count.  */
 782   tree step = build_int_cst (compare_type,
 783                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 784   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 785                                        build_minus_one_cst (compare_type));
 786   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 787                                      niters_minus_one, step);
 788
 789   if (final_iv)
 790     {
 791       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 792       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 793     }
 794
 795   return cond_stmt;
 796 }
 797
 798 /* Like vect_set_loop_condition, but handle the case in which the vector
 799    loop handles exactly VF scalars per iteration.  */
 800
 801 static gcond *
 802 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
 803                                 tree final_iv, bool niters_maybe_zero,
 804                                 gimple_stmt_iterator loop_cond_gsi)
 805 {
 806   tree indx_before_incr, indx_after_incr;
 807   gcond *cond_stmt;
 808   gcond *orig_cond;
 809   edge pe = loop_preheader_edge (loop);
 810   edge exit_edge = single_exit (loop);
 811   gimple_stmt_iterator incr_gsi;
 812   bool insert_after;
 813   enum tree_code code;
 814   tree niters_type = TREE_TYPE (niters);
 815
 816   orig_cond = get_loop_exit_condition (loop);
 817   gcc_assert (orig_cond);
 818   loop_cond_gsi = gsi_for_stmt (orig_cond);
 819
 820   tree init, limit;
 821   if (!niters_maybe_zero && integer_onep (step))
 822     {
 823       /* In this case we can use a simple 0-based IV:
 824
 825          A:
 826            x = 0;
 827            do
 828              {
 829                ...
 830                x += 1;
 831              }
 832            while (x < NITERS);  */
 833       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 834       init = build_zero_cst (niters_type);
 835       limit = niters;
 836     }
 837   else
 838     {
 839       /* The following works for all values of NITERS except 0:
 840
 841          B:
 842            x = 0;
 843            do
 844              {
 845                ...
 846                x += STEP;
 847              }
 848            while (x <= NITERS - STEP);
 849
 850          so that the loop continues to iterate if x + STEP - 1 < NITERS
 851          but stops if x + STEP - 1 >= NITERS.
 852
 853          However, if NITERS is zero, x never hits a value above NITERS - STEP
 854          before wrapping around.  There are two obvious ways of dealing with
 855          this:
 856
 857          - start at STEP - 1 and compare x before incrementing it
 858          - start at -1 and compare x after incrementing it
 859
 860          The latter is simpler and is what we use.  The loop in this case
 861          looks like:
 862
 863          C:
 864            x = -1;
 865            do
 866              {
 867                ...
 868                x += STEP;
 869              }
 870            while (x < NITERS - STEP);
 871
 872          In both cases the loop limit is NITERS - STEP.  */
 873       gimple_seq seq = NULL;
 874       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 875       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 876       if (seq)
 877         {
 878           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 879           gcc_assert (!new_bb);
 880         }
 881       if (niters_maybe_zero)
 882         {
 883           /* Case C.  */
 884           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 885           init = build_all_ones_cst (niters_type);
 886         }
 887       else
 888         {
 889           /* Case B.  */
 890           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 891           init = build_zero_cst (niters_type);
 892         }
 893     }
 894
 895   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 896   create_iv (init, step, NULL_TREE, loop,
 897              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 898   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 899                                               true, NULL_TREE, true,
 900                                               GSI_SAME_STMT);
 901   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 902                                      true, GSI_SAME_STMT);
 903
 904   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 905                                  NULL_TREE);
 906
 907   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 908
 909   /* Record the number of latch iterations.  */
 910   if (limit == niters)
 911     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 912        latch count.  */
 913     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 914                                        build_int_cst (niters_type, 1));
 915   else
 916     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 917        Subtract one from this to get the latch count.  */
 918     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 919                                        limit, step);
 920
 921   if (final_iv)
 922     {
 923       gassign *assign;
 924       edge exit = single_exit (loop);
 925       gcc_assert (single_pred_p (exit->dest));
 926       tree phi_dest
 927         = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
 928       /* Make sure to maintain LC SSA form here and elide the subtraction
 929          if the value is zero.  */
 930       gphi *phi = create_phi_node (phi_dest, exit->dest);
 931       add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
 932       if (!integer_zerop (init))
 933         {
 934           assign = gimple_build_assign (final_iv, MINUS_EXPR,
 935                                         phi_dest, init);
 936           gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
 937           gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
 938         }
 939     }
 940
 941   return cond_stmt;
 942 }
 943
 944 /* If we're using fully-masked loops, make LOOP iterate:
 945
 946       N == (NITERS - 1) / STEP + 1
 947
 948    times.  When NITERS is zero, this is equivalent to making the loop
 949    execute (1 << M) / STEP times, where M is the precision of NITERS.
 950    NITERS_MAYBE_ZERO is true if this last case might occur.
 951
 952    If we're not using fully-masked loops, make LOOP iterate:
 953
 954       N == (NITERS - STEP) / STEP + 1
 955
 956    times, where NITERS is known to be outside the range [1, STEP - 1].
 957    This is equivalent to making the loop execute NITERS / STEP times
 958    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 959    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 960
 961    If FINAL_IV is nonnull, it is an SSA name that should be set to
 962    N * STEP on exit from the loop.
 963
 964    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 965
 966 void
 967 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 968                          tree niters, tree step, tree final_iv,
 969                          bool niters_maybe_zero)
 970 {
 971   gcond *cond_stmt;
 972   gcond *orig_cond = get_loop_exit_condition (loop);
 973   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 974
 975   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
 976     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
 977                                                          niters, final_iv,
 978                                                          niters_maybe_zero,
 979                                                          loop_cond_gsi);
 980   else
 981     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
 982                                                 niters_maybe_zero,
 983                                                 loop_cond_gsi);
 984
 985   /* Remove old loop exit test.  */
 986   stmt_vec_info orig_cond_info;
 987   if (loop_vinfo
 988       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 989     loop_vinfo->remove_stmt (orig_cond_info);
 990   else
 991     gsi_remove (&loop_cond_gsi, true);
 992
 993   if (dump_enabled_p ())
 994     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 995                      (gimple *) cond_stmt);
 996 }
 997
 998 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 999    For all PHI arguments in FROM->dest and TO->dest from those
1000    edges ensure that TO->dest PHI arguments have current_def
1001    to that in from.  */
1002
1003 static void
1004 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1005 {
1006   gimple_stmt_iterator gsi_from, gsi_to;
1007
1008   for (gsi_from = gsi_start_phis (from->dest),
1009        gsi_to = gsi_start_phis (to->dest);
1010        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1011     {
1012       gimple *from_phi = gsi_stmt (gsi_from);
1013       gimple *to_phi = gsi_stmt (gsi_to);
1014       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1015       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1016       if (virtual_operand_p (from_arg))
1017         {
1018           gsi_next (&gsi_from);
1019           continue;
1020         }
1021       if (virtual_operand_p (to_arg))
1022         {
1023           gsi_next (&gsi_to);
1024           continue;
1025         }
1026       if (TREE_CODE (from_arg) != SSA_NAME)
1027         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1028       else if (TREE_CODE (to_arg) == SSA_NAME
1029                && from_arg != to_arg)
1030         {
1031           if (get_current_def (to_arg) == NULL_TREE)
1032             {
1033               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1034                                               TREE_TYPE (get_current_def
1035                                                            (from_arg))));
1036               set_current_def (to_arg, get_current_def (from_arg));
1037             }
1038         }
1039       gsi_next (&gsi_from);
1040       gsi_next (&gsi_to);
1041     }
1042
1043   gphi *from_phi = get_virtual_phi (from->dest);
1044   gphi *to_phi = get_virtual_phi (to->dest);
1045   if (from_phi)
1046     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1047                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1048 }
1049
1050
1051 /* Given LOOP this function generates a new copy of it and puts it
1052    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1053    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1054    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1055    entry or exit of LOOP.  */
1056
1057 class loop *
1058 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1059                                         class loop *scalar_loop, edge e)
1060 {
1061   class loop *new_loop;
1062   basic_block *new_bbs, *bbs, *pbbs;
1063   bool at_exit;
1064   bool was_imm_dom;
1065   basic_block exit_dest;
1066   edge exit, new_exit;
1067   bool duplicate_outer_loop = false;
1068
1069   exit = single_exit (loop);
1070   at_exit = (e == exit);
1071   if (!at_exit && e != loop_preheader_edge (loop))
1072     return NULL;
1073
1074   if (scalar_loop == NULL)
1075     scalar_loop = loop;
1076
1077   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078   pbbs = bbs + 1;
1079   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1080   /* Allow duplication of outer loops.  */
1081   if (scalar_loop->inner)
1082     duplicate_outer_loop = true;
1083   /* Check whether duplication is possible.  */
1084   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1085     {
1086       free (bbs);
1087       return NULL;
1088     }
1089
1090   /* Generate new loop structure.  */
1091   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1092   duplicate_subloops (scalar_loop, new_loop);
1093
1094   exit_dest = exit->dest;
1095   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1096                                           exit_dest) == loop->header ?
1097                  true : false);
1098
1099   /* Also copy the pre-header, this avoids jumping through hoops to
1100      duplicate the loop entry PHI arguments.  Create an empty
1101      pre-header unconditionally for this.  */
1102   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1103   edge entry_e = single_pred_edge (preheader);
1104   bbs[0] = preheader;
1105   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1106
1107   exit = single_exit (scalar_loop);
1108   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1109             &exit, 1, &new_exit, NULL,
1110             at_exit ? loop->latch : e->src, true);
1111   exit = single_exit (loop);
1112   basic_block new_preheader = new_bbs[0];
1113
1114   /* Before installing PHI arguments make sure that the edges
1115      into them match that of the scalar loop we analyzed.  This
1116      makes sure the SLP tree matches up between the main vectorized
1117      loop and the epilogue vectorized copies.  */
1118   if (single_succ_edge (preheader)->dest_idx
1119       != single_succ_edge (new_bbs[0])->dest_idx)
1120     {
1121       basic_block swap_bb = new_bbs[1];
1122       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1123       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1124       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1125       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1126     }
1127   if (duplicate_outer_loop)
1128     {
1129       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1130       if (loop_preheader_edge (scalar_loop)->dest_idx
1131           != loop_preheader_edge (new_inner_loop)->dest_idx)
1132         {
1133           basic_block swap_bb = new_inner_loop->header;
1134           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1135           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1136           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1137           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1138         }
1139     }
1140
1141   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1142
1143   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1144   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1145     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1146
1147   if (scalar_loop != loop)
1148     {
1149       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1150          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1151          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1152          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1153          header) to have current_def set, so copy them over.  */
1154       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1155                                                 exit);
1156       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1157                                                            0),
1158                                                 EDGE_SUCC (loop->latch, 0));
1159     }
1160
1161   if (at_exit) /* Add the loop copy at exit.  */
1162     {
1163       if (scalar_loop != loop)
1164         {
1165           gphi_iterator gsi;
1166           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1167
1168           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1169                gsi_next (&gsi))
1170             {
1171               gphi *phi = gsi.phi ();
1172               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1173               location_t orig_locus
1174                 = gimple_phi_arg_location_from_edge (phi, e);
1175
1176               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1177             }
1178         }
1179       redirect_edge_and_branch_force (e, new_preheader);
1180       flush_pending_stmts (e);
1181       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1182       if (was_imm_dom || duplicate_outer_loop)
1183         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1184
1185       /* And remove the non-necessary forwarder again.  Keep the other
1186          one so we have a proper pre-header for the loop at the exit edge.  */
1187       redirect_edge_pred (single_succ_edge (preheader),
1188                           single_pred (preheader));
1189       delete_basic_block (preheader);
1190       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1191                                loop_preheader_edge (scalar_loop)->src);
1192     }
1193   else /* Add the copy at entry.  */
1194     {
1195       if (scalar_loop != loop)
1196         {
1197           /* Remove the non-necessary forwarder of scalar_loop again.  */
1198           redirect_edge_pred (single_succ_edge (preheader),
1199                               single_pred (preheader));
1200           delete_basic_block (preheader);
1201           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1202                                    loop_preheader_edge (scalar_loop)->src);
1203           preheader = split_edge (loop_preheader_edge (loop));
1204           entry_e = single_pred_edge (preheader);
1205         }
1206
1207       redirect_edge_and_branch_force (entry_e, new_preheader);
1208       flush_pending_stmts (entry_e);
1209       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1210
1211       redirect_edge_and_branch_force (new_exit, preheader);
1212       flush_pending_stmts (new_exit);
1213       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1214
1215       /* And remove the non-necessary forwarder again.  Keep the other
1216          one so we have a proper pre-header for the loop at the exit edge.  */
1217       redirect_edge_pred (single_succ_edge (new_preheader),
1218                           single_pred (new_preheader));
1219       delete_basic_block (new_preheader);
1220       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1221                                loop_preheader_edge (new_loop)->src);
1222     }
1223
1224   if (scalar_loop != loop)
1225     {
1226       /* Update new_loop->header PHIs, so that on the preheader
1227          edge they are the ones from loop rather than scalar_loop.  */
1228       gphi_iterator gsi_orig, gsi_new;
1229       edge orig_e = loop_preheader_edge (loop);
1230       edge new_e = loop_preheader_edge (new_loop);
1231
1232       for (gsi_orig = gsi_start_phis (loop->header),
1233            gsi_new = gsi_start_phis (new_loop->header);
1234            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1235            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1236         {
1237           gphi *orig_phi = gsi_orig.phi ();
1238           gphi *new_phi = gsi_new.phi ();
1239           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1240           location_t orig_locus
1241             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1242
1243           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1244         }
1245     }
1246
1247   free (new_bbs);
1248   free (bbs);
1249
1250   checking_verify_dominators (CDI_DOMINATORS);
1251
1252   return new_loop;
1253 }
1254
1255
1256 /* Given the condition expression COND, put it as the last statement of
1257    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1258    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1259    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1260    new edge as irreducible if IRREDUCIBLE_P is true.  */
1261
1262 static edge
1263 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1264                        basic_block guard_to, basic_block dom_bb,
1265                        profile_probability probability, bool irreducible_p)
1266 {
1267   gimple_stmt_iterator gsi;
1268   edge new_e, enter_e;
1269   gcond *cond_stmt;
1270   gimple_seq gimplify_stmt_list = NULL;
1271
1272   enter_e = EDGE_SUCC (guard_bb, 0);
1273   enter_e->flags &= ~EDGE_FALLTHRU;
1274   enter_e->flags |= EDGE_FALSE_VALUE;
1275   gsi = gsi_last_bb (guard_bb);
1276
1277   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1278                                  is_gimple_condexpr_for_cond, NULL_TREE);
1279   if (gimplify_stmt_list)
1280     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1281
1282   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1283   gsi = gsi_last_bb (guard_bb);
1284   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1285
1286   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1287   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1288
1289   new_e->probability = probability;
1290   if (irreducible_p)
1291     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1292
1293   enter_e->probability = probability.invert ();
1294   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1295
1296   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1297   if (enter_e->dest->loop_father->header == enter_e->dest)
1298     split_edge (enter_e);
1299
1300   return new_e;
1301 }
1302
1303
1304 /* This function verifies that the following restrictions apply to LOOP:
1305    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1306        for innermost loop and 5 basic blocks for outer-loop.
1307    (2) it is single entry, single exit
1308    (3) its exit condition is the last stmt in the header
1309    (4) E is the entry/exit edge of LOOP.
1310  */
1311
1312 bool
1313 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1314 {
1315   edge exit_e = single_exit (loop);
1316   edge entry_e = loop_preheader_edge (loop);
1317   gcond *orig_cond = get_loop_exit_condition (loop);
1318   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1319   unsigned int num_bb = loop->inner? 5 : 2;
1320
1321   /* All loops have an outer scope; the only case loop->outer is NULL is for
1322      the function itself.  */
1323   if (!loop_outer (loop)
1324       || loop->num_nodes != num_bb
1325       || !empty_block_p (loop->latch)
1326       || !single_exit (loop)
1327       /* Verify that new loop exit condition can be trivially modified.  */
1328       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1329       || (e != exit_e && e != entry_e))
1330     return false;
1331
1332   return true;
1333 }
1334
1335 /* Function vect_get_loop_location.
1336
1337    Extract the location of the loop in the source code.
1338    If the loop is not well formed for vectorization, an estimated
1339    location is calculated.
1340    Return the loop location if succeed and NULL if not.  */
1341
1342 dump_user_location_t
1343 find_loop_location (class loop *loop)
1344 {
1345   gimple *stmt = NULL;
1346   basic_block bb;
1347   gimple_stmt_iterator si;
1348
1349   if (!loop)
1350     return dump_user_location_t ();
1351
1352   stmt = get_loop_exit_condition (loop);
1353
1354   if (stmt
1355       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1356     return stmt;
1357
1358   /* If we got here the loop is probably not "well formed",
1359      try to estimate the loop location */
1360
1361   if (!loop->header)
1362     return dump_user_location_t ();
1363
1364   bb = loop->header;
1365
1366   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1367     {
1368       stmt = gsi_stmt (si);
1369       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1370         return stmt;
1371     }
1372
1373   return dump_user_location_t ();
1374 }
1375
1376 /* Return true if the phi described by STMT_INFO defines an IV of the
1377    loop to be vectorized.  */
1378
1379 static bool
1380 iv_phi_p (stmt_vec_info stmt_info)
1381 {
1382   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1383   if (virtual_operand_p (PHI_RESULT (phi)))
1384     return false;
1385
1386   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1387       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1388     return false;
1389
1390   return true;
1391 }
1392
1393 /* Function vect_can_advance_ivs_p
1394
1395    In case the number of iterations that LOOP iterates is unknown at compile
1396    time, an epilog loop will be generated, and the loop induction variables
1397    (IVs) will be "advanced" to the value they are supposed to take just before
1398    the epilog loop.  Here we check that the access function of the loop IVs
1399    and the expression that represents the loop bound are simple enough.
1400    These restrictions will be relaxed in the future.  */
1401
1402 bool
1403 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1404 {
1405   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1406   basic_block bb = loop->header;
1407   gphi_iterator gsi;
1408
1409   /* Analyze phi functions of the loop header.  */
1410
1411   if (dump_enabled_p ())
1412     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1413   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1414     {
1415       tree evolution_part;
1416
1417       gphi *phi = gsi.phi ();
1418       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1419       if (dump_enabled_p ())
1420         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1421                          phi_info->stmt);
1422
1423       /* Skip virtual phi's. The data dependences that are associated with
1424          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1425
1426          Skip reduction phis.  */
1427       if (!iv_phi_p (phi_info))
1428         {
1429           if (dump_enabled_p ())
1430             dump_printf_loc (MSG_NOTE, vect_location,
1431                              "reduc or virtual phi. skip.\n");
1432           continue;
1433         }
1434
1435       /* Analyze the evolution function.  */
1436
1437       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1438       if (evolution_part == NULL_TREE)
1439         {
1440           if (dump_enabled_p ())
1441             dump_printf (MSG_MISSED_OPTIMIZATION,
1442                          "No access function or evolution.\n");
1443           return false;
1444         }
1445
1446       /* FORNOW: We do not transform initial conditions of IVs
1447          which evolution functions are not invariants in the loop.  */
1448
1449       if (!expr_invariant_in_loop_p (loop, evolution_part))
1450         {
1451           if (dump_enabled_p ())
1452             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453                              "evolution not invariant in loop.\n");
1454           return false;
1455         }
1456
1457       /* FORNOW: We do not transform initial conditions of IVs
1458          which evolution functions are a polynomial of degree >= 2.  */
1459
1460       if (tree_is_chrec (evolution_part))
1461         {
1462           if (dump_enabled_p ())
1463             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464                              "evolution is chrec.\n");
1465           return false;
1466         }
1467     }
1468
1469   return true;
1470 }
1471
1472
1473 /*   Function vect_update_ivs_after_vectorizer.
1474
1475      "Advance" the induction variables of LOOP to the value they should take
1476      after the execution of LOOP.  This is currently necessary because the
1477      vectorizer does not handle induction variables that are used after the
1478      loop.  Such a situation occurs when the last iterations of LOOP are
1479      peeled, because:
1480      1. We introduced new uses after LOOP for IVs that were not originally used
1481         after LOOP: the IVs of LOOP are now used by an epilog loop.
1482      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1483         times, whereas the loop IVs should be bumped N times.
1484
1485      Input:
1486      - LOOP - a loop that is going to be vectorized. The last few iterations
1487               of LOOP were peeled.
1488      - NITERS - the number of iterations that LOOP executes (before it is
1489                 vectorized). i.e, the number of times the ivs should be bumped.
1490      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1491                   coming out from LOOP on which there are uses of the LOOP ivs
1492                   (this is the path from LOOP->exit to epilog_loop->preheader).
1493
1494                   The new definitions of the ivs are placed in LOOP->exit.
1495                   The phi args associated with the edge UPDATE_E in the bb
1496                   UPDATE_E->dest are updated accordingly.
1497
1498      Assumption 1: Like the rest of the vectorizer, this function assumes
1499      a single loop exit that has a single predecessor.
1500
1501      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1502      organized in the same order.
1503
1504      Assumption 3: The access function of the ivs is simple enough (see
1505      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1506
1507      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1508      coming out of LOOP on which the ivs of LOOP are used (this is the path
1509      that leads to the epilog loop; other paths skip the epilog loop).  This
1510      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1511      needs to have its phis updated.
1512  */
1513
1514 static void
1515 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1516                                   tree niters, edge update_e)
1517 {
1518   gphi_iterator gsi, gsi1;
1519   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520   basic_block update_bb = update_e->dest;
1521   basic_block exit_bb = single_exit (loop)->dest;
1522
1523   /* Make sure there exists a single-predecessor exit bb:  */
1524   gcc_assert (single_pred_p (exit_bb));
1525   gcc_assert (single_succ_edge (exit_bb) == update_e);
1526
1527   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1528        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1529        gsi_next (&gsi), gsi_next (&gsi1))
1530     {
1531       tree init_expr;
1532       tree step_expr, off;
1533       tree type;
1534       tree var, ni, ni_name;
1535       gimple_stmt_iterator last_gsi;
1536
1537       gphi *phi = gsi.phi ();
1538       gphi *phi1 = gsi1.phi ();
1539       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1540       if (dump_enabled_p ())
1541         dump_printf_loc (MSG_NOTE, vect_location,
1542                          "vect_update_ivs_after_vectorizer: phi: %G",
1543                          (gimple *) phi);
1544
1545       /* Skip reduction and virtual phis.  */
1546       if (!iv_phi_p (phi_info))
1547         {
1548           if (dump_enabled_p ())
1549             dump_printf_loc (MSG_NOTE, vect_location,
1550                              "reduc or virtual phi. skip.\n");
1551           continue;
1552         }
1553
1554       type = TREE_TYPE (gimple_phi_result (phi));
1555       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1556       step_expr = unshare_expr (step_expr);
1557
1558       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1559          of degree >= 2 or exponential.  */
1560       gcc_assert (!tree_is_chrec (step_expr));
1561
1562       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1563       gimple_seq stmts = NULL;
1564       enum vect_induction_op_type induction_type
1565         = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1566
1567       if (induction_type == vect_step_op_add)
1568         {
1569           off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1570                              fold_convert (TREE_TYPE (step_expr), niters),
1571                              step_expr);
1572           if (POINTER_TYPE_P (type))
1573             ni = fold_build_pointer_plus (init_expr, off);
1574           else
1575             ni = fold_build2 (PLUS_EXPR, type,
1576                               init_expr, fold_convert (type, off));
1577         }
1578       /* Don't bother call vect_peel_nonlinear_iv_init.  */
1579       else if (induction_type == vect_step_op_neg)
1580         ni = init_expr;
1581       else
1582         ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
1583                                           niters, step_expr,
1584                                           induction_type);
1585
1586       var = create_tmp_var (type, "tmp");
1587
1588       last_gsi = gsi_last_bb (exit_bb);
1589       gimple_seq new_stmts = NULL;
1590       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1591       /* Exit_bb shouldn't be empty.  */
1592       if (!gsi_end_p (last_gsi))
1593         {
1594           gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
1595           gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1596         }
1597       else
1598         {
1599           gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
1600           gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1601         }
1602
1603       /* Fix phi expressions in the successor bb.  */
1604       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1605     }
1606 }
1607
1608 /* Return a gimple value containing the misalignment (measured in vector
1609    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1610    it is away from a perfectly aligned address.  Add any new statements
1611    to SEQ.  */
1612
1613 static tree
1614 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1615 {
1616   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1617   stmt_vec_info stmt_info = dr_info->stmt;
1618   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1619
1620   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1621   unsigned HOST_WIDE_INT target_align_c;
1622   tree target_align_minus_1;
1623
1624   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1625                                         size_zero_node) < 0;
1626   tree offset = (negative
1627                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1628                              * TREE_INT_CST_LOW
1629                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1630                  : size_zero_node);
1631   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1632                                                           stmt_info, seq,
1633                                                           offset);
1634   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1635   if (target_align.is_constant (&target_align_c))
1636     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1637   else
1638     {
1639       tree vla = build_int_cst (type, target_align);
1640       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1641                                     fold_build2 (MINUS_EXPR, type,
1642                                                  build_int_cst (type, 0), vla));
1643       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1644                                           build_int_cst (type, 1));
1645     }
1646
1647   HOST_WIDE_INT elem_size
1648     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1649   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1650
1651   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1652   tree int_start_addr = fold_convert (type, start_addr);
1653   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1654                                         target_align_minus_1);
1655
1656   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1657   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1658                                         elem_size_log);
1659
1660   return misalign_in_elems;
1661 }
1662
1663 /* Function vect_gen_prolog_loop_niters
1664
1665    Generate the number of iterations which should be peeled as prolog for the
1666    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1667    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1668    As a result, after the execution of this loop, the data reference DR will
1669    refer to an aligned location.  The following computation is generated:
1670
1671    If the misalignment of DR is known at compile time:
1672      addr_mis = int mis = DR_MISALIGNMENT (dr);
1673    Else, compute address misalignment in bytes:
1674      addr_mis = addr & (target_align - 1)
1675
1676    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1677
1678    (elem_size = element type size; an element is the scalar element whose type
1679    is the inner type of the vectype)
1680
1681    The computations will be emitted at the end of BB.  We also compute and
1682    store upper bound (included) of the result in BOUND.
1683
1684    When the step of the data-ref in the loop is not 1 (as in interleaved data
1685    and SLP), the number of iterations of the prolog must be divided by the step
1686    (which is equal to the size of interleaved group).
1687
1688    The above formulas assume that VF == number of elements in the vector. This
1689    may not hold when there are multiple-types in the loop.
1690    In this case, for some data-references in the loop the VF does not represent
1691    the number of elements that fit in the vector.  Therefore, instead of VF we
1692    use TYPE_VECTOR_SUBPARTS.  */
1693
1694 static tree
1695 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1696                              basic_block bb, int *bound)
1697 {
1698   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1699   tree var;
1700   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1701   gimple_seq stmts = NULL, new_stmts = NULL;
1702   tree iters, iters_name;
1703   stmt_vec_info stmt_info = dr_info->stmt;
1704   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1705   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1706
1707   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1708     {
1709       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1710
1711       if (dump_enabled_p ())
1712         dump_printf_loc (MSG_NOTE, vect_location,
1713                          "known peeling = %d.\n", npeel);
1714
1715       iters = build_int_cst (niters_type, npeel);
1716       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1717     }
1718   else
1719     {
1720       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1721       tree type = TREE_TYPE (misalign_in_elems);
1722       HOST_WIDE_INT elem_size
1723         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1724       /* We only do prolog peeling if the target alignment is known at compile
1725          time.  */
1726       poly_uint64 align_in_elems =
1727         exact_div (target_align, elem_size);
1728       tree align_in_elems_minus_1 =
1729         build_int_cst (type, align_in_elems - 1);
1730       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1731
1732       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1733                                  & (align_in_elems - 1)).  */
1734       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1735                                             size_zero_node) < 0;
1736       if (negative)
1737         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1738                              align_in_elems_tree);
1739       else
1740         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1741                              misalign_in_elems);
1742       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1743       iters = fold_convert (niters_type, iters);
1744       unsigned HOST_WIDE_INT align_in_elems_c;
1745       if (align_in_elems.is_constant (&align_in_elems_c))
1746         *bound = align_in_elems_c - 1;
1747       else
1748         *bound = -1;
1749     }
1750
1751   if (dump_enabled_p ())
1752     dump_printf_loc (MSG_NOTE, vect_location,
1753                      "niters for prolog loop: %T\n", iters);
1754
1755   var = create_tmp_var (niters_type, "prolog_loop_niters");
1756   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1757
1758   if (new_stmts)
1759     gimple_seq_add_seq (&stmts, new_stmts);
1760   if (stmts)
1761     {
1762       gcc_assert (single_succ_p (bb));
1763       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1764       if (gsi_end_p (gsi))
1765         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1766       else
1767         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1768     }
1769   return iters_name;
1770 }
1771
1772
1773 /* Function vect_update_init_of_dr
1774
1775    If CODE is PLUS, the vector loop starts NITERS iterations after the
1776    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1777    iterations before the scalar one (using masking to skip inactive
1778    elements).  This function updates the information recorded in DR to
1779    account for the difference.  Specifically, it updates the OFFSET
1780    field of DR_INFO.  */
1781
1782 static void
1783 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1784 {
1785   struct data_reference *dr = dr_info->dr;
1786   tree offset = dr_info->offset;
1787   if (!offset)
1788     offset = build_zero_cst (sizetype);
1789
1790   niters = fold_build2 (MULT_EXPR, sizetype,
1791                         fold_convert (sizetype, niters),
1792                         fold_convert (sizetype, DR_STEP (dr)));
1793   offset = fold_build2 (code, sizetype,
1794                         fold_convert (sizetype, offset), niters);
1795   dr_info->offset = offset;
1796 }
1797
1798
1799 /* Function vect_update_inits_of_drs
1800
1801    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1802    CODE and NITERS are as for vect_update_inits_of_dr.  */
1803
1804 void
1805 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1806                           tree_code code)
1807 {
1808   unsigned int i;
1809   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1810   struct data_reference *dr;
1811
1812   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1813
1814   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1815      here, but since we might use these niters to update the epilogues niters
1816      and data references we can't insert them here as this definition might not
1817      always dominate its uses.  */
1818   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1819     niters = fold_convert (sizetype, niters);
1820
1821   FOR_EACH_VEC_ELT (datarefs, i, dr)
1822     {
1823       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1824       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1825           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1826         vect_update_init_of_dr (dr_info, niters, code);
1827     }
1828 }
1829
1830 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1831    by masking.  This involves calculating the number of iterations to
1832    be peeled and then aligning all memory references appropriately.  */
1833
1834 void
1835 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1836 {
1837   tree misalign_in_elems;
1838   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1839
1840   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1841
1842   /* From the information recorded in LOOP_VINFO get the number of iterations
1843      that need to be skipped via masking.  */
1844   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1845     {
1846       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1847                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1848       misalign_in_elems = build_int_cst (type, misalign);
1849     }
1850   else
1851     {
1852       gimple_seq seq1 = NULL, seq2 = NULL;
1853       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1854       misalign_in_elems = fold_convert (type, misalign_in_elems);
1855       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1856                                                 &seq2, true, NULL_TREE);
1857       gimple_seq_add_seq (&seq1, seq2);
1858       if (seq1)
1859         {
1860           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1861           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1862           gcc_assert (!new_bb);
1863         }
1864     }
1865
1866   if (dump_enabled_p ())
1867     dump_printf_loc (MSG_NOTE, vect_location,
1868                      "misalignment for fully-masked loop: %T\n",
1869                      misalign_in_elems);
1870
1871   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1872
1873   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1874 }
1875
1876 /* This function builds ni_name = number of iterations.  Statements
1877    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1878    it to TRUE if new ssa_var is generated.  */
1879
1880 tree
1881 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1882 {
1883   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1884   if (TREE_CODE (ni) == INTEGER_CST)
1885     return ni;
1886   else
1887     {
1888       tree ni_name, var;
1889       gimple_seq stmts = NULL;
1890       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1891
1892       var = create_tmp_var (TREE_TYPE (ni), "niters");
1893       ni_name = force_gimple_operand (ni, &stmts, false, var);
1894       if (stmts)
1895         {
1896           gsi_insert_seq_on_edge_immediate (pe, stmts);
1897           if (new_var_p != NULL)
1898             *new_var_p = true;
1899         }
1900
1901       return ni_name;
1902     }
1903 }
1904
1905 /* Calculate the number of iterations above which vectorized loop will be
1906    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1907    of prolog loop.  If it's integer const, the integer number is also passed
1908    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1909    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1910    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1911    threshold below which the scalar (rather than vectorized) loop will be
1912    executed.  This function stores the upper bound (inclusive) of the result
1913    in BOUND_SCALAR.  */
1914
1915 static tree
1916 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1917                              int bound_prolog, poly_int64 bound_epilog, int th,
1918                              poly_uint64 *bound_scalar,
1919                              bool check_profitability)
1920 {
1921   tree type = TREE_TYPE (niters_prolog);
1922   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1923                              build_int_cst (type, bound_epilog));
1924
1925   *bound_scalar = bound_prolog + bound_epilog;
1926   if (check_profitability)
1927     {
1928       /* TH indicates the minimum niters of vectorized loop, while we
1929          compute the maximum niters of scalar loop.  */
1930       th--;
1931       /* Peeling for constant times.  */
1932       if (int_niters_prolog >= 0)
1933         {
1934           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1935           return build_int_cst (type, *bound_scalar);
1936         }
1937       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1938          and BOUND_EPILOG are inclusive upper bounds.  */
1939       if (known_ge (th, bound_prolog + bound_epilog))
1940         {
1941           *bound_scalar = th;
1942           return build_int_cst (type, th);
1943         }
1944       /* Need to do runtime comparison.  */
1945       else if (maybe_gt (th, bound_epilog))
1946         {
1947           *bound_scalar = upper_bound (*bound_scalar, th);
1948           return fold_build2 (MAX_EXPR, type,
1949                               build_int_cst (type, th), niters);
1950         }
1951     }
1952   return niters;
1953 }
1954
1955 /* NITERS is the number of times that the original scalar loop executes
1956    after peeling.  Work out the maximum number of iterations N that can
1957    be handled by the vectorized form of the loop and then either:
1958
1959    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1960
1961         niters_vector = N
1962
1963    b) set *STEP_VECTOR_PTR to one and generate:
1964
1965         niters_vector = N / vf
1966
1967    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1968    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1969    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1970
1971 void
1972 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1973                              tree *niters_vector_ptr, tree *step_vector_ptr,
1974                              bool niters_no_overflow)
1975 {
1976   tree ni_minus_gap, var;
1977   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1978   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1980   tree log_vf = NULL_TREE;
1981
1982   /* If epilogue loop is required because of data accesses with gaps, we
1983      subtract one iteration from the total number of iterations here for
1984      correct calculation of RATIO.  */
1985   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1986     {
1987       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1988                                   build_one_cst (type));
1989       if (!is_gimple_val (ni_minus_gap))
1990         {
1991           var = create_tmp_var (type, "ni_gap");
1992           gimple *stmts = NULL;
1993           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1994                                                true, var);
1995           gsi_insert_seq_on_edge_immediate (pe, stmts);
1996         }
1997     }
1998   else
1999     ni_minus_gap = niters;
2000
2001   /* To silence some unexpected warnings, simply initialize to 0. */
2002   unsigned HOST_WIDE_INT const_vf = 0;
2003   if (vf.is_constant (&const_vf)
2004       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2005     {
2006       /* Create: niters >> log2(vf) */
2007       /* If it's known that niters == number of latch executions + 1 doesn't
2008          overflow, we can generate niters >> log2(vf); otherwise we generate
2009          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2010          will be at least one.  */
2011       log_vf = build_int_cst (type, exact_log2 (const_vf));
2012       if (niters_no_overflow)
2013         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2014       else
2015         niters_vector
2016           = fold_build2 (PLUS_EXPR, type,
2017                          fold_build2 (RSHIFT_EXPR, type,
2018                                       fold_build2 (MINUS_EXPR, type,
2019                                                    ni_minus_gap,
2020                                                    build_int_cst (type, vf)),
2021                                       log_vf),
2022                          build_int_cst (type, 1));
2023       step_vector = build_one_cst (type);
2024     }
2025   else
2026     {
2027       niters_vector = ni_minus_gap;
2028       step_vector = build_int_cst (type, vf);
2029     }
2030
2031   if (!is_gimple_val (niters_vector))
2032     {
2033       var = create_tmp_var (type, "bnd");
2034       gimple_seq stmts = NULL;
2035       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2036       gsi_insert_seq_on_edge_immediate (pe, stmts);
2037       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2038          we set range information to make niters analyzer's life easier.
2039          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2040          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2041       if (stmts != NULL && log_vf)
2042         {
2043           if (niters_no_overflow)
2044             {
2045               value_range vr (type,
2046                               wi::one (TYPE_PRECISION (type)),
2047                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2048                                                          TYPE_SIGN (type)),
2049                                           exact_log2 (const_vf),
2050                                           TYPE_SIGN (type)));
2051               set_range_info (niters_vector, vr);
2052             }
2053           /* For VF == 1 the vector IV might also overflow so we cannot
2054              assert a minimum value of 1.  */
2055           else if (const_vf > 1)
2056             {
2057               value_range vr (type,
2058                               wi::one (TYPE_PRECISION (type)),
2059                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2060                                                          TYPE_SIGN (type))
2061                                           - (const_vf - 1),
2062                                           exact_log2 (const_vf), TYPE_SIGN (type))
2063                               + 1);
2064               set_range_info (niters_vector, vr);
2065             }
2066         }
2067     }
2068   *niters_vector_ptr = niters_vector;
2069   *step_vector_ptr = step_vector;
2070
2071   return;
2072 }
2073
2074 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2075    loop specified by LOOP_VINFO after vectorization, compute the number
2076    of iterations before vectorization (niters_vector * vf) and store it
2077    to NITERS_VECTOR_MULT_VF_PTR.  */
2078
2079 static void
2080 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2081                                      tree niters_vector,
2082                                      tree *niters_vector_mult_vf_ptr)
2083 {
2084   /* We should be using a step_vector of VF if VF is variable.  */
2085   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2086   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2087   tree type = TREE_TYPE (niters_vector);
2088   tree log_vf = build_int_cst (type, exact_log2 (vf));
2089   basic_block exit_bb = single_exit (loop)->dest;
2090
2091   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2092   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2093                                             niters_vector, log_vf);
2094   if (!is_gimple_val (niters_vector_mult_vf))
2095     {
2096       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2097       gimple_seq stmts = NULL;
2098       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2099                                                     &stmts, true, var);
2100       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2101       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2102     }
2103   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2104 }
2105
2106 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2107    this function searches for the corresponding lcssa phi node in exit
2108    bb of LOOP.  If it is found, return the phi result; otherwise return
2109    NULL.  */
2110
2111 static tree
2112 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2113                 gphi *lcssa_phi)
2114 {
2115   gphi_iterator gsi;
2116   edge e = single_exit (loop);
2117
2118   gcc_assert (single_pred_p (e->dest));
2119   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2120     {
2121       gphi *phi = gsi.phi ();
2122       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2123                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2124         return PHI_RESULT (phi);
2125     }
2126   return NULL_TREE;
2127 }
2128
2129 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2130    from SECOND/FIRST and puts it at the original loop's preheader/exit
2131    edge, the two loops are arranged as below:
2132
2133        preheader_a:
2134      first_loop:
2135        header_a:
2136          i_1 = PHI<i_0, i_2>;
2137          ...
2138          i_2 = i_1 + 1;
2139          if (cond_a)
2140            goto latch_a;
2141          else
2142            goto between_bb;
2143        latch_a:
2144          goto header_a;
2145
2146        between_bb:
2147          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2148
2149      second_loop:
2150        header_b:
2151          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2152                                  or with i_2 if no LCSSA phi is created
2153                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2154          ...
2155          i_4 = i_3 + 1;
2156          if (cond_b)
2157            goto latch_b;
2158          else
2159            goto exit_bb;
2160        latch_b:
2161          goto header_b;
2162
2163        exit_bb:
2164
2165    This function creates loop closed SSA for the first loop; update the
2166    second loop's PHI nodes by replacing argument on incoming edge with the
2167    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2168    is false, Loop closed ssa phis will only be created for non-iv phis for
2169    the first loop.
2170
2171    This function assumes exit bb of the first loop is preheader bb of the
2172    second loop, i.e, between_bb in the example code.  With PHIs updated,
2173    the second loop will execute rest iterations of the first.  */
2174
2175 static void
2176 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2177                                    class loop *first, class loop *second,
2178                                    bool create_lcssa_for_iv_phis)
2179 {
2180   gphi_iterator gsi_update, gsi_orig;
2181   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2182
2183   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2184   edge second_preheader_e = loop_preheader_edge (second);
2185   basic_block between_bb = single_exit (first)->dest;
2186
2187   gcc_assert (between_bb == second_preheader_e->src);
2188   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2189   /* Either the first loop or the second is the loop to be vectorized.  */
2190   gcc_assert (loop == first || loop == second);
2191
2192   for (gsi_orig = gsi_start_phis (first->header),
2193        gsi_update = gsi_start_phis (second->header);
2194        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2195        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2196     {
2197       gphi *orig_phi = gsi_orig.phi ();
2198       gphi *update_phi = gsi_update.phi ();
2199
2200       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2201       /* Generate lcssa PHI node for the first loop.  */
2202       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2203       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2204       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2205         {
2206           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2207           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2208           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2209           arg = new_res;
2210         }
2211
2212       /* Update PHI node in the second loop by replacing arg on the loop's
2213          incoming edge.  */
2214       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2215     }
2216
2217   /* For epilogue peeling we have to make sure to copy all LC PHIs
2218      for correct vectorization of live stmts.  */
2219   if (loop == first)
2220     {
2221       basic_block orig_exit = single_exit (second)->dest;
2222       for (gsi_orig = gsi_start_phis (orig_exit);
2223            !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2224         {
2225           gphi *orig_phi = gsi_orig.phi ();
2226           tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2227           if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2228             continue;
2229
2230           /* Already created in the above loop.   */
2231           if (find_guard_arg (first, second, orig_phi))
2232             continue;
2233
2234           tree new_res = copy_ssa_name (orig_arg);
2235           gphi *lcphi = create_phi_node (new_res, between_bb);
2236           add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2237         }
2238     }
2239 }
2240
2241 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2242    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2243    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2244    appear like below:
2245
2246        guard_bb:
2247          if (cond)
2248            goto merge_bb;
2249          else
2250            goto skip_loop;
2251
2252      skip_loop:
2253        header_a:
2254          i_1 = PHI<i_0, i_2>;
2255          ...
2256          i_2 = i_1 + 1;
2257          if (cond_a)
2258            goto latch_a;
2259          else
2260            goto exit_a;
2261        latch_a:
2262          goto header_a;
2263
2264        exit_a:
2265          i_5 = PHI<i_2>;
2266
2267        merge_bb:
2268          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2269
2270      update_loop:
2271        header_b:
2272          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2273          ...
2274          i_4 = i_3 + 1;
2275          if (cond_b)
2276            goto latch_b;
2277          else
2278            goto exit_bb;
2279        latch_b:
2280          goto header_b;
2281
2282        exit_bb:
2283
2284    This function creates PHI nodes at merge_bb and replaces the use of i_5
2285    in the update_loop's PHI node with the result of new PHI result.  */
2286
2287 static void
2288 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2289                                     class loop *update_loop,
2290                                     edge guard_edge, edge merge_edge)
2291 {
2292   basic_block merge_bb = guard_edge->dest;
2293   edge skip_pe = loop_preheader_edge (skip_loop);
2294   edge upd_pe = loop_preheader_edge (update_loop);
2295   gphi_iterator skip_gsi = gsi_start_phis (skip_loop->header);
2296   gphi_iterator upd_gsi = gsi_start_phis (update_loop->header);
2297
2298   /* Pair up the header PHIs of both loops; stop when either runs out.  */
2299   for (; !gsi_end_p (skip_gsi) && !gsi_end_p (upd_gsi);
2300        gsi_next (&skip_gsi), gsi_next (&upd_gsi))
2301     {
2302       gphi *skip_phi = skip_gsi.phi ();
2303       gphi *update_phi = upd_gsi.phi ();
2304       /* The new PHI is created in the merge bb (GUARD_EDGE's destination).  */
2305       tree merged = copy_ssa_name (PHI_RESULT (skip_phi));
2306       gphi *merge_phi = create_phi_node (merged, merge_bb);
2307
2308       /* Over MERGE_EDGE it takes UPDATE_PHI's current preheader argument.  */
2309       tree val = PHI_ARG_DEF_FROM_EDGE (update_phi, upd_pe);
2310       location_t loc = gimple_phi_arg_location_from_edge (update_phi, upd_pe);
2311       add_phi_arg (merge_phi, val, merge_edge, loc);
2312
2313       /* Over GUARD_EDGE it takes the entry argument of SKIP_LOOP's PHI.  */
2314       val = PHI_ARG_DEF_FROM_EDGE (skip_phi, skip_pe);
2315       loc = gimple_phi_arg_location_from_edge (skip_phi, skip_pe);
2316       add_phi_arg (merge_phi, val, guard_edge, loc);
2317
2318       /* Redirect UPDATE_PHI's preheader argument to the new PHI result.  */
2319       adjust_phi_and_debug_stmts (update_phi, upd_pe, merged);
2320     }
2321 }
2322
2323 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2324    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2325    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2326    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2327    The CFG looks like:
2328
2329      loop:
2330        header_a:
2331          i_1 = PHI<i_0, i_2>;
2332          ...
2333          i_2 = i_1 + 1;
2334          if (cond_a)
2335            goto latch_a;
2336          else
2337            goto exit_a;
2338        latch_a:
2339          goto header_a;
2340
2341        exit_a:
2342
2343        guard_bb:
2344          if (cond)
2345            goto merge_bb;
2346          else
2347            goto epilog_loop;
2348
2349        ;; fall_through_bb
2350
2351      epilog_loop:
2352        header_b:
2353          i_3 = PHI<i_2, i_4>;
2354          ...
2355          i_4 = i_3 + 1;
2356          if (cond_b)
2357            goto latch_b;
2358          else
2359            goto merge_bb;
2360        latch_b:
2361          goto header_b;
2362
2363        merge_bb:
2364          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2365
2366        exit_bb:
2367          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2368
2369    For each name used outside EPILOG (i.e. for each name that has an lcssa
2370    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2371    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2372    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2373    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2374    in exit_bb will also be updated.  */
2375
2376 static void
2377 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2378                                     edge guard_edge, edge merge_edge)
2379 {
2380   basic_block merge_bb = guard_edge->dest;
2381
2382   /* MERGE_BB must have a single successor, the exit bb, whose only
2383      predecessor is the destination of EPILOG's exit edge.  */
2384   gcc_assert (single_succ_p (merge_bb));
2385   edge to_exit = single_succ_edge (merge_bb);
2386   basic_block exit_bb = to_exit->dest;
2387   gcc_assert (single_pred_p (exit_bb));
2388   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2389
2390   /* Give every PHI in the exit bb a counterpart in MERGE_BB.  */
2391   for (gphi_iterator it = gsi_start_phis (exit_bb); !gsi_end_p (it);
2392        gsi_next (&it))
2393     {
2394       gphi *exit_phi = it.phi ();
2395       tree old_arg = PHI_ARG_DEF (exit_phi, 0);
2396
2397       /* Value for MERGE_EDGE: the current_def of an SSA_NAME argument when
2398          one is recorded.  A constant, or a name without a current_def, is
2399          passed through unchanged.  */
2400       tree merge_arg = (TREE_CODE (old_arg) == SSA_NAME
2401                         ? get_current_def (old_arg) : NULL_TREE);
2402       if (!merge_arg)
2403         merge_arg = old_arg;
2404
2405       /* Value for GUARD_EDGE: whatever find_guard_arg locates for this PHI.
2406          A null result (a var that is live after the loop but is not a
2407          reduction) falls back to the old arg.  */
2408       tree guard_arg = find_guard_arg (loop, epilog, exit_phi);
2409       if (!guard_arg)
2410         guard_arg = old_arg;
2411
2412       /* Build the PHI in MERGE_BB, which has two incoming edges:
2413          MERGE_EDGE and GUARD_EDGE.  */
2414       tree merged = copy_ssa_name (PHI_RESULT (exit_phi));
2415       gphi *merge_phi = create_phi_node (merged, merge_bb);
2416       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2417       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2418
2419       /* The original PHI in the exit bb now uses the new PHI result.  */
2420       adjust_phi_and_debug_stmts (exit_phi, to_exit, merged);
2421     }
2422 }
2423
2424 /* EPILOG was duplicated from the original loop for vectorizing.  Update
2425    (via rename_use_op) the arg of each loop closed ssa PHI in its exit bb.  */
2426
2427 static void
2428 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2429 {
2430   basic_block exit_bb = single_exit (epilog)->dest;
2431   gcc_assert (single_pred_p (exit_bb));
2432   /* EXIT_BB has a single predecessor, so pred edge 0 is its only entry.  */
2433   edge in_e = EDGE_PRED (exit_bb, 0);
2434   gphi_iterator it = gsi_start_phis (exit_bb);
2435   for (; !gsi_end_p (it); gsi_next (&it))
2436     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (it.phi (), in_e));
2437 }
2438
2439 /* CONST_NITERS is the exact number of iterations that the epilogue loop
2440    EPILOGUE_VINFO would need to execute.  Decide whether that epilogue loop
2441    should be used, installing CONST_NITERS in it and returning true if so.  */
2442
2443 static bool
2444 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2445                              unsigned HOST_WIDE_INT const_niters)
2446 {
2447   /* An epilogue that would run only a single scalar iteration is rejected,
2448      even if partial vectors could in principle implement it.  Testing
2449      this first also keeps CONST_NITERS - 1 below from wrapping around.  */
2450   unsigned int gap = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2451   if (const_niters <= gap + 1)
2452     return false;
2453
2454   /* Record both the iteration count and the latch count (one less),
2455      built in the type of the NITERS value installed so far.  */
2456   tree count_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2457   LOOP_VINFO_NITERS (epilogue_vinfo)
2458     = build_int_cst (count_type, const_niters);
2459   LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2460     = build_int_cst (count_type, const_niters - 1);
2461
2462   /* The final verdict depends on what can be done when the count is not
2463      a multiple of the epilogue loop's vectorization factor.  */
2464   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2465 }
2466
2467 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be
2468    skipped.  Create a PHI in the destination block of its main_loop_edge
2469    and return the PHI result, a value that equals:
2470    - MAIN_LOOP_VALUE when control arrives over main_loop_edge and
2471    - SKIP_VALUE when it arrives over skip_main_loop_edge.  */
2472
2473 tree
2474 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2475                            tree skip_value)
2476 {
2477   edge from_main = loop_vinfo->main_loop_edge;
2478   gcc_assert (from_main);
2479
2480   /* The result takes its type from MAIN_LOOP_VALUE.  */
2481   tree merged = make_ssa_name (TREE_TYPE (main_loop_value));
2482   gphi *phi = create_phi_node (merged, from_main->dest);
2483   edge from_skip = loop_vinfo->skip_main_loop_edge;
2484   add_phi_arg (phi, main_loop_value, from_main, UNKNOWN_LOCATION);
2485   add_phi_arg (phi, skip_value, from_skip, UNKNOWN_LOCATION);
2486   return merged;
2487 }
2488
2489 /* Function vect_do_peeling.
2490
2491    Input:
2492    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2493
2494        preheader:
2495      LOOP:
2496        header_bb:
2497          loop_body
2498          if (exit_loop_cond) goto exit_bb
2499          else                goto header_bb
2500        exit_bb:
2501
2502    - NITERS: The number of iterations of the loop.
2503    - NITERSM1: The number of iterations of the loop's latch.
2504    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2505    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2506                               CHECK_PROFITABILITY is true.
2507    Output:
2508    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2509      iterate after vectorization; see vect_set_loop_condition for details.
2510    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2511      should be set to the number of scalar iterations handled by the
2512      vector loop.  The SSA name is only used on exit from the loop.
2513
2514    This function peels prolog and epilog from the loop, adds guards skipping
2515    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2516    would look like:
2517
2518        guard_bb_1:
2519          if (prefer_scalar_loop) goto merge_bb_1
2520          else                    goto guard_bb_2
2521
2522        guard_bb_2:
2523          if (skip_prolog) goto merge_bb_2
2524          else             goto prolog_preheader
2525
2526        prolog_preheader:
2527      PROLOG:
2528        prolog_header_bb:
2529          prolog_body
2530          if (exit_prolog_cond) goto prolog_exit_bb
2531          else                  goto prolog_header_bb
2532        prolog_exit_bb:
2533
2534        merge_bb_2:
2535
2536        vector_preheader:
2537      VECTOR LOOP:
2538        vector_header_bb:
2539          vector_body
2540          if (exit_vector_cond) goto vector_exit_bb
2541          else                  goto vector_header_bb
2542        vector_exit_bb:
2543
2544        guard_bb_3:
2545          if (skip_epilog) goto merge_bb_3
2546          else             goto epilog_preheader
2547
2548        merge_bb_1:
2549
2550        epilog_preheader:
2551      EPILOG:
2552        epilog_header_bb:
2553          epilog_body
2554          if (exit_epilog_cond) goto merge_bb_3
2555          else                  goto epilog_header_bb
2556
2557        merge_bb_3:
2558
2559    Note this function peels prolog and epilog only if it's necessary,
2560    as well as guards.
2561    This function returns the epilogue loop if a decision was made to vectorize
2562    it, otherwise NULL.
2563
2564    The analysis resulting in this epilogue loop's loop_vec_info was performed
2565    in the same vect_analyze_loop call as the main loop's.  At that time
2566    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2567    vectorization factors than the main loop.  This list is stored in the main
2568    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide
2569    to vectorize the epilogue loop for a lower vectorization factor, the
2570    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2571    updated and linked to the epilogue loop.  This is later used to vectorize
2572    the epilogue.  The reason the loop_vec_info needs updating is that it was
2573    constructed based on the original main loop, and the epilogue loop is a
2574    copy of this loop, so all links pointing to statements in the original loop
2575    need updating.  Furthermore, these loop_vec_infos share the
2576    data_reference's records, which will also need to be updated.
2577
2578    TODO: Guard for prefer_scalar_loop should be emitted along with
2579    versioning conditions if loop versioning is needed.  */
2580
2581
2582 class loop *
2583 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2584                  tree *niters_vector, tree *step_vector,
2585                  tree *niters_vector_mult_vf_var, int th,
2586                  bool check_profitability, bool niters_no_overflow,
2587                  tree *advance)
2588 {
2589   edge e, guard_e;
2590   tree type = TREE_TYPE (niters), guard_cond;
2591   basic_block guard_bb, guard_to;
2592   profile_probability prob_prolog, prob_vector, prob_epilog;
2593   int estimated_vf;
2594   int prolog_peeling = 0;
2595   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2596   bool vect_epilogues_updated_niters = false;
2597   /* We currently do not support prolog peeling if the target alignment is not
2598      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2599      target alignment being constant.  */
2600   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2601   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2602     return NULL;
2603
2604   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2605     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2606
2607   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2608   poly_uint64 bound_epilog = 0;
2609   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2610       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2611     bound_epilog += vf - 1;
2612   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2613     bound_epilog += 1;
2614   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2615   poly_uint64 bound_scalar = bound_epilog;
2616
2617   if (!prolog_peeling && !epilog_peeling)
2618     return NULL;
2619
2620   /* Before doing any peeling make sure to reset debug binds outside of
2621      the loop referring to defs not in LC SSA.  */
2622   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2623   for (unsigned i = 0; i < loop->num_nodes; ++i)
2624     {
2625       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2626       imm_use_iterator ui;
2627       gimple *use_stmt;
2628       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2629            gsi_next (&gsi))
2630         {
2631           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2632             if (gimple_debug_bind_p (use_stmt)
2633                 && loop != gimple_bb (use_stmt)->loop_father
2634                 && !flow_loop_nested_p (loop,
2635                                         gimple_bb (use_stmt)->loop_father))
2636               {
2637                 gimple_debug_bind_reset_value (use_stmt);
2638                 update_stmt (use_stmt);
2639               }
2640         }
2641       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2642            gsi_next (&gsi))
2643         {
2644           ssa_op_iter op_iter;
2645           def_operand_p def_p;
2646           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2647             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2648               if (gimple_debug_bind_p (use_stmt)
2649                   && loop != gimple_bb (use_stmt)->loop_father
2650                   && !flow_loop_nested_p (loop,
2651                                           gimple_bb (use_stmt)->loop_father))
2652                 {
2653                   gimple_debug_bind_reset_value (use_stmt);
2654                   update_stmt (use_stmt);
2655                 }
2656         }
2657     }
2658
2659   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2660   estimated_vf = vect_vf_for_cost (loop_vinfo);
2661   if (estimated_vf == 2)
2662     estimated_vf = 3;
2663   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2664                         .apply_scale (estimated_vf - 1, estimated_vf);
2665
2666   class loop *prolog, *epilog = NULL;
2667   class loop *first_loop = loop;
2668   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2669
2670   /* SSA form needs to be up-to-date since we are going to manually
2671      update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2672      update SSA state after that, so we have to make sure to not lose any
2673      pending update needs.  */
2674   gcc_assert (!need_ssa_update_p (cfun));
2675
2676   /* If we're vectorizing an epilogue loop, we have ensured that the
2677      virtual operand is in SSA form throughout the vectorized main loop.
2678      Normally it is possible to trace the updated
2679      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2680      back to scalar-stmt vuses, meaning that the effect of the SSA update
2681      remains local to the main loop.  However, there are rare cases in
2682      which the vectorized loop should have vdefs even when the original scalar
2683      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2684      introduces clobbers of the temporary vector array, which in turn
2685      needs new vdefs.  If the scalar loop doesn't write to memory, these
2686      new vdefs will be the only ones in the vector loop.
2687      We are currently deferring the update of virtual SSA form and the
2688      creation of a virtual PHI for this case so we do not have to make sure
2689      the newly introduced virtual def is in LCSSA form.  */
2690
2691   if (MAY_HAVE_DEBUG_BIND_STMTS)
2692     {
2693       gcc_assert (!adjust_vec.exists ());
2694       adjust_vec.create (32);
2695     }
2696   initialize_original_copy_tables ();
2697
2698   /* Record the anchor bb at which the guard should be placed if the scalar
2699      loop might be preferred.  */
2700   basic_block anchor = loop_preheader_edge (loop)->src;
2701
2702   /* Generate the number of iterations for the prolog loop.  We do this here
2703      so that we can also get the upper bound on the number of iterations.  */
2704   tree niters_prolog;
2705   int bound_prolog = 0;
2706   if (prolog_peeling)
2707     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2708                                                   &bound_prolog);
2709   else
2710     niters_prolog = build_int_cst (type, 0);
2711
2712   loop_vec_info epilogue_vinfo = NULL;
2713   if (vect_epilogues)
2714     {
2715       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2716       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2717     }
2718
2719   tree niters_vector_mult_vf = NULL_TREE;
2720   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2721   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2722   edge update_e = NULL, skip_e = NULL;
2723   unsigned int lowest_vf = constant_lower_bound (vf);
2724   /* If we know the number of scalar iterations for the main loop we should
2725      check whether after the main loop there are enough iterations left over
2726      for the epilogue.  */
2727   if (vect_epilogues
2728       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2729       && prolog_peeling >= 0
2730       && known_eq (vf, lowest_vf))
2731     {
2732       unsigned HOST_WIDE_INT eiters
2733         = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2734            - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2735
2736       eiters -= prolog_peeling;
2737       eiters
2738         = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2739
2740       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2741         {
2742           delete epilogue_vinfo;
2743           epilogue_vinfo = NULL;
2744           if (loop_vinfo->epilogue_vinfos.length () == 0)
2745             {
2746               vect_epilogues = false;
2747               break;
2748             }
2749           epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2750           loop_vinfo->epilogue_vinfos.ordered_remove (0);
2751         }
2752       vect_epilogues_updated_niters = true;
2753     }
2754   /* Prolog loop may be skipped.  */
2755   bool skip_prolog = (prolog_peeling != 0);
2756   /* Skip this loop to epilog when there are not enough iterations to enter this
2757      vectorized loop.  If true we should perform runtime checks on the NITERS
2758      to check whether we should skip the current vectorized loop.  If we know
2759      the number of scalar iterations we add such a check only if that
2760      number may be smaller than the number of iterations required to
2761      enter this loop.  To decide this we compare the known number of
2762      iterations against the sum of the upper bounds on the prolog and
2763      epilog peeling (bound_prolog + bound_epilog).  When we
2764      don't know the number of iterations and don't require versioning it is
2765      because we have asserted that there are enough scalar iterations to enter
2766      the main loop, so this skip is not necessary.  When we are versioning then
2767      we only add such a skip if we have chosen to vectorize the epilogue.  */
2768   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2769                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2770                                   bound_prolog + bound_epilog)
2771                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2772                          || vect_epilogues));
2773   /* Epilog loop must be executed if the number of iterations for epilog
2774      loop is known at compile time, otherwise we need to add a check at
2775      the end of vector loop and skip to the end of epilog loop.  */
2776   bool skip_epilog = (prolog_peeling < 0
2777                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2778                       || !vf.is_constant ());
2779   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2780   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2781     skip_epilog = false;
2782
2783   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2784   auto_vec<profile_count> original_counts;
2785   basic_block *original_bbs = NULL;
2786
2787   if (skip_vector)
2788     {
2789       split_edge (loop_preheader_edge (loop));
2790
2791       if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2792         {
2793           original_bbs = get_loop_body (loop);
2794           for (unsigned int i = 0; i < loop->num_nodes; i++)
2795             original_counts.safe_push(original_bbs[i]->count);
2796         }
2797
2798       /* Due to the order in which we peel prolog and epilog, we first
2799          propagate probability to the whole loop.  The purpose is to
2800          avoid adjusting probabilities of both prolog and vector loops
2801          separately.  Note in this case, the probability of epilog loop
2802          needs to be scaled back later.  */
2803       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2804       if (prob_vector.initialized_p ())
2805         {
2806           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2807           scale_loop_profile (loop, prob_vector, 0);
2808         }
2809     }
2810
2811   dump_user_location_t loop_loc = find_loop_location (loop);
2812   if (vect_epilogues)
2813     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2814        use the original scalar loop as remaining epilogue if necessary.  */
2815     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2816       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2817
2818   if (prolog_peeling)
2819     {
2820       e = loop_preheader_edge (loop);
2821       if (!slpeel_can_duplicate_loop_p (loop, e))
2822         {
2823           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2824                            "loop can't be duplicated to preheader edge.\n");
2825           gcc_unreachable ();
2826         }
2827       /* Peel prolog and put it on preheader edge of loop.  */
2828       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2829       if (!prolog)
2830         {
2831           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2832                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2833           gcc_unreachable ();
2834         }
2835       prolog->force_vectorize = false;
2836       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2837       first_loop = prolog;
2838       reset_original_copy_tables ();
2839
2840       /* Update the number of iterations for prolog loop.  */
2841       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2842       vect_set_loop_condition (prolog, NULL, niters_prolog,
2843                                step_prolog, NULL_TREE, false);
2844
2845       /* Skip the prolog loop.  */
2846       if (skip_prolog)
2847         {
2848           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2849                                     niters_prolog, build_int_cst (type, 0));
2850           guard_bb = loop_preheader_edge (prolog)->src;
2851           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2852           guard_to = split_edge (loop_preheader_edge (loop));
2853           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2854                                            guard_to, guard_bb,
2855                                            prob_prolog.invert (),
2856                                            irred_flag);
2857           e = EDGE_PRED (guard_to, 0);
2858           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2859           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2860
2861           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2862           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2863         }
2864
2865       /* Update init address of DRs.  */
2866       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2867       /* Update niters for vector loop.  */
2868       LOOP_VINFO_NITERS (loop_vinfo)
2869         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2870       LOOP_VINFO_NITERSM1 (loop_vinfo)
2871         = fold_build2 (MINUS_EXPR, type,
2872                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2873       bool new_var_p = false;
2874       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2875       /* It's guaranteed that vector loop bound before vectorization is at
2876          least VF, so set range information for newly generated var.  */
2877       if (new_var_p)
2878         {
2879           value_range vr (type,
2880                           wi::to_wide (build_int_cst (type, vf)),
2881                           wi::to_wide (TYPE_MAX_VALUE (type)));
2882           set_range_info (niters, vr);
2883         }
2884
2885       /* Prolog iterates at most bound_prolog times, latch iterates at
2886          most bound_prolog - 1 times.  */
2887       record_niter_bound (prolog, bound_prolog - 1, false, true);
2888       delete_update_ssa ();
2889       adjust_vec_debug_stmts ();
2890       scev_reset ();
2891     }
2892
2893   if (epilog_peeling)
2894     {
2895       e = single_exit (loop);
2896       if (!slpeel_can_duplicate_loop_p (loop, e))
2897         {
2898           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2899                            "loop can't be duplicated to exit edge.\n");
2900           gcc_unreachable ();
2901         }
2902       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2903          said epilog then we should use a copy of the main loop as a starting
2904          point.  This loop may have already had some preliminary transformations
2905          to allow for more optimal vectorization, for example if-conversion.
2906          If we are not vectorizing the epilog then we should use the scalar loop
2907          as the transformations mentioned above make less or no sense when not
2908          vectorizing.  */
2909       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2910       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2911       if (!epilog)
2912         {
2913           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2914                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2915           gcc_unreachable ();
2916         }
2917       epilog->force_vectorize = false;
2918       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2919
2920       /* Scalar version loop may be preferred.  In this case, add guard
2921          and skip to epilog.  Note this only happens when the number of
2922          iterations of loop is unknown at compile time, otherwise this
2923          won't be vectorized.  */
2924       if (skip_vector)
2925         {
2926           /* Additional epilogue iteration is peeled if gap exists.  */
2927           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2928                                                 bound_prolog, bound_epilog,
2929                                                 th, &bound_scalar,
2930                                                 check_profitability);
2931           /* Build guard against NITERSM1 since NITERS may overflow.  */
2932           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2933           guard_bb = anchor;
2934           guard_to = split_edge (loop_preheader_edge (epilog));
2935           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2936                                            guard_to, guard_bb,
2937                                            prob_vector.invert (),
2938                                            irred_flag);
2939           skip_e = guard_e;
2940           e = EDGE_PRED (guard_to, 0);
2941           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2942           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2943
2944           /* Simply propagate profile info from guard_bb to guard_to which is
2945              a merge point of control flow.  */
2946           guard_to->count = guard_bb->count;
2947
2948           /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
2949           if (vect_epilogues || scalar_loop == NULL)
2950             {
2951               gcc_assert(epilog->num_nodes == loop->num_nodes);
2952               basic_block *bbs = get_loop_body (epilog);
2953               for (unsigned int i = 0; i < epilog->num_nodes; i++)
2954                 {
2955                   gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
2956                   bbs[i]->count = original_counts[i];
2957                 }
2958               free (bbs);
2959               free (original_bbs);
2960             }
2961         }
2962
2963       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2964       /* If loop is peeled for non-zero constant times, now niters refers to
2965          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2966          overflows.  */
2967       niters_no_overflow |= (prolog_peeling > 0);
2968       vect_gen_vector_loop_niters (loop_vinfo, niters,
2969                                    niters_vector, step_vector,
2970                                    niters_no_overflow);
2971       if (!integer_onep (*step_vector))
2972         {
2973           /* On exit from the loop we will have an easy way of calculating
2974              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2975              until then.  */
2976           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2977           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2978           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2979         }
2980       else
2981         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2982                                              &niters_vector_mult_vf);
2983       /* Update IVs of original loop as if they were advanced by
2984          niters_vector_mult_vf steps.  */
2985       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2986       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2987       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2988                                         update_e);
2989
2990       if (skip_epilog)
2991         {
2992           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2993                                     niters, niters_vector_mult_vf);
2994           guard_bb = single_exit (loop)->dest;
2995           guard_to = split_edge (single_exit (epilog));
2996           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2997                                            skip_vector ? anchor : guard_bb,
2998                                            prob_epilog.invert (),
2999                                            irred_flag);
3000           if (vect_epilogues)
3001             epilogue_vinfo->skip_this_loop_edge = guard_e;
3002           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3003                                               single_exit (epilog));
3004           /* Only need to handle basic block before epilog loop if it's not
3005              the guard_bb, which is the case when skip_vector is true.  */
3006           if (guard_bb != bb_before_epilog)
3007             {
3008               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3009
3010               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3011             }
3012           scale_loop_profile (epilog, prob_epilog, 0);
3013         }
3014       else
3015         slpeel_update_phi_nodes_for_lcssa (epilog);
3016
3017       unsigned HOST_WIDE_INT bound;
3018       if (bound_scalar.is_constant (&bound))
3019         {
3020           gcc_assert (bound != 0);
3021           /* -1 to convert loop iterations to latch iterations.  */
3022           record_niter_bound (epilog, bound - 1, false, true);
3023         }
3024
3025       delete_update_ssa ();
3026       adjust_vec_debug_stmts ();
3027       scev_reset ();
3028     }
3029
3030   if (vect_epilogues)
3031     {
3032       epilog->aux = epilogue_vinfo;
3033       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3034
3035       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3036
3037       /* We now must calculate the number of NITERS performed by the previous
3038          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3039       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3040                                  niters_prolog, niters_vector_mult_vf);
3041
3042       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3043          determine whether we are coming from the previous vectorized loop
3044          using the update_e edge or the skip_vector basic block using the
3045          skip_e edge.  */
3046       if (skip_vector)
3047         {
3048           gcc_assert (update_e != NULL
3049                       && skip_e != NULL
3050                       && !vect_epilogues_updated_niters);
3051           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3052                                            update_e->dest);
3053           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3054           gimple *stmt = gimple_build_assign (new_ssa, niters);
3055           gimple_stmt_iterator gsi;
3056           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3057               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3058             {
3059               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3060               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3061             }
3062           else
3063             {
3064               gsi = gsi_last_bb (update_e->src);
3065               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3066             }
3067
3068           niters = new_ssa;
3069           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3070           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3071                        UNKNOWN_LOCATION);
3072           niters = PHI_RESULT (new_phi);
3073           epilogue_vinfo->main_loop_edge = update_e;
3074           epilogue_vinfo->skip_main_loop_edge = skip_e;
3075         }
3076
3077       /* Set ADVANCE to the number of iterations performed by the previous
3078          loop and its prologue.  */
3079       *advance = niters;
3080
3081       if (!vect_epilogues_updated_niters)
3082         {
3083           /* Subtract the number of iterations performed by the vectorized loop
3084              from the number of total iterations.  */
3085           tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3086                                               before_loop_niters,
3087                                               niters);
3088
3089           LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3090           LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3091             = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3092                            epilogue_niters,
3093                            build_one_cst (TREE_TYPE (epilogue_niters)));
3094
3095           /* Decide what to do if the number of epilogue iterations is not
3096              a multiple of the epilogue loop's vectorization factor.
3097              We should have rejected the loop during the analysis phase
3098              if this fails.  */
3099           if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3100                                                            true))
3101             gcc_unreachable ();
3102         }
3103     }
3104
3105   adjust_vec.release ();
3106   free_original_copy_tables ();
3107
3108   return vect_epilogues ? epilog : NULL;
3109 }
3110
3111 /* Function vect_create_cond_for_niters_checks.
3112
3113    Create a conditional expression that represents the run-time checks for
3114    loop's niter.  The loop is guaranteed to terminate if the run-time
3115    checks hold.
3116
3117    Input:
3118    COND_EXPR  - input conditional expression.  New conditions will be chained
3119                 with logical AND operation.  If it is NULL, then the function
3120                 is used to return the number of alias checks.
3121    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3122                 to be checked.
3123
3124    Output:
3125    COND_EXPR - conditional expression.
3126
3127    The returned COND_EXPR is the conditional expression to be used in the
3128    if statement that controls which version of the loop gets executed at
3129    runtime.  */
3130
3131 static void
3132 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3133 {
3134   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3135
3136   if (*cond_expr)
3137     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3138                               *cond_expr, part_cond_expr);
3139   else
3140     *cond_expr = part_cond_expr;
3141 }
3142
3143 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3144    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3145
3146 static void
3147 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3148 {
3149   if (*cond_expr)
3150     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3151                               *cond_expr, part_cond_expr);
3152   else
3153     *cond_expr = part_cond_expr;
3154 }
3155
3156 /* Function vect_create_cond_for_align_checks.
3157
3158    Create a conditional expression that represents the alignment checks for
3159    all of data references (array element references) whose alignment must be
3160    checked at runtime.
3161
3162    Input:
3163    COND_EXPR  - input conditional expression.  New conditions will be chained
3164                 with logical AND operation.
3165    LOOP_VINFO - two fields of the loop information are used.
3166                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3167                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3168
3169    Output:
3170    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3171                          expression.
3172    The returned value is the conditional expression to be used in the if
3173    statement that controls which version of the loop gets executed at runtime.
3174
3175    The algorithm makes two assumptions:
3176      1) The number of bytes "n" in a vector is a power of 2.
3177      2) An address "a" is aligned if a%n is zero and that this
3178         test can be done as a&(n-1) == 0.  For example, for 16
3179         byte vectors the test is a&0xf == 0.  */
3180
3181 static void
3182 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3183                                    tree *cond_expr,
3184                                    gimple_seq *cond_expr_stmt_list)
3185 {
3186   const vec<stmt_vec_info> &may_misalign_stmts
3187     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3188   stmt_vec_info stmt_info;
3189   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3190   tree mask_cst;
3191   unsigned int i;
3192   tree int_ptrsize_type;
3193   char tmp_name[20];
3194   tree or_tmp_name = NULL_TREE;
3195   tree and_tmp_name;
3196   gimple *and_stmt;
3197   tree ptrsize_zero;
3198   tree part_cond_expr;
3199
3200   /* Check that mask is one less than a power of 2, i.e., mask is
3201      all zeros followed by all ones.  */
3202   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3203
3204   int_ptrsize_type = signed_type_for (ptr_type_node);
3205
3206   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3207      of the first vector of the i'th data reference. */
3208
3209   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3210     {
3211       gimple_seq new_stmt_list = NULL;
3212       tree addr_base;
3213       tree addr_tmp_name;
3214       tree new_or_tmp_name;
3215       gimple *addr_stmt, *or_stmt;
3216       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3217       bool negative = tree_int_cst_compare
3218         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3219       tree offset = negative
3220         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3221                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3222         : size_zero_node;
3223
3224       /* create: addr_tmp = (int)(address_of_first_vector) */
3225       addr_base =
3226         vect_create_addr_base_for_vector_ref (loop_vinfo,
3227                                               stmt_info, &new_stmt_list,
3228                                               offset);
3229       if (new_stmt_list != NULL)
3230         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3231
3232       sprintf (tmp_name, "addr2int%d", i);
3233       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3234       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3235       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3236
3237       /* The addresses are OR together.  */
3238
3239       if (or_tmp_name != NULL_TREE)
3240         {
3241           /* create: or_tmp = or_tmp | addr_tmp */
3242           sprintf (tmp_name, "orptrs%d", i);
3243           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3244           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3245                                          or_tmp_name, addr_tmp_name);
3246           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3247           or_tmp_name = new_or_tmp_name;
3248         }
3249       else
3250         or_tmp_name = addr_tmp_name;
3251
3252     } /* end for i */
3253
3254   mask_cst = build_int_cst (int_ptrsize_type, mask);
3255
3256   /* create: and_tmp = or_tmp & mask  */
3257   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3258
3259   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3260                                   or_tmp_name, mask_cst);
3261   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3262
3263   /* Make and_tmp the left operand of the conditional test against zero.
3264      if and_tmp has a nonzero bit then some address is unaligned.  */
3265   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3266   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3267                                 and_tmp_name, ptrsize_zero);
3268   chain_cond_expr (cond_expr, part_cond_expr);
3269 }
3270
3271 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3272    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3273    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3274    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3275
3276 static void
3277 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3278 {
3279   const vec<vec_object_pair> &pairs
3280     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3281   unsigned int i;
3282   vec_object_pair *pair;
3283   FOR_EACH_VEC_ELT (pairs, i, pair)
3284     {
3285       tree addr1 = build_fold_addr_expr (pair->first);
3286       tree addr2 = build_fold_addr_expr (pair->second);
3287       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3288                                          addr1, addr2);
3289       chain_cond_expr (cond_expr, part_cond_expr);
3290     }
3291 }
3292
3293 /* Create an expression that is true when all lower-bound conditions for
3294    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3295
3296 static void
3297 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3298 {
3299   const vec<vec_lower_bound> &lower_bounds
3300     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3301   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3302     {
3303       tree expr = lower_bounds[i].expr;
3304       tree type = unsigned_type_for (TREE_TYPE (expr));
3305       expr = fold_convert (type, expr);
3306       poly_uint64 bound = lower_bounds[i].min_value;
3307       if (!lower_bounds[i].unsigned_p)
3308         {
3309           expr = fold_build2 (PLUS_EXPR, type, expr,
3310                               build_int_cstu (type, bound - 1));
3311           bound += bound - 1;
3312         }
3313       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3314                                          build_int_cstu (type, bound));
3315       chain_cond_expr (cond_expr, part_cond_expr);
3316     }
3317 }
3318
3319 /* Function vect_create_cond_for_alias_checks.
3320
3321    Create a conditional expression that represents the run-time checks for
3322    overlapping of address ranges represented by a list of data references
3323    relations passed as input.
3324
3325    Input:
3326    COND_EXPR  - input conditional expression.  New conditions will be chained
3327                 with logical AND operation.  If it is NULL, then the function
3328                 is used to return the number of alias checks.
3329    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3330                 to be checked.
3331
3332    Output:
3333    COND_EXPR - conditional expression.
3334
3335    The returned COND_EXPR is the conditional expression to be used in the if
3336    statement that controls which version of the loop gets executed at runtime.
3337 */
3338
3339 void
3340 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3341 {
3342   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3343     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3344
3345   if (comp_alias_ddrs.is_empty ())
3346     return;
3347
3348   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3349                                &comp_alias_ddrs, cond_expr);
3350   if (dump_enabled_p ())
3351     dump_printf_loc (MSG_NOTE, vect_location,
3352                      "created %u versioning for alias checks.\n",
3353                      comp_alias_ddrs.length ());
3354 }
3355
3356
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned or/and
   has data reference relations whose independence was not proven then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pairs of DDRs whose
   independence was not proven.  The vectorized version of loop is
   executed only if both alias and alignment tests are passed.

   The test generated to check which version of loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   Input:
   LOOP_VINFO - describes the loop to version; the versioning requirements
                (niters assumptions, alignment, alias, simd-if condition)
                and the thresholds are read from it.
   LOOP_VECTORIZED_CALL - if non-null, and the loop chosen for versioning
                has no inner loop or is the analyzed loop itself, the
                condition ending the block that contains this call is
                rewritten and LOOP_VINFO_SCALAR_LOOP is reused as the other
                version instead of creating a new copy.

   Returns the loop that serves as the other version of the analyzed loop:
   LOOP_VINFO_SCALAR_LOOP when the existing version is reused, otherwise
   the copy of the analyzed loop created by loop_version.  */

class loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
                      gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  /* Note this is NITERSM1, which is why the thresholds below are
     compared after subtracting one.  */
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

  /* Start the condition with the iteration-count checks.  The separate
     cost-model test on TH is only emitted when TH and the versioning
     threshold cannot be ordered against each other.  */
  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                             build_int_cst (TREE_TYPE (scalar_loop_iters),
                                            th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                               build_int_cst (TREE_TYPE (scalar_loop_iters),
                                              versioning_threshold - 1));
      if (cond_expr)
        cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
                                 expr, cond_expr);
      else
        cond_expr = expr;
    }

  /* If the threshold checks are going to be combined with other versioning
     checks, gimplify them to a single value now and remember it in
     COST_NAME so that it can be split off into its own block at the end
     of this function.  */
  tree cost_name = NULL_TREE;
  profile_probability prob2 = profile_probability::uninitialized ();
  if (cond_expr
      && !integer_truep (cond_expr)
      && (version_niter
          || version_align
          || version_alias
          || version_simd_if_cond))
    {
      cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                                      &cond_expr_stmt_list,
                                                      is_gimple_val, NULL_TREE);
      /* Split prob () into two so that the overall probability of passing
         both the cost-model and versioning checks is the orig prob.  */
      prob2 = prob.split (prob);
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  /* Gimplify what has been built so far; any statements this needs are
     appended to COND_EXPR_STMT_LIST.  */
  if (cond_expr)
    {
      gimple_seq tem = NULL;
      cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                          &tem, is_gimple_condexpr_for_cond,
                                          NULL_TREE);
      gimple_seq_add_seq (&cond_expr_stmt_list, tem);
    }

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      /* With checking enabled, verify that the block defining the simd-if
         condition (if it has one) dominates the header of LOOP, and of
         SCALAR_LOOP when present, without being that header itself.  */
      if (flag_checking)
        if (basic_block bb
            = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
          gcc_assert (bb != loop->header
                      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
                      && (scalar_loop == NULL
                          || (bb != scalar_loop->header
                              && dominated_by_p (CDI_DOMINATORS,
                                                 scalar_loop->header, bb))));
      /* The vectorized version is only taken when the condition is
         nonzero.  */
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
                            version_simd_if_cond, zero);
      if (cond_expr)
        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                 c, cond_expr);
      else
        cond_expr = c;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "created versioning for simd if condition check.\n");
    }

  /* Gimplify the complete condition into a form usable in a GIMPLE_COND
     and collect the statements it needs.  */
  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                      &gimplify_stmt_list,
                                      is_gimple_condexpr_for_cond, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      /* An operand defined inside OUTERMOST narrows OUTERMOST to the
         superloop of LOOP that is one level deeper than the defining
         block.  */
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
        if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
            && flow_bb_inside_loop_p (outermost, def_bb))
          outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "trying to apply versioning to outer loop %d\n",
                         outermost->num);
      /* NOTE(review): loop number 0 is presumably the root of the loop
         tree rather than a real loop; use the depth-1 superloop of LOOP
         instead -- confirm.  */
      if (outermost->num == 0)
        outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
             && (e = single_exit (loop_outer (loop_to_version)))
             && !(e->flags & EDGE_COMPLEX)
             && (!loop_outer (loop_to_version)->inner->next
                 || vect_loop_vectorized_call (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || !loop_outer (loop_to_version)->inner->next->next))
        loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      gcc_assert (scalar_loop);
      /* Rewrite the existing condition that ends the block containing
         LOOP_VECTORIZED_CALL to test COND_EXPR.  */
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (last_stmt (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* if-conversion uses profile_probability::always () for both paths,
         reset the paths probabilities appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale loops counts immediately but have to postpone
         scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "reusing %sloop version created by if conversion\n",
                         loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
          && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "applying loop versioning to outer loop %d\n",
                         loop_to_version->num);

      /* Remember the preheader edge's PHI argument slot so it can be
         restored after versioning (see below).  */
      unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
                            prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      /* LOOP_TO_VERSION may be an outer loop; what is returned to the
         caller is the copy of the analyzed LOOP itself.  */
      nloop = get_loop_copy (loop);

      /* For cycle vectorization with SLP we rely on the PHI arguments
         appearing in the same order as the SLP node operands which for the
         loop PHI nodes means the preheader edge dest index needs to remain
         the same for the analyzed loop which also becomes the vectorized one.
         Make it so in case the state after versioning differs by redirecting
         the first edge into the header to the same destination which moves
         it last.  */
      if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
        {
          edge e = EDGE_PRED (loop->header, 0);
          ssa_redirect_edge (e, e->dest);
          flush_pending_stmts (e);
        }
      gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);

      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
         reap those otherwise;  they also refer to the original
         loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
        {
          call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
          fold_loop_internal_call (call, boolean_false_node);
          l = loop_outer (l);
        }
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_last_bb (condition_bb);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* Loop versioning violates an assumption we try to maintain during
         vectorization - that the loop exit block has a single predecessor.
         After versioning, the exit block of both loop versions is the same
         basic block (i.e. it has two predecessors). Just in order to simplify
         following transformations in the vectorizer, we fix this situation
         here by adding a new (empty) block on the exit-edge of the loop,
         with the proper loop-exit phis to maintain loop-closed-form.
         If loop versioning wasn't done from loop, but scalar_loop instead,
         merge_bb will have already just a single successor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
        {
          /* NOTE(review): this assertion repeats the enclosing condition
             and can never fail here.  */
          gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
          new_exit_bb = split_edge (single_exit (loop_to_version));
          new_exit_e = single_exit (loop_to_version);
          e = EDGE_SUCC (new_exit_bb, 0);

          /* Give each PHI in MERGE_BB a matching PHI in the new exit block
             and make the original PHI use the new result on edge E.  */
          for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              tree new_res;
              orig_phi = gsi.phi ();
              new_res = copy_ssa_name (PHI_RESULT (orig_phi));
              new_phi = create_phi_node (new_res, new_exit_bb);
              arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
              add_phi_arg (new_phi, arg, new_exit_e,
                           gimple_phi_arg_location_from_edge (orig_phi, e));
              adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
            }
        }

      update_ssa (TODO_update_ssa_no_phi);
    }

  /* Split the cost model check off to a separate BB.  Costing assumes
     this is the only thing we perform when we enter the scalar loop
     from a failed cost decision.  */
  if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
    {
      gimple *def = SSA_NAME_DEF_STMT (cost_name);
      /* All uses of the cost check are 'true' after the check we
         are going to insert.  */
      replace_uses_by (cost_name, boolean_true_node);
      /* And we're going to build the new single use of it.  */
      gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
                                       NULL_TREE, NULL_TREE);
      /* Split right after the definition of COST_NAME and end the first
         half with the new condition.  */
      edge e = split_block (gimple_bb (def), def);
      gimple_stmt_iterator gsi = gsi_for_stmt (def);
      gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
      edge true_e, false_e;
      extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
      /* The former fallthru edge becomes the true edge; a failed cost
         check branches to the false destination of the remaining
         versioning condition.  */
      e->flags &= ~EDGE_FALLTHRU;
      e->flags |= EDGE_TRUE_VALUE;
      edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
      e->probability = prob2;
      e2->probability = prob2.invert ();
      /* Update dominators: the false destination, and any dominator son
         of the split-off block that has several predecessors, are now
         immediately dominated by the block holding the cost check.  */
      set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
      auto_vec<basic_block, 3> adj;
      for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
           son;
           son = next_dom_son (CDI_DOMINATORS, son))
        if (EDGE_COUNT (son->preds) > 1)
          adj.safe_push (son);
      for (auto son : adj)
        set_immediate_dominator (CDI_DOMINATORS, son, e->src);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
         niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
    }

  /* Emit the user-facing notes only when a source location is known.  */
  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization because of "
                         "possible aliasing\n");
      if (version_align)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization to enhance "
                         "alignment\n");

    }

  return nloop;
}