gcc/tree-vect-loop-manip.c

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53
  54 /*************************************************************************
  55   Simple Loop Peeling Utilities
  56
  57   Utilities to support loop peeling for vectorization purposes.
  58  *************************************************************************/
  59
  60
  61 /* Renames the use *OP_P.  */
  62
  63 static void
  64 rename_use_op (use_operand_p op_p)
  65 {
  66   tree new_name;
  67
  68   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  69     return;
  70
  71   new_name = get_current_def (USE_FROM_PTR (op_p));
  72
  73   /* Something defined outside of the loop.  */
  74   if (!new_name)
  75     return;
  76
  77   /* An ordinary ssa name defined in the loop.  */
  78
  79   SET_USE (op_p, new_name);
  80 }
  81
  82
  83 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  84    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  85    true.  */
  86
  87 static void
  88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  89 {
  90   gimple *stmt;
  91   use_operand_p use_p;
  92   ssa_op_iter iter;
  93   edge e;
  94   edge_iterator ei;
  95   class loop *loop = bb->loop_father;
  96   class loop *outer_loop = NULL;
  97
  98   if (rename_from_outer_loop)
  99     {
 100       gcc_assert (loop);
 101       outer_loop = loop_outer (loop);
 102     }
 103
 104   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 105        gsi_next (&gsi))
 106     {
 107       stmt = gsi_stmt (gsi);
 108       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 109         rename_use_op (use_p);
 110     }
 111
 112   FOR_EACH_EDGE (e, ei, bb->preds)
 113     {
 114       if (!flow_bb_inside_loop_p (loop, e->src))
 115         {
 116           if (!rename_from_outer_loop)
 117             continue;
 118           if (e->src != outer_loop->header)
 119             {
 120               if (outer_loop->inner->next)
 121                 {
 122                   /* If outer_loop has 2 inner loops, allow there to
 123                      be an extra basic block which decides which of the
 124                      two loops to use using LOOP_VECTORIZED.  */
 125                   if (!single_pred_p (e->src)
 126                       || single_pred (e->src) != outer_loop->header)
 127                     continue;
 128                 }
 129             }
 130         }
 131       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 132            gsi_next (&gsi))
 133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 134     }
 135 }
 136
 137
 138 struct adjust_info
 139 {
 140   tree from, to;
 141   basic_block bb;
 142 };
 143
 144 /* A stack of values to be adjusted in debug stmts.  We have to
 145    process them LIFO, so that the closest substitution applies.  If we
 146    processed them FIFO, without the stack, we might substitute uses
 147    with a PHI DEF that would soon become non-dominant, and when we got
 148    to the suitable one, it wouldn't have anything to substitute any
 149    more.  */
 150 static vec<adjust_info, va_heap> adjust_vec;
 151
 152 /* Adjust any debug stmts that referenced AI->from values to use the
 153    loop-closed AI->to, if the references are dominated by AI->bb and
 154    not by the definition of AI->from.  */
 155
 156 static void
 157 adjust_debug_stmts_now (adjust_info *ai)
 158 {
 159   basic_block bbphi = ai->bb;
 160   tree orig_def = ai->from;
 161   tree new_def = ai->to;
 162   imm_use_iterator imm_iter;
 163   gimple *stmt;
 164   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 165
 166   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 167
 168   /* Adjust any debug stmts that held onto non-loop-closed
 169      references.  */
 170   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 171     {
 172       use_operand_p use_p;
 173       basic_block bbuse;
 174
 175       if (!is_gimple_debug (stmt))
 176         continue;
 177
 178       gcc_assert (gimple_debug_bind_p (stmt));
 179
 180       bbuse = gimple_bb (stmt);
 181
 182       if ((bbuse == bbphi
 183            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 184           && !(bbuse == bbdef
 185                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 186         {
 187           if (new_def)
 188             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 189               SET_USE (use_p, new_def);
 190           else
 191             {
 192               gimple_debug_bind_reset_value (stmt);
 193               update_stmt (stmt);
 194             }
 195         }
 196     }
 197 }
 198
 199 /* Adjust debug stmts as scheduled before.  */
 200
 201 static void
 202 adjust_vec_debug_stmts (void)
 203 {
 204   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 205     return;
 206
 207   gcc_assert (adjust_vec.exists ());
 208
 209   while (!adjust_vec.is_empty ())
 210     {
 211       adjust_debug_stmts_now (&adjust_vec.last ());
 212       adjust_vec.pop ();
 213     }
 214 }
 215
 216 /* Adjust any debug stmts that referenced FROM values to use the
 217    loop-closed TO, if the references are dominated by BB and not by
 218    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 219    will be postponed until adjust_vec_debug_stmts is called.  */
 220
 221 static void
 222 adjust_debug_stmts (tree from, tree to, basic_block bb)
 223 {
 224   adjust_info ai;
 225
 226   if (MAY_HAVE_DEBUG_BIND_STMTS
 227       && TREE_CODE (from) == SSA_NAME
 228       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 229       && ! virtual_operand_p (from))
 230     {
 231       ai.from = from;
 232       ai.to = to;
 233       ai.bb = bb;
 234
 235       if (adjust_vec.exists ())
 236         adjust_vec.safe_push (ai);
 237       else
 238         adjust_debug_stmts_now (&ai);
 239     }
 240 }
 241
 242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 243    to adjust any debug stmts that referenced the old phi arg,
 244    presumably non-loop-closed references left over from other
 245    transformations.  */
 246
 247 static void
 248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 249 {
 250   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 251
 252   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 253
 254   if (MAY_HAVE_DEBUG_BIND_STMTS)
 255     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 256                         gimple_bb (update_phi));
 257 }
 258
 259 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 260    that the control should have during the first iteration and NEXT_CTRL is the
 261    value that it should have on subsequent iterations.  */
 262
 263 static void
 264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 265                        tree next_ctrl)
 266 {
 267   gphi *phi = create_phi_node (ctrl, loop->header);
 268   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 269   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 270 }
 271
 272 /* Add SEQ to the end of LOOP's preheader block.  */
 273
 274 static void
 275 add_preheader_seq (class loop *loop, gimple_seq seq)
 276 {
 277   if (seq)
 278     {
 279       edge pe = loop_preheader_edge (loop);
 280       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 281       gcc_assert (!new_bb);
 282     }
 283 }
 284
 285 /* Add SEQ to the beginning of LOOP's header block.  */
 286
 287 static void
 288 add_header_seq (class loop *loop, gimple_seq seq)
 289 {
 290   if (seq)
 291     {
 292       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 293       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 294     }
 295 }
 296
 297 /* Return true if the target can interleave elements of two vectors.
 298    OFFSET is 0 if the first half of the vectors should be interleaved
 299    or 1 if the second half should.  When returning true, store the
 300    associated permutation in INDICES.  */
 301
 302 static bool
 303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 304                         unsigned int offset)
 305 {
 306   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 307   poly_uint64 base = exact_div (nelts, 2) * offset;
 308   vec_perm_builder sel (nelts, 2, 3);
 309   for (unsigned int i = 0; i < 3; ++i)
 310     {
 311       sel.quick_push (base + i);
 312       sel.quick_push (base + i + nelts);
 313     }
 314   indices->new_vector (sel, 2, nelts);
 315   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 316 }
 317
 318 /* Try to use permutes to define the masks in DEST_RGM using the masks
 319    in SRC_RGM, given that the former has twice as many masks as the
 320    latter.  Return true on success, adding any new statements to SEQ.  */
 321
 322 static bool
 323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 324                                rgroup_controls *src_rgm)
 325 {
 326   tree src_masktype = src_rgm->type;
 327   tree dest_masktype = dest_rgm->type;
 328   machine_mode src_mode = TYPE_MODE (src_masktype);
 329   insn_code icode1, icode2;
 330   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 331       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 332                                   src_mode)) != CODE_FOR_nothing
 333       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 334                                   src_mode)) != CODE_FOR_nothing)
 335     {
 336       /* Unpacking the source masks gives at least as many mask bits as
 337          we need.  We can then VIEW_CONVERT any excess bits away.  */
 338       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 339       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 340       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 341       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 342         {
 343           tree src = src_rgm->controls[i / 2];
 344           tree dest = dest_rgm->controls[i];
 345           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 346                             ? VEC_UNPACK_HI_EXPR
 347                             : VEC_UNPACK_LO_EXPR);
 348           gassign *stmt;
 349           if (dest_masktype == unpack_masktype)
 350             stmt = gimple_build_assign (dest, code, src);
 351           else
 352             {
 353               tree temp = make_ssa_name (unpack_masktype);
 354               stmt = gimple_build_assign (temp, code, src);
 355               gimple_seq_add_stmt (seq, stmt);
 356               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 357                                           build1 (VIEW_CONVERT_EXPR,
 358                                                   dest_masktype, temp));
 359             }
 360           gimple_seq_add_stmt (seq, stmt);
 361         }
 362       return true;
 363     }
 364   vec_perm_indices indices[2];
 365   if (dest_masktype == src_masktype
 366       && interleave_supported_p (&indices[0], src_masktype, 0)
 367       && interleave_supported_p (&indices[1], src_masktype, 1))
 368     {
 369       /* The destination requires twice as many mask bits as the source, so
 370          we can use interleaving permutes to double up the number of bits.  */
 371       tree masks[2];
 372       for (unsigned int i = 0; i < 2; ++i)
 373         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 374       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 375         {
 376           tree src = src_rgm->controls[i / 2];
 377           tree dest = dest_rgm->controls[i];
 378           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 379                                               src, src, masks[i & 1]);
 380           gimple_seq_add_stmt (seq, stmt);
 381         }
 382       return true;
 383     }
 384   return false;
 385 }
 386
 387 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 388    for all the rgroup controls in RGC and return a control that is nonzero
 389    when the loop needs to iterate.  Add any new preheader statements to
 390    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
 391
 392    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 393    times and has been vectorized according to LOOP_VINFO.
 394
 395    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 396    starts with NITERS_SKIP dummy iterations of the scalar loop before
 397    the real work starts.  The mask elements for these dummy iterations
 398    must be 0, to ensure that the extra iterations do not have an effect.
 399
 400    It is known that:
 401
 402      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 403
 404    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 405    variable that starts at 0 and has step:
 406
 407      VF * RGC->max_nscalars_per_iter * RGC->factor
 408
 409    might overflow before hitting a value above:
 410
 411      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 412
 413    This means that we cannot guarantee that such an induction variable
 414    would ever hit a value that produces a set of all-false masks or zero
 415    lengths for RGC.
 416
 417    Note: the cost of the code generated by this function is modeled
 418    by vect_estimate_min_profitable_iters, so changes here may need
 419    corresponding changes there.  */
 420
 421 static tree
 422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 423                                  gimple_seq *preheader_seq,
 424                                  gimple_stmt_iterator loop_cond_gsi,
 425                                  rgroup_controls *rgc, tree niters,
 426                                  tree niters_skip, bool might_wrap_p)
 427 {
 428   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 429   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 430   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 431
 432   tree ctrl_type = rgc->type;
 433   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 434   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 435   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 436   tree length_limit = NULL_TREE;
 437   /* For length, we need length_limit to ensure length in range.  */
 438   if (!use_masks_p)
 439     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 440
 441   /* Calculate the maximum number of item values that the rgroup
 442      handles in total, the number that it handles for each iteration
 443      of the vector loop, and the number that it should skip during the
 444      first iteration of the vector loop.  */
 445   tree nitems_total = niters;
 446   tree nitems_step = build_int_cst (iv_type, vf);
 447   tree nitems_skip = niters_skip;
 448   if (nitems_per_iter != 1)
 449     {
 450       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 451          these multiplications don't overflow.  */
 452       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 453       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 454       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 455                                    nitems_total, compare_factor);
 456       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 457                                   nitems_step, iv_factor);
 458       if (nitems_skip)
 459         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 460                                     nitems_skip, compare_factor);
 461     }
 462
 463   /* Create an induction variable that counts the number of items
 464      processed.  */
 465   tree index_before_incr, index_after_incr;
 466   gimple_stmt_iterator incr_gsi;
 467   bool insert_after;
 468   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 469   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
 470              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 471
 472   tree zero_index = build_int_cst (compare_type, 0);
 473   tree test_index, test_limit, first_limit;
 474   gimple_stmt_iterator *test_gsi;
 475   if (might_wrap_p)
 476     {
 477       /* In principle the loop should stop iterating once the incremented
 478          IV reaches a value greater than or equal to:
 479
 480            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 481
 482          However, there's no guarantee that this addition doesn't overflow
 483          the comparison type, or that the IV hits a value above it before
 484          wrapping around.  We therefore adjust the limit down by one
 485          IV step:
 486
 487            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 488            -[infinite-prec] NITEMS_STEP
 489
 490          and compare the IV against this limit _before_ incrementing it.
 491          Since the comparison type is unsigned, we actually want the
 492          subtraction to saturate at zero:
 493
 494            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 495            -[sat] NITEMS_STEP
 496
 497          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 498
 499            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 500
 501          where the rightmost subtraction can be done directly in
 502          COMPARE_TYPE.  */
 503       test_index = index_before_incr;
 504       tree adjust = gimple_convert (preheader_seq, compare_type,
 505                                     nitems_step);
 506       if (nitems_skip)
 507         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 508                                adjust, nitems_skip);
 509       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 510                                  nitems_total, adjust);
 511       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 512                                  test_limit, adjust);
 513       test_gsi = &incr_gsi;
 514
 515       /* Get a safe limit for the first iteration.  */
 516       if (nitems_skip)
 517         {
 518           /* The first vector iteration can handle at most NITEMS_STEP
 519              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 520              NITEMS_SKIP to that cannot overflow.  */
 521           tree const_limit = build_int_cst (compare_type,
 522                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 523                                             * nitems_per_iter);
 524           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 525                                       nitems_total, const_limit);
 526           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 527                                       first_limit, nitems_skip);
 528         }
 529       else
 530         /* For the first iteration it doesn't matter whether the IV hits
 531            a value above NITEMS_TOTAL.  That only matters for the latch
 532            condition.  */
 533         first_limit = nitems_total;
 534     }
 535   else
 536     {
 537       /* Test the incremented IV, which will always hit a value above
 538          the bound before wrapping.  */
 539       test_index = index_after_incr;
 540       test_limit = nitems_total;
 541       if (nitems_skip)
 542         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 543                                    test_limit, nitems_skip);
 544       test_gsi = &loop_cond_gsi;
 545
 546       first_limit = test_limit;
 547     }
 548
 549   /* Convert the IV value to the comparison type (either a no-op or
 550      a demotion).  */
 551   gimple_seq test_seq = NULL;
 552   test_index = gimple_convert (&test_seq, compare_type, test_index);
 553   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 554
 555   /* Provide a definition of each control in the group.  */
 556   tree next_ctrl = NULL_TREE;
 557   tree ctrl;
 558   unsigned int i;
 559   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 560     {
 561       /* Previous controls will cover BIAS items.  This control covers the
 562          next batch.  */
 563       poly_uint64 bias = nitems_per_ctrl * i;
 564       tree bias_tree = build_int_cst (compare_type, bias);
 565
 566       /* See whether the first iteration of the vector loop is known
 567          to have a full control.  */
 568       poly_uint64 const_limit;
 569       bool first_iteration_full
 570         = (poly_int_tree_p (first_limit, &const_limit)
 571            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 572
 573       /* Rather than have a new IV that starts at BIAS and goes up to
 574          TEST_LIMIT, prefer to use the same 0-based IV for each control
 575          and adjust the bound down by BIAS.  */
 576       tree this_test_limit = test_limit;
 577       if (i != 0)
 578         {
 579           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 580                                           compare_type, this_test_limit,
 581                                           bias_tree);
 582           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 583                                           compare_type, this_test_limit,
 584                                           bias_tree);
 585         }
 586
 587       /* Create the initial control.  First include all items that
 588          are within the loop limit.  */
 589       tree init_ctrl = NULL_TREE;
 590       if (!first_iteration_full)
 591         {
 592           tree start, end;
 593           if (first_limit == test_limit)
 594             {
 595               /* Use a natural test between zero (the initial IV value)
 596                  and the loop limit.  The "else" block would be valid too,
 597                  but this choice can avoid the need to load BIAS_TREE into
 598                  a register.  */
 599               start = zero_index;
 600               end = this_test_limit;
 601             }
 602           else
 603             {
 604               /* FIRST_LIMIT is the maximum number of items handled by the
 605                  first iteration of the vector loop.  Test the portion
 606                  associated with this control.  */
 607               start = bias_tree;
 608               end = first_limit;
 609             }
 610
 611           if (use_masks_p)
 612             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 613                                         start, end, "max_mask");
 614           else
 615             {
 616               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 617               gimple_seq seq = vect_gen_len (init_ctrl, start,
 618                                              end, length_limit);
 619               gimple_seq_add_seq (preheader_seq, seq);
 620             }
 621         }
 622
 623       /* Now AND out the bits that are within the number of skipped
 624          items.  */
 625       poly_uint64 const_skip;
 626       if (nitems_skip
 627           && !(poly_int_tree_p (nitems_skip, &const_skip)
 628                && known_le (const_skip, bias)))
 629         {
 630           gcc_assert (use_masks_p);
 631           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 632                                                     bias_tree, nitems_skip);
 633           if (init_ctrl)
 634             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 635                                       init_ctrl, unskipped_mask);
 636           else
 637             init_ctrl = unskipped_mask;
 638         }
 639
 640       if (!init_ctrl)
 641         {
 642           /* First iteration is full.  */
 643           if (use_masks_p)
 644             init_ctrl = build_minus_one_cst (ctrl_type);
 645           else
 646             init_ctrl = length_limit;
 647         }
 648
 649       /* Get the control value for the next iteration of the loop.  */
 650       if (use_masks_p)
 651         {
 652           gimple_seq stmts = NULL;
 653           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 654                                       this_test_limit, "next_mask");
 655           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 656         }
 657       else
 658         {
 659           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 660           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 661                                          length_limit);
 662           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 663         }
 664
 665       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 666     }
 667   return next_ctrl;
 668 }
 669
 670 /* Set up the iteration condition and rgroup controls for LOOP, given
 671    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 672    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 673    the number of iterations of the original scalar loop that should be
 674    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 675    for vect_set_loop_condition.
 676
 677    Insert the branch-back condition before LOOP_COND_GSI and return the
 678    final gcond.  */
 679
 680 static gcond *
 681 vect_set_loop_condition_partial_vectors (class loop *loop,
 682                                          loop_vec_info loop_vinfo, tree niters,
 683                                          tree final_iv, bool niters_maybe_zero,
 684                                          gimple_stmt_iterator loop_cond_gsi)
 685 {
 686   gimple_seq preheader_seq = NULL;
 687   gimple_seq header_seq = NULL;
 688
 689   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 690   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 691   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 692   tree orig_niters = niters;
 693
 694   /* Type of the initial value of NITERS.  */
 695   tree ni_actual_type = TREE_TYPE (niters);
 696   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 697   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 698
 699   /* Convert NITERS to the same size as the compare.  */
 700   if (compare_precision > ni_actual_precision
 701       && niters_maybe_zero)
 702     {
 703       /* We know that there is always at least one iteration, so if the
 704          count is zero then it must have wrapped.  Cope with this by
 705          subtracting 1 before the conversion and adding 1 to the result.  */
 706       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 707       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 708                              niters, build_minus_one_cst (ni_actual_type));
 709       niters = gimple_convert (&preheader_seq, compare_type, niters);
 710       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 711                              niters, build_one_cst (compare_type));
 712     }
 713   else
 714     niters = gimple_convert (&preheader_seq, compare_type, niters);
 715
 716   /* Iterate over all the rgroups and fill in their controls.  We could use
 717      the first control from any rgroup for the loop condition; here we
 718      arbitrarily pick the last.  */
 719   tree test_ctrl = NULL_TREE;
 720   rgroup_controls *rgc;
 721   unsigned int i;
 722   auto_vec<rgroup_controls> *controls = use_masks_p
 723                                           ? &LOOP_VINFO_MASKS (loop_vinfo)
 724                                           : &LOOP_VINFO_LENS (loop_vinfo);
 725   FOR_EACH_VEC_ELT (*controls, i, rgc)
 726     if (!rgc->controls.is_empty ())
 727       {
 728         /* First try using permutes.  This adds a single vector
 729            instruction to the loop for each mask, but needs no extra
 730            loop invariants or IVs.  */
 731         unsigned int nmasks = i + 1;
 732         if (use_masks_p && (nmasks & 1) == 0)
 733           {
 734             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 735             if (!half_rgc->controls.is_empty ()
 736                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 737               continue;
 738           }
 739
 740         /* See whether zero-based IV would ever generate all-false masks
 741            or zero length before wrapping around.  */
 742         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 743
 744         /* Set up all controls for this group.  */
 745         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 746                                                      &preheader_seq,
 747                                                      loop_cond_gsi, rgc,
 748                                                      niters, niters_skip,
 749                                                      might_wrap_p);
 750       }
 751
 752   /* Emit all accumulated statements.  */
 753   add_preheader_seq (loop, preheader_seq);
 754   add_header_seq (loop, header_seq);
 755
 756   /* Get a boolean result that tells us whether to iterate.  */
 757   edge exit_edge = single_exit (loop);
 758   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 759   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 760   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
 761                                         NULL_TREE, NULL_TREE);
 762   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 763
 764   /* The loop iterates (NITERS - 1) / VF + 1 times.
 765      Subtract one from this to get the latch count.  */
 766   tree step = build_int_cst (compare_type,
 767                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 768   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 769                                        build_minus_one_cst (compare_type));
 770   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 771                                      niters_minus_one, step);
 772
 773   if (final_iv)
 774     {
 775       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 776       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 777     }
 778
 779   return cond_stmt;
 780 }
 781
 782 /* Like vect_set_loop_condition, but handle the case in which the vector
 783    loop handles exactly VF scalars per iteration.  */
 784
 785 static gcond *
 786 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
 787                                 tree final_iv, bool niters_maybe_zero,
 788                                 gimple_stmt_iterator loop_cond_gsi)
 789 {
 790   tree indx_before_incr, indx_after_incr;
 791   gcond *cond_stmt;
 792   gcond *orig_cond;
 793   edge pe = loop_preheader_edge (loop);
 794   edge exit_edge = single_exit (loop);
 795   gimple_stmt_iterator incr_gsi;
 796   bool insert_after;
 797   enum tree_code code;
 798   tree niters_type = TREE_TYPE (niters);
 799
 800   orig_cond = get_loop_exit_condition (loop);
 801   gcc_assert (orig_cond);
 802   loop_cond_gsi = gsi_for_stmt (orig_cond);
 803
 804   tree init, limit;
 805   if (!niters_maybe_zero && integer_onep (step))
 806     {
 807       /* In this case we can use a simple 0-based IV:
 808
 809          A:
 810            x = 0;
 811            do
 812              {
 813                ...
 814                x += 1;
 815              }
 816            while (x < NITERS);  */
 817       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 818       init = build_zero_cst (niters_type);
 819       limit = niters;
 820     }
 821   else
 822     {
 823       /* The following works for all values of NITERS except 0:
 824
 825          B:
 826            x = 0;
 827            do
 828              {
 829                ...
 830                x += STEP;
 831              }
 832            while (x <= NITERS - STEP);
 833
 834          so that the loop continues to iterate if x + STEP - 1 < NITERS
 835          but stops if x + STEP - 1 >= NITERS.
 836
 837          However, if NITERS is zero, x never hits a value above NITERS - STEP
 838          before wrapping around.  There are two obvious ways of dealing with
 839          this:
 840
 841          - start at STEP - 1 and compare x before incrementing it
 842          - start at -1 and compare x after incrementing it
 843
 844          The latter is simpler and is what we use.  The loop in this case
 845          looks like:
 846
 847          C:
 848            x = -1;
 849            do
 850              {
 851                ...
 852                x += STEP;
 853              }
 854            while (x < NITERS - STEP);
 855
 856          In both cases the loop limit is NITERS - STEP.  */
 857       gimple_seq seq = NULL;
 858       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 859       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 860       if (seq)
 861         {
 862           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 863           gcc_assert (!new_bb);
 864         }
 865       if (niters_maybe_zero)
 866         {
 867           /* Case C.  */
 868           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 869           init = build_all_ones_cst (niters_type);
 870         }
 871       else
 872         {
 873           /* Case B.  */
 874           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 875           init = build_zero_cst (niters_type);
 876         }
 877     }
 878
 879   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 880   create_iv (init, step, NULL_TREE, loop,
 881              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 882   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 883                                               true, NULL_TREE, true,
 884                                               GSI_SAME_STMT);
 885   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 886                                      true, GSI_SAME_STMT);
 887
 888   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 889                                  NULL_TREE);
 890
 891   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 892
 893   /* Record the number of latch iterations.  */
 894   if (limit == niters)
 895     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 896        latch count.  */
 897     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 898                                        build_int_cst (niters_type, 1));
 899   else
 900     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 901        Subtract one from this to get the latch count.  */
 902     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 903                                        limit, step);
 904
 905   if (final_iv)
 906     {
 907       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 908                                              indx_after_incr, init);
 909       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 910     }
 911
 912   return cond_stmt;
 913 }
 914
 915 /* If we're using fully-masked loops, make LOOP iterate:
 916
 917       N == (NITERS - 1) / STEP + 1
 918
 919    times.  When NITERS is zero, this is equivalent to making the loop
 920    execute (1 << M) / STEP times, where M is the precision of NITERS.
 921    NITERS_MAYBE_ZERO is true if this last case might occur.
 922
 923    If we're not using fully-masked loops, make LOOP iterate:
 924
 925       N == (NITERS - STEP) / STEP + 1
 926
 927    times, where NITERS is known to be outside the range [1, STEP - 1].
 928    This is equivalent to making the loop execute NITERS / STEP times
 929    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 930    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 931
 932    If FINAL_IV is nonnull, it is an SSA name that should be set to
 933    N * STEP on exit from the loop.
 934
 935    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 936
 937 void
 938 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 939                          tree niters, tree step, tree final_iv,
 940                          bool niters_maybe_zero)
 941 {
 942   gcond *cond_stmt;
 943   gcond *orig_cond = get_loop_exit_condition (loop);
 944   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 945
 946   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
 947     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
 948                                                          niters, final_iv,
 949                                                          niters_maybe_zero,
 950                                                          loop_cond_gsi);
 951   else
 952     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
 953                                                 niters_maybe_zero,
 954                                                 loop_cond_gsi);
 955
 956   /* Remove old loop exit test.  */
 957   stmt_vec_info orig_cond_info;
 958   if (loop_vinfo
 959       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 960     loop_vinfo->remove_stmt (orig_cond_info);
 961   else
 962     gsi_remove (&loop_cond_gsi, true);
 963
 964   if (dump_enabled_p ())
 965     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 966                      cond_stmt);
 967 }
 968
 969 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 970    For all PHI arguments in FROM->dest and TO->dest from those
 971    edges ensure that TO->dest PHI arguments have current_def
 972    to that in from.  */
 973
 974 static void
 975 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 976 {
 977   gimple_stmt_iterator gsi_from, gsi_to;
 978
 979   for (gsi_from = gsi_start_phis (from->dest),
 980        gsi_to = gsi_start_phis (to->dest);
 981        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 982     {
 983       gimple *from_phi = gsi_stmt (gsi_from);
 984       gimple *to_phi = gsi_stmt (gsi_to);
 985       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
 986       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
 987       if (virtual_operand_p (from_arg))
 988         {
 989           gsi_next (&gsi_from);
 990           continue;
 991         }
 992       if (virtual_operand_p (to_arg))
 993         {
 994           gsi_next (&gsi_to);
 995           continue;
 996         }
 997       if (TREE_CODE (from_arg) != SSA_NAME)
 998         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
 999       else if (TREE_CODE (to_arg) == SSA_NAME
1000                && from_arg != to_arg)
1001         {
1002           if (get_current_def (to_arg) == NULL_TREE)
1003             {
1004               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1005                                               TREE_TYPE (get_current_def
1006                                                            (from_arg))));
1007               set_current_def (to_arg, get_current_def (from_arg));
1008             }
1009         }
1010       gsi_next (&gsi_from);
1011       gsi_next (&gsi_to);
1012     }
1013
1014   gphi *from_phi = get_virtual_phi (from->dest);
1015   gphi *to_phi = get_virtual_phi (to->dest);
1016   if (from_phi)
1017     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1018                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1019 }
1020
1021
1022 /* Given LOOP this function generates a new copy of it and puts it
1023    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1024    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1025    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1026    entry or exit of LOOP.  */
1027
1028 class loop *
1029 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1030                                         class loop *scalar_loop, edge e)
1031 {
1032   class loop *new_loop;
1033   basic_block *new_bbs, *bbs, *pbbs;
1034   bool at_exit;
1035   bool was_imm_dom;
1036   basic_block exit_dest;
1037   edge exit, new_exit;
1038   bool duplicate_outer_loop = false;
1039
1040   exit = single_exit (loop);
1041   at_exit = (e == exit);
1042   if (!at_exit && e != loop_preheader_edge (loop))
1043     return NULL;
1044
1045   if (scalar_loop == NULL)
1046     scalar_loop = loop;
1047
1048   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1049   pbbs = bbs + 1;
1050   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1051   /* Allow duplication of outer loops.  */
1052   if (scalar_loop->inner)
1053     duplicate_outer_loop = true;
1054   /* Check whether duplication is possible.  */
1055   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1056     {
1057       free (bbs);
1058       return NULL;
1059     }
1060
1061   /* Generate new loop structure.  */
1062   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1063   duplicate_subloops (scalar_loop, new_loop);
1064
1065   exit_dest = exit->dest;
1066   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1067                                           exit_dest) == loop->header ?
1068                  true : false);
1069
1070   /* Also copy the pre-header, this avoids jumping through hoops to
1071      duplicate the loop entry PHI arguments.  Create an empty
1072      pre-header unconditionally for this.  */
1073   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1074   edge entry_e = single_pred_edge (preheader);
1075   bbs[0] = preheader;
1076   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1077
1078   exit = single_exit (scalar_loop);
1079   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1080             &exit, 1, &new_exit, NULL,
1081             at_exit ? loop->latch : e->src, true);
1082   exit = single_exit (loop);
1083   basic_block new_preheader = new_bbs[0];
1084
1085   /* Before installing PHI arguments make sure that the edges
1086      into them match that of the scalar loop we analyzed.  This
1087      makes sure the SLP tree matches up between the main vectorized
1088      loop and the epilogue vectorized copies.  */
1089   if (single_succ_edge (preheader)->dest_idx
1090       != single_succ_edge (new_bbs[0])->dest_idx)
1091     {
1092       basic_block swap_bb = new_bbs[1];
1093       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1094       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1095       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1096       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1097     }
1098   if (duplicate_outer_loop)
1099     {
1100       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1101       if (loop_preheader_edge (scalar_loop)->dest_idx
1102           != loop_preheader_edge (new_inner_loop)->dest_idx)
1103         {
1104           basic_block swap_bb = new_inner_loop->header;
1105           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1106           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1107           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1108           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1109         }
1110     }
1111
1112   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1113
1114   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1115   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1116     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1117
1118   if (scalar_loop != loop)
1119     {
1120       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1121          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1122          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1123          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1124          header) to have current_def set, so copy them over.  */
1125       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1126                                                 exit);
1127       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1128                                                            0),
1129                                                 EDGE_SUCC (loop->latch, 0));
1130     }
1131
1132   if (at_exit) /* Add the loop copy at exit.  */
1133     {
1134       if (scalar_loop != loop)
1135         {
1136           gphi_iterator gsi;
1137           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1138
1139           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1140                gsi_next (&gsi))
1141             {
1142               gphi *phi = gsi.phi ();
1143               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1144               location_t orig_locus
1145                 = gimple_phi_arg_location_from_edge (phi, e);
1146
1147               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1148             }
1149         }
1150       redirect_edge_and_branch_force (e, new_preheader);
1151       flush_pending_stmts (e);
1152       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1153       if (was_imm_dom || duplicate_outer_loop)
1154         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1155
1156       /* And remove the non-necessary forwarder again.  Keep the other
1157          one so we have a proper pre-header for the loop at the exit edge.  */
1158       redirect_edge_pred (single_succ_edge (preheader),
1159                           single_pred (preheader));
1160       delete_basic_block (preheader);
1161       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1162                                loop_preheader_edge (scalar_loop)->src);
1163     }
1164   else /* Add the copy at entry.  */
1165     {
1166       if (scalar_loop != loop)
1167         {
1168           /* Remove the non-necessary forwarder of scalar_loop again.  */
1169           redirect_edge_pred (single_succ_edge (preheader),
1170                               single_pred (preheader));
1171           delete_basic_block (preheader);
1172           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1173                                    loop_preheader_edge (scalar_loop)->src);
1174           preheader = split_edge (loop_preheader_edge (loop));
1175           entry_e = single_pred_edge (preheader);
1176         }
1177
1178       redirect_edge_and_branch_force (entry_e, new_preheader);
1179       flush_pending_stmts (entry_e);
1180       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1181
1182       redirect_edge_and_branch_force (new_exit, preheader);
1183       flush_pending_stmts (new_exit);
1184       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1185
1186       /* And remove the non-necessary forwarder again.  Keep the other
1187          one so we have a proper pre-header for the loop at the exit edge.  */
1188       redirect_edge_pred (single_succ_edge (new_preheader),
1189                           single_pred (new_preheader));
1190       delete_basic_block (new_preheader);
1191       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1192                                loop_preheader_edge (new_loop)->src);
1193     }
1194
1195   if (scalar_loop != loop)
1196     {
1197       /* Update new_loop->header PHIs, so that on the preheader
1198          edge they are the ones from loop rather than scalar_loop.  */
1199       gphi_iterator gsi_orig, gsi_new;
1200       edge orig_e = loop_preheader_edge (loop);
1201       edge new_e = loop_preheader_edge (new_loop);
1202
1203       for (gsi_orig = gsi_start_phis (loop->header),
1204            gsi_new = gsi_start_phis (new_loop->header);
1205            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1206            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1207         {
1208           gphi *orig_phi = gsi_orig.phi ();
1209           gphi *new_phi = gsi_new.phi ();
1210           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1211           location_t orig_locus
1212             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1213
1214           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1215         }
1216     }
1217
1218   free (new_bbs);
1219   free (bbs);
1220
1221   checking_verify_dominators (CDI_DOMINATORS);
1222
1223   return new_loop;
1224 }
1225
1226
1227 /* Given the condition expression COND, put it as the last statement of
1228    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1229    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1230    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1231    new edge as irreducible if IRREDUCIBLE_P is true.  */
1232
1233 static edge
1234 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1235                        basic_block guard_to, basic_block dom_bb,
1236                        profile_probability probability, bool irreducible_p)
1237 {
1238   gimple_stmt_iterator gsi;
1239   edge new_e, enter_e;
1240   gcond *cond_stmt;
1241   gimple_seq gimplify_stmt_list = NULL;
1242
1243   enter_e = EDGE_SUCC (guard_bb, 0);
1244   enter_e->flags &= ~EDGE_FALLTHRU;
1245   enter_e->flags |= EDGE_FALSE_VALUE;
1246   gsi = gsi_last_bb (guard_bb);
1247
1248   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1249                                  NULL_TREE);
1250   if (gimplify_stmt_list)
1251     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1252
1253   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1254   gsi = gsi_last_bb (guard_bb);
1255   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1256
1257   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1258   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1259
1260   new_e->probability = probability;
1261   if (irreducible_p)
1262     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1263
1264   enter_e->probability = probability.invert ();
1265   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1266
1267   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1268   if (enter_e->dest->loop_father->header == enter_e->dest)
1269     split_edge (enter_e);
1270
1271   return new_e;
1272 }
1273
1274
1275 /* This function verifies that the following restrictions apply to LOOP:
1276    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1277        for innermost loop and 5 basic blocks for outer-loop.
1278    (2) it is single entry, single exit
1279    (3) its exit condition is the last stmt in the header
1280    (4) E is the entry/exit edge of LOOP.
1281  */
1282
1283 bool
1284 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1285 {
1286   edge exit_e = single_exit (loop);
1287   edge entry_e = loop_preheader_edge (loop);
1288   gcond *orig_cond = get_loop_exit_condition (loop);
1289   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1290   unsigned int num_bb = loop->inner? 5 : 2;
1291
1292   /* All loops have an outer scope; the only case loop->outer is NULL is for
1293      the function itself.  */
1294   if (!loop_outer (loop)
1295       || loop->num_nodes != num_bb
1296       || !empty_block_p (loop->latch)
1297       || !single_exit (loop)
1298       /* Verify that new loop exit condition can be trivially modified.  */
1299       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1300       || (e != exit_e && e != entry_e))
1301     return false;
1302
1303   return true;
1304 }
1305
1306 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1307    in the exit bb and rename all the uses after the loop.  This simplifies
1308    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1309    (but normally loop closed SSA form doesn't require virtual PHIs to be
1310    in the same form).  Doing this early simplifies the checking what
1311    uses should be renamed.
1312
1313    If we create a new phi after the loop, return the definition that
1314    applies on entry to the loop, otherwise return null.  */
1315
1316 static tree
1317 create_lcssa_for_virtual_phi (class loop *loop)
1318 {
1319   gphi_iterator gsi;
1320   edge exit_e = single_exit (loop);
1321
1322   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1323     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1324       {
1325         gphi *phi = gsi.phi ();
1326         for (gsi = gsi_start_phis (exit_e->dest);
1327              !gsi_end_p (gsi); gsi_next (&gsi))
1328           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1329             break;
1330         if (gsi_end_p (gsi))
1331           {
1332             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1333             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1334             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1335             imm_use_iterator imm_iter;
1336             gimple *stmt;
1337             use_operand_p use_p;
1338
1339             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1340               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1341             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1342             gimple_phi_set_result (new_phi, new_vop);
1343             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1344               if (stmt != new_phi
1345                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1346                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1347                   SET_USE (use_p, new_vop);
1348
1349             return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1350           }
1351         break;
1352       }
1353   return NULL_TREE;
1354 }
1355
1356 /* Function vect_get_loop_location.
1357
1358    Extract the location of the loop in the source code.
1359    If the loop is not well formed for vectorization, an estimated
1360    location is calculated.
1361    Return the loop location if succeed and NULL if not.  */
1362
1363 dump_user_location_t
1364 find_loop_location (class loop *loop)
1365 {
1366   gimple *stmt = NULL;
1367   basic_block bb;
1368   gimple_stmt_iterator si;
1369
1370   if (!loop)
1371     return dump_user_location_t ();
1372
1373   stmt = get_loop_exit_condition (loop);
1374
1375   if (stmt
1376       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1377     return stmt;
1378
1379   /* If we got here the loop is probably not "well formed",
1380      try to estimate the loop location */
1381
1382   if (!loop->header)
1383     return dump_user_location_t ();
1384
1385   bb = loop->header;
1386
1387   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1388     {
1389       stmt = gsi_stmt (si);
1390       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1391         return stmt;
1392     }
1393
1394   return dump_user_location_t ();
1395 }
1396
1397 /* Return true if the phi described by STMT_INFO defines an IV of the
1398    loop to be vectorized.  */
1399
1400 static bool
1401 iv_phi_p (stmt_vec_info stmt_info)
1402 {
1403   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1404   if (virtual_operand_p (PHI_RESULT (phi)))
1405     return false;
1406
1407   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1408       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1409     return false;
1410
1411   return true;
1412 }
1413
1414 /* Function vect_can_advance_ivs_p
1415
1416    In case the number of iterations that LOOP iterates is unknown at compile
1417    time, an epilog loop will be generated, and the loop induction variables
1418    (IVs) will be "advanced" to the value they are supposed to take just before
1419    the epilog loop.  Here we check that the access function of the loop IVs
1420    and the expression that represents the loop bound are simple enough.
1421    These restrictions will be relaxed in the future.  */
1422
1423 bool
1424 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1425 {
1426   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1427   basic_block bb = loop->header;
1428   gphi_iterator gsi;
1429
1430   /* Analyze phi functions of the loop header.  */
1431
1432   if (dump_enabled_p ())
1433     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1434   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1435     {
1436       tree evolution_part;
1437
1438       gphi *phi = gsi.phi ();
1439       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1440       if (dump_enabled_p ())
1441         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1442                          phi_info->stmt);
1443
1444       /* Skip virtual phi's. The data dependences that are associated with
1445          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1446
1447          Skip reduction phis.  */
1448       if (!iv_phi_p (phi_info))
1449         {
1450           if (dump_enabled_p ())
1451             dump_printf_loc (MSG_NOTE, vect_location,
1452                              "reduc or virtual phi. skip.\n");
1453           continue;
1454         }
1455
1456       /* Analyze the evolution function.  */
1457
1458       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1459       if (evolution_part == NULL_TREE)
1460         {
1461           if (dump_enabled_p ())
1462             dump_printf (MSG_MISSED_OPTIMIZATION,
1463                          "No access function or evolution.\n");
1464           return false;
1465         }
1466
1467       /* FORNOW: We do not transform initial conditions of IVs
1468          which evolution functions are not invariants in the loop.  */
1469
1470       if (!expr_invariant_in_loop_p (loop, evolution_part))
1471         {
1472           if (dump_enabled_p ())
1473             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474                              "evolution not invariant in loop.\n");
1475           return false;
1476         }
1477
1478       /* FORNOW: We do not transform initial conditions of IVs
1479          which evolution functions are a polynomial of degree >= 2.  */
1480
1481       if (tree_is_chrec (evolution_part))
1482         {
1483           if (dump_enabled_p ())
1484             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485                              "evolution is chrec.\n");
1486           return false;
1487         }
1488     }
1489
1490   return true;
1491 }
1492
1493
1494 /*   Function vect_update_ivs_after_vectorizer.
1495
1496      "Advance" the induction variables of LOOP to the value they should take
1497      after the execution of LOOP.  This is currently necessary because the
1498      vectorizer does not handle induction variables that are used after the
1499      loop.  Such a situation occurs when the last iterations of LOOP are
1500      peeled, because:
1501      1. We introduced new uses after LOOP for IVs that were not originally used
1502         after LOOP: the IVs of LOOP are now used by an epilog loop.
1503      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1504         times, whereas the loop IVs should be bumped N times.
1505
1506      Input:
1507      - LOOP - a loop that is going to be vectorized. The last few iterations
1508               of LOOP were peeled.
1509      - NITERS - the number of iterations that LOOP executes (before it is
1510                 vectorized). i.e, the number of times the ivs should be bumped.
1511      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1512                   coming out from LOOP on which there are uses of the LOOP ivs
1513                   (this is the path from LOOP->exit to epilog_loop->preheader).
1514
1515                   The new definitions of the ivs are placed in LOOP->exit.
1516                   The phi args associated with the edge UPDATE_E in the bb
1517                   UPDATE_E->dest are updated accordingly.
1518
1519      Assumption 1: Like the rest of the vectorizer, this function assumes
1520      a single loop exit that has a single predecessor.
1521
1522      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1523      organized in the same order.
1524
1525      Assumption 3: The access function of the ivs is simple enough (see
1526      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1527
1528      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1529      coming out of LOOP on which the ivs of LOOP are used (this is the path
1530      that leads to the epilog loop; other paths skip the epilog loop).  This
1531      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1532      needs to have its phis updated.
1533  */
1534
1535 static void
1536 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1537                                   tree niters, edge update_e)
1538 {
1539   gphi_iterator gsi, gsi1;
1540   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1541   basic_block update_bb = update_e->dest;
1542   basic_block exit_bb = single_exit (loop)->dest;
1543
1544   /* Make sure there exists a single-predecessor exit bb:  */
1545   gcc_assert (single_pred_p (exit_bb));
1546   gcc_assert (single_succ_edge (exit_bb) == update_e);
1547
1548   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1549        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1550        gsi_next (&gsi), gsi_next (&gsi1))
1551     {
1552       tree init_expr;
1553       tree step_expr, off;
1554       tree type;
1555       tree var, ni, ni_name;
1556       gimple_stmt_iterator last_gsi;
1557
1558       gphi *phi = gsi.phi ();
1559       gphi *phi1 = gsi1.phi ();
1560       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1561       if (dump_enabled_p ())
1562         dump_printf_loc (MSG_NOTE, vect_location,
1563                          "vect_update_ivs_after_vectorizer: phi: %G", phi);
1564
1565       /* Skip reduction and virtual phis.  */
1566       if (!iv_phi_p (phi_info))
1567         {
1568           if (dump_enabled_p ())
1569             dump_printf_loc (MSG_NOTE, vect_location,
1570                              "reduc or virtual phi. skip.\n");
1571           continue;
1572         }
1573
1574       type = TREE_TYPE (gimple_phi_result (phi));
1575       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1576       step_expr = unshare_expr (step_expr);
1577
1578       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1579          of degree >= 2 or exponential.  */
1580       gcc_assert (!tree_is_chrec (step_expr));
1581
1582       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1583
1584       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1585                          fold_convert (TREE_TYPE (step_expr), niters),
1586                          step_expr);
1587       if (POINTER_TYPE_P (type))
1588         ni = fold_build_pointer_plus (init_expr, off);
1589       else
1590         ni = fold_build2 (PLUS_EXPR, type,
1591                           init_expr, fold_convert (type, off));
1592
1593       var = create_tmp_var (type, "tmp");
1594
1595       last_gsi = gsi_last_bb (exit_bb);
1596       gimple_seq new_stmts = NULL;
1597       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1598       /* Exit_bb shouldn't be empty.  */
1599       if (!gsi_end_p (last_gsi))
1600         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1601       else
1602         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1603
1604       /* Fix phi expressions in the successor bb.  */
1605       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1606     }
1607 }
1608
1609 /* Return a gimple value containing the misalignment (measured in vector
1610    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1611    it is away from a perfectly aligned address.  Add any new statements
1612    to SEQ.  */
1613
1614 static tree
1615 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1616 {
1617   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1618   stmt_vec_info stmt_info = dr_info->stmt;
1619   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1620
1621   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1622   unsigned HOST_WIDE_INT target_align_c;
1623   tree target_align_minus_1;
1624
1625   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1626                                         size_zero_node) < 0;
1627   tree offset = (negative
1628                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1629                              * TREE_INT_CST_LOW
1630                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1631                  : size_zero_node);
1632   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1633                                                           stmt_info, seq,
1634                                                           offset);
1635   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1636   if (target_align.is_constant (&target_align_c))
1637     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1638   else
1639     {
1640       tree vla = build_int_cst (type, target_align);
1641       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1642                                     fold_build2 (MINUS_EXPR, type,
1643                                                  build_int_cst (type, 0), vla));
1644       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1645                                           build_int_cst (type, 1));
1646     }
1647
1648   HOST_WIDE_INT elem_size
1649     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1650   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1651
1652   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1653   tree int_start_addr = fold_convert (type, start_addr);
1654   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1655                                         target_align_minus_1);
1656
1657   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1658   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1659                                         elem_size_log);
1660
1661   return misalign_in_elems;
1662 }
1663
1664 /* Function vect_gen_prolog_loop_niters
1665
1666    Generate the number of iterations which should be peeled as prolog for the
1667    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1668    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1669    As a result, after the execution of this loop, the data reference DR will
1670    refer to an aligned location.  The following computation is generated:
1671
1672    If the misalignment of DR is known at compile time:
1673      addr_mis = int mis = DR_MISALIGNMENT (dr);
1674    Else, compute address misalignment in bytes:
1675      addr_mis = addr & (target_align - 1)
1676
1677    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1678
1679    (elem_size = element type size; an element is the scalar element whose type
1680    is the inner type of the vectype)
1681
1682    The computations will be emitted at the end of BB.  We also compute and
1683    store upper bound (included) of the result in BOUND.
1684
1685    When the step of the data-ref in the loop is not 1 (as in interleaved data
1686    and SLP), the number of iterations of the prolog must be divided by the step
1687    (which is equal to the size of interleaved group).
1688
1689    The above formulas assume that VF == number of elements in the vector. This
1690    may not hold when there are multiple-types in the loop.
1691    In this case, for some data-references in the loop the VF does not represent
1692    the number of elements that fit in the vector.  Therefore, instead of VF we
1693    use TYPE_VECTOR_SUBPARTS.  */
1694
1695 static tree
1696 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1697                              basic_block bb, int *bound)
1698 {
1699   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1700   tree var;
1701   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1702   gimple_seq stmts = NULL, new_stmts = NULL;
1703   tree iters, iters_name;
1704   stmt_vec_info stmt_info = dr_info->stmt;
1705   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1706   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1707
1708   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1709     {
1710       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1711
1712       if (dump_enabled_p ())
1713         dump_printf_loc (MSG_NOTE, vect_location,
1714                          "known peeling = %d.\n", npeel);
1715
1716       iters = build_int_cst (niters_type, npeel);
1717       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1718     }
1719   else
1720     {
1721       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1722       tree type = TREE_TYPE (misalign_in_elems);
1723       HOST_WIDE_INT elem_size
1724         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1725       /* We only do prolog peeling if the target alignment is known at compile
1726          time.  */
1727       poly_uint64 align_in_elems =
1728         exact_div (target_align, elem_size);
1729       tree align_in_elems_minus_1 =
1730         build_int_cst (type, align_in_elems - 1);
1731       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1732
1733       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1734                                  & (align_in_elems - 1)).  */
1735       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1736                                             size_zero_node) < 0;
1737       if (negative)
1738         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1739                              align_in_elems_tree);
1740       else
1741         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1742                              misalign_in_elems);
1743       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1744       iters = fold_convert (niters_type, iters);
1745       unsigned HOST_WIDE_INT align_in_elems_c;
1746       if (align_in_elems.is_constant (&align_in_elems_c))
1747         *bound = align_in_elems_c - 1;
1748       else
1749         *bound = -1;
1750     }
1751
1752   if (dump_enabled_p ())
1753     dump_printf_loc (MSG_NOTE, vect_location,
1754                      "niters for prolog loop: %T\n", iters);
1755
1756   var = create_tmp_var (niters_type, "prolog_loop_niters");
1757   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1758
1759   if (new_stmts)
1760     gimple_seq_add_seq (&stmts, new_stmts);
1761   if (stmts)
1762     {
1763       gcc_assert (single_succ_p (bb));
1764       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1765       if (gsi_end_p (gsi))
1766         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1767       else
1768         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1769     }
1770   return iters_name;
1771 }
1772
1773
1774 /* Function vect_update_init_of_dr
1775
1776    If CODE is PLUS, the vector loop starts NITERS iterations after the
1777    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1778    iterations before the scalar one (using masking to skip inactive
1779    elements).  This function updates the information recorded in DR to
1780    account for the difference.  Specifically, it updates the OFFSET
1781    field of DR_INFO.  */
1782
1783 static void
1784 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1785 {
1786   struct data_reference *dr = dr_info->dr;
1787   tree offset = dr_info->offset;
1788   if (!offset)
1789     offset = build_zero_cst (sizetype);
1790
1791   niters = fold_build2 (MULT_EXPR, sizetype,
1792                         fold_convert (sizetype, niters),
1793                         fold_convert (sizetype, DR_STEP (dr)));
1794   offset = fold_build2 (code, sizetype,
1795                         fold_convert (sizetype, offset), niters);
1796   dr_info->offset = offset;
1797 }
1798
1799
1800 /* Function vect_update_inits_of_drs
1801
1802    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1803    CODE and NITERS are as for vect_update_inits_of_dr.  */
1804
1805 void
1806 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1807                           tree_code code)
1808 {
1809   unsigned int i;
1810   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1811   struct data_reference *dr;
1812
1813   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1814
1815   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1816      here, but since we might use these niters to update the epilogues niters
1817      and data references we can't insert them here as this definition might not
1818      always dominate its uses.  */
1819   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1820     niters = fold_convert (sizetype, niters);
1821
1822   FOR_EACH_VEC_ELT (datarefs, i, dr)
1823     {
1824       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1825       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1826           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1827         vect_update_init_of_dr (dr_info, niters, code);
1828     }
1829 }
1830
1831 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1832    by masking.  This involves calculating the number of iterations to
1833    be peeled and then aligning all memory references appropriately.  */
1834
1835 void
1836 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1837 {
1838   tree misalign_in_elems;
1839   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1840
1841   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1842
1843   /* From the information recorded in LOOP_VINFO get the number of iterations
1844      that need to be skipped via masking.  */
1845   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1846     {
1847       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1848                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1849       misalign_in_elems = build_int_cst (type, misalign);
1850     }
1851   else
1852     {
1853       gimple_seq seq1 = NULL, seq2 = NULL;
1854       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1855       misalign_in_elems = fold_convert (type, misalign_in_elems);
1856       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1857                                                 &seq2, true, NULL_TREE);
1858       gimple_seq_add_seq (&seq1, seq2);
1859       if (seq1)
1860         {
1861           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1862           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1863           gcc_assert (!new_bb);
1864         }
1865     }
1866
1867   if (dump_enabled_p ())
1868     dump_printf_loc (MSG_NOTE, vect_location,
1869                      "misalignment for fully-masked loop: %T\n",
1870                      misalign_in_elems);
1871
1872   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1873
1874   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1875 }
1876
1877 /* This function builds ni_name = number of iterations.  Statements
1878    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1879    it to TRUE if new ssa_var is generated.  */
1880
1881 tree
1882 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1883 {
1884   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1885   if (TREE_CODE (ni) == INTEGER_CST)
1886     return ni;
1887   else
1888     {
1889       tree ni_name, var;
1890       gimple_seq stmts = NULL;
1891       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1892
1893       var = create_tmp_var (TREE_TYPE (ni), "niters");
1894       ni_name = force_gimple_operand (ni, &stmts, false, var);
1895       if (stmts)
1896         {
1897           gsi_insert_seq_on_edge_immediate (pe, stmts);
1898           if (new_var_p != NULL)
1899             *new_var_p = true;
1900         }
1901
1902       return ni_name;
1903     }
1904 }
1905
1906 /* Calculate the number of iterations above which vectorized loop will be
1907    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1908    of prolog loop.  If it's integer const, the integer number is also passed
1909    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1910    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1911    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1912    threshold below which the scalar (rather than vectorized) loop will be
1913    executed.  This function stores the upper bound (inclusive) of the result
1914    in BOUND_SCALAR.  */
1915
1916 static tree
1917 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1918                              int bound_prolog, poly_int64 bound_epilog, int th,
1919                              poly_uint64 *bound_scalar,
1920                              bool check_profitability)
1921 {
1922   tree type = TREE_TYPE (niters_prolog);
1923   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1924                              build_int_cst (type, bound_epilog));
1925
1926   *bound_scalar = bound_prolog + bound_epilog;
1927   if (check_profitability)
1928     {
1929       /* TH indicates the minimum niters of vectorized loop, while we
1930          compute the maximum niters of scalar loop.  */
1931       th--;
1932       /* Peeling for constant times.  */
1933       if (int_niters_prolog >= 0)
1934         {
1935           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1936           return build_int_cst (type, *bound_scalar);
1937         }
1938       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1939          and BOUND_EPILOG are inclusive upper bounds.  */
1940       if (known_ge (th, bound_prolog + bound_epilog))
1941         {
1942           *bound_scalar = th;
1943           return build_int_cst (type, th);
1944         }
1945       /* Need to do runtime comparison.  */
1946       else if (maybe_gt (th, bound_epilog))
1947         {
1948           *bound_scalar = upper_bound (*bound_scalar, th);
1949           return fold_build2 (MAX_EXPR, type,
1950                               build_int_cst (type, th), niters);
1951         }
1952     }
1953   return niters;
1954 }
1955
1956 /* NITERS is the number of times that the original scalar loop executes
1957    after peeling.  Work out the maximum number of iterations N that can
1958    be handled by the vectorized form of the loop and then either:
1959
1960    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1961
1962         niters_vector = N
1963
1964    b) set *STEP_VECTOR_PTR to one and generate:
1965
1966         niters_vector = N / vf
1967
1968    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1969    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1970    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1971
1972 void
1973 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1974                              tree *niters_vector_ptr, tree *step_vector_ptr,
1975                              bool niters_no_overflow)
1976 {
1977   tree ni_minus_gap, var;
1978   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1979   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1980   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1981   tree log_vf = NULL_TREE;
1982
1983   /* If epilogue loop is required because of data accesses with gaps, we
1984      subtract one iteration from the total number of iterations here for
1985      correct calculation of RATIO.  */
1986   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1987     {
1988       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1989                                   build_one_cst (type));
1990       if (!is_gimple_val (ni_minus_gap))
1991         {
1992           var = create_tmp_var (type, "ni_gap");
1993           gimple *stmts = NULL;
1994           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1995                                                true, var);
1996           gsi_insert_seq_on_edge_immediate (pe, stmts);
1997         }
1998     }
1999   else
2000     ni_minus_gap = niters;
2001
2002   unsigned HOST_WIDE_INT const_vf;
2003   if (vf.is_constant (&const_vf)
2004       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2005     {
2006       /* Create: niters >> log2(vf) */
2007       /* If it's known that niters == number of latch executions + 1 doesn't
2008          overflow, we can generate niters >> log2(vf); otherwise we generate
2009          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2010          will be at least one.  */
2011       log_vf = build_int_cst (type, exact_log2 (const_vf));
2012       if (niters_no_overflow)
2013         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2014       else
2015         niters_vector
2016           = fold_build2 (PLUS_EXPR, type,
2017                          fold_build2 (RSHIFT_EXPR, type,
2018                                       fold_build2 (MINUS_EXPR, type,
2019                                                    ni_minus_gap,
2020                                                    build_int_cst (type, vf)),
2021                                       log_vf),
2022                          build_int_cst (type, 1));
2023       step_vector = build_one_cst (type);
2024     }
2025   else
2026     {
2027       niters_vector = ni_minus_gap;
2028       step_vector = build_int_cst (type, vf);
2029     }
2030
2031   if (!is_gimple_val (niters_vector))
2032     {
2033       var = create_tmp_var (type, "bnd");
2034       gimple_seq stmts = NULL;
2035       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2036       gsi_insert_seq_on_edge_immediate (pe, stmts);
2037       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2038          we set range information to make niters analyzer's life easier.
2039          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2040          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2041       if (stmts != NULL && log_vf)
2042         {
2043           if (niters_no_overflow)
2044             set_range_info (niters_vector, VR_RANGE,
2045                             wi::one (TYPE_PRECISION (type)),
2046                             wi::rshift (wi::max_value (TYPE_PRECISION (type),
2047                                                        TYPE_SIGN (type)),
2048                                         exact_log2 (const_vf),
2049                                         TYPE_SIGN (type)));
2050           /* For VF == 1 the vector IV might also overflow so we cannot
2051              assert a minimum value of 1.  */
2052           else if (const_vf > 1)
2053             set_range_info (niters_vector, VR_RANGE,
2054                             wi::one (TYPE_PRECISION (type)),
2055                             wi::rshift (wi::max_value (TYPE_PRECISION (type),
2056                                                        TYPE_SIGN (type))
2057                                         - (const_vf - 1),
2058                                         exact_log2 (const_vf), TYPE_SIGN (type))
2059                             + 1);
2060         }
2061     }
2062   *niters_vector_ptr = niters_vector;
2063   *step_vector_ptr = step_vector;
2064
2065   return;
2066 }
2067
2068 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2069    loop specified by LOOP_VINFO after vectorization, compute the number
2070    of iterations before vectorization (niters_vector * vf) and store it
2071    to NITERS_VECTOR_MULT_VF_PTR.  */
2072
2073 static void
2074 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2075                                      tree niters_vector,
2076                                      tree *niters_vector_mult_vf_ptr)
2077 {
2078   /* We should be using a step_vector of VF if VF is variable.  */
2079   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2080   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081   tree type = TREE_TYPE (niters_vector);
2082   tree log_vf = build_int_cst (type, exact_log2 (vf));
2083   basic_block exit_bb = single_exit (loop)->dest;
2084
2085   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2086   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2087                                             niters_vector, log_vf);
2088   if (!is_gimple_val (niters_vector_mult_vf))
2089     {
2090       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2091       gimple_seq stmts = NULL;
2092       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2093                                                     &stmts, true, var);
2094       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2095       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2096     }
2097   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2098 }
2099
2100 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2101    this function searches for the corresponding lcssa phi node in exit
2102    bb of LOOP.  If it is found, return the phi result; otherwise return
2103    NULL.  */
2104
2105 static tree
2106 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2107                 gphi *lcssa_phi)
2108 {
2109   gphi_iterator gsi;
2110   edge e = single_exit (loop);
2111
2112   gcc_assert (single_pred_p (e->dest));
2113   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2114     {
2115       gphi *phi = gsi.phi ();
2116       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2117                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2118         return PHI_RESULT (phi);
2119     }
2120   return NULL_TREE;
2121 }
2122
2123 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2124    from SECOND/FIRST and puts it at the original loop's preheader/exit
2125    edge, the two loops are arranged as below:
2126
2127        preheader_a:
2128      first_loop:
2129        header_a:
2130          i_1 = PHI<i_0, i_2>;
2131          ...
2132          i_2 = i_1 + 1;
2133          if (cond_a)
2134            goto latch_a;
2135          else
2136            goto between_bb;
2137        latch_a:
2138          goto header_a;
2139
2140        between_bb:
2141          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2142
2143      second_loop:
2144        header_b:
2145          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2146                                  or with i_2 if no LCSSA phi is created
2147                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2148          ...
2149          i_4 = i_3 + 1;
2150          if (cond_b)
2151            goto latch_b;
2152          else
2153            goto exit_bb;
2154        latch_b:
2155          goto header_b;
2156
2157        exit_bb:
2158
2159    This function creates loop closed SSA for the first loop; update the
2160    second loop's PHI nodes by replacing argument on incoming edge with the
2161    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2162    is false, Loop closed ssa phis will only be created for non-iv phis for
2163    the first loop.
2164
2165    This function assumes exit bb of the first loop is preheader bb of the
2166    second loop, i.e, between_bb in the example code.  With PHIs updated,
2167    the second loop will execute rest iterations of the first.  */
2168
2169 static void
2170 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2171                                    class loop *first, class loop *second,
2172                                    bool create_lcssa_for_iv_phis)
2173 {
2174   gphi_iterator gsi_update, gsi_orig;
2175   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2176
2177   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2178   edge second_preheader_e = loop_preheader_edge (second);
2179   basic_block between_bb = single_exit (first)->dest;
2180
2181   gcc_assert (between_bb == second_preheader_e->src);
2182   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2183   /* Either the first loop or the second is the loop to be vectorized.  */
2184   gcc_assert (loop == first || loop == second);
2185
2186   for (gsi_orig = gsi_start_phis (first->header),
2187        gsi_update = gsi_start_phis (second->header);
2188        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2189        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2190     {
2191       gphi *orig_phi = gsi_orig.phi ();
2192       gphi *update_phi = gsi_update.phi ();
2193
2194       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2195       /* Generate lcssa PHI node for the first loop.  */
2196       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2197       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2198       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2199         {
2200           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2201           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2202           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2203           arg = new_res;
2204         }
2205
2206       /* Update PHI node in the second loop by replacing arg on the loop's
2207          incoming edge.  */
2208       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2209     }
2210
2211   /* For epilogue peeling we have to make sure to copy all LC PHIs
2212      for correct vectorization of live stmts.  */
2213   if (loop == first)
2214     {
2215       basic_block orig_exit = single_exit (second)->dest;
2216       for (gsi_orig = gsi_start_phis (orig_exit);
2217            !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2218         {
2219           gphi *orig_phi = gsi_orig.phi ();
2220           tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2221           if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2222             continue;
2223
2224           /* Already created in the above loop.   */
2225           if (find_guard_arg (first, second, orig_phi))
2226             continue;
2227
2228           tree new_res = copy_ssa_name (orig_arg);
2229           gphi *lcphi = create_phi_node (new_res, between_bb);
2230           add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2231         }
2232     }
2233 }
2234
2235 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2236    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2237    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2238    appear like below:
2239
2240        guard_bb:
2241          if (cond)
2242            goto merge_bb;
2243          else
2244            goto skip_loop;
2245
2246      skip_loop:
2247        header_a:
2248          i_1 = PHI<i_0, i_2>;
2249          ...
2250          i_2 = i_1 + 1;
2251          if (cond_a)
2252            goto latch_a;
2253          else
2254            goto exit_a;
2255        latch_a:
2256          goto header_a;
2257
2258        exit_a:
2259          i_5 = PHI<i_2>;
2260
2261        merge_bb:
2262          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2263
2264      update_loop:
2265        header_b:
2266          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2267          ...
2268          i_4 = i_3 + 1;
2269          if (cond_b)
2270            goto latch_b;
2271          else
2272            goto exit_bb;
2273        latch_b:
2274          goto header_b;
2275
2276        exit_bb:
2277
2278    This function creates PHI nodes at merge_bb and replaces the use of i_5
2279    in the update_loop's PHI node with the result of new PHI result.  */
2280
2281 static void
2282 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2283                                     class loop *update_loop,
2284                                     edge guard_edge, edge merge_edge)
2285 {
2286   location_t merge_loc, guard_loc;
2287   edge orig_e = loop_preheader_edge (skip_loop);
2288   edge update_e = loop_preheader_edge (update_loop);
2289   gphi_iterator gsi_orig, gsi_update;
2290
2291   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2292         gsi_update = gsi_start_phis (update_loop->header));
2293        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2294        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2295     {
2296       gphi *orig_phi = gsi_orig.phi ();
2297       gphi *update_phi = gsi_update.phi ();
2298
2299       /* Generate new phi node at merge bb of the guard.  */
2300       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2301       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2302
2303       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2304          args in NEW_PHI for these edges.  */
2305       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2306       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2307       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2308       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2309       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2310       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2311
2312       /* Update phi in UPDATE_PHI.  */
2313       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2314     }
2315 }
2316
2317 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2318    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2319    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2320    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2321    The CFG looks like:
2322
2323      loop:
2324        header_a:
2325          i_1 = PHI<i_0, i_2>;
2326          ...
2327          i_2 = i_1 + 1;
2328          if (cond_a)
2329            goto latch_a;
2330          else
2331            goto exit_a;
2332        latch_a:
2333          goto header_a;
2334
2335        exit_a:
2336
2337        guard_bb:
2338          if (cond)
2339            goto merge_bb;
2340          else
2341            goto epilog_loop;
2342
2343        ;; fall_through_bb
2344
2345      epilog_loop:
2346        header_b:
2347          i_3 = PHI<i_2, i_4>;
2348          ...
2349          i_4 = i_3 + 1;
2350          if (cond_b)
2351            goto latch_b;
2352          else
2353            goto merge_bb;
2354        latch_b:
2355          goto header_b;
2356
2357        merge_bb:
2358          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2359
2360        exit_bb:
2361          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2362
2363    For each name used out side EPILOG (i.e - for each name that has a lcssa
2364    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2365    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2366    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2367    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2368    in exit_bb will also be updated.  */
2369
2370 static void
2371 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2372                                     edge guard_edge, edge merge_edge)
2373 {
2374   gphi_iterator gsi;
2375   basic_block merge_bb = guard_edge->dest;
2376
2377   gcc_assert (single_succ_p (merge_bb));
2378   edge e = single_succ_edge (merge_bb);
2379   basic_block exit_bb = e->dest;
2380   gcc_assert (single_pred_p (exit_bb));
2381   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2382
2383   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2384     {
2385       gphi *update_phi = gsi.phi ();
2386       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2387
2388       tree merge_arg = NULL_TREE;
2389
2390       /* If the old argument is a SSA_NAME use its current_def.  */
2391       if (TREE_CODE (old_arg) == SSA_NAME)
2392         merge_arg = get_current_def (old_arg);
2393       /* If it's a constant or doesn't have a current_def, just use the old
2394          argument.  */
2395       if (!merge_arg)
2396         merge_arg = old_arg;
2397
2398       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2399       /* If the var is live after loop but not a reduction, we simply
2400          use the old arg.  */
2401       if (!guard_arg)
2402         guard_arg = old_arg;
2403
2404       /* Create new phi node in MERGE_BB:  */
2405       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2406       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2407
2408       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2409          the two PHI args in merge_phi for these edges.  */
2410       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2411       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2412
2413       /* Update the original phi in exit_bb.  */
2414       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2415     }
2416 }
2417
2418 /* EPILOG loop is duplicated from the original loop for vectorizing,
2419    the arg of its loop closed ssa PHI needs to be updated.  */
2420
2421 static void
2422 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2423 {
2424   gphi_iterator gsi;
2425   basic_block exit_bb = single_exit (epilog)->dest;
2426
2427   gcc_assert (single_pred_p (exit_bb));
2428   edge e = EDGE_PRED (exit_bb, 0);
2429   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2430     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2431 }
2432
2433 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2434    iterate exactly CONST_NITERS times.  Make a final decision about
2435    whether the epilogue loop should be used, returning true if so.  */
2436
2437 static bool
2438 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2439                              unsigned HOST_WIDE_INT const_niters)
2440 {
2441   /* Avoid wrap-around when computing const_niters - 1.  Also reject
2442      using an epilogue loop for a single scalar iteration, even if
2443      we could in principle implement that using partial vectors.  */
2444   unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2445   if (const_niters <= gap_niters + 1)
2446     return false;
2447
2448   /* Install the number of iterations.  */
2449   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2450   tree niters_tree = build_int_cst (niters_type, const_niters);
2451   tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2452
2453   LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2454   LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2455
2456   /* Decide what to do if the number of epilogue iterations is not
2457      a multiple of the epilogue loop's vectorization factor.  */
2458   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2459 }
2460
2461 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2462    Return a value that equals:
2463
2464    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2465    - SKIP_VALUE when the main loop is skipped.  */
2466
2467 tree
2468 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2469                            tree skip_value)
2470 {
2471   gcc_assert (loop_vinfo->main_loop_edge);
2472
2473   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2474   basic_block bb = loop_vinfo->main_loop_edge->dest;
2475   gphi *new_phi = create_phi_node (phi_result, bb);
2476   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2477                UNKNOWN_LOCATION);
2478   add_phi_arg (new_phi, skip_value,
2479                loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2480   return phi_result;
2481 }
2482
2483 /* Function vect_do_peeling.
2484
2485    Input:
2486    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2487
2488        preheader:
2489      LOOP:
2490        header_bb:
2491          loop_body
2492          if (exit_loop_cond) goto exit_bb
2493          else                goto header_bb
2494        exit_bb:
2495
2496    - NITERS: The number of iterations of the loop.
2497    - NITERSM1: The number of iterations of the loop's latch.
2498    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2499    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2500                               CHECK_PROFITABILITY is true.
2501    Output:
2502    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2503      iterate after vectorization; see vect_set_loop_condition for details.
2504    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2505      should be set to the number of scalar iterations handled by the
2506      vector loop.  The SSA name is only used on exit from the loop.
2507
2508    This function peels prolog and epilog from the loop, adds guards skipping
2509    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2510    would look like:
2511
2512        guard_bb_1:
2513          if (prefer_scalar_loop) goto merge_bb_1
2514          else                    goto guard_bb_2
2515
2516        guard_bb_2:
2517          if (skip_prolog) goto merge_bb_2
2518          else             goto prolog_preheader
2519
2520        prolog_preheader:
2521      PROLOG:
2522        prolog_header_bb:
2523          prolog_body
2524          if (exit_prolog_cond) goto prolog_exit_bb
2525          else                  goto prolog_header_bb
2526        prolog_exit_bb:
2527
2528        merge_bb_2:
2529
2530        vector_preheader:
2531      VECTOR LOOP:
2532        vector_header_bb:
2533          vector_body
2534          if (exit_vector_cond) goto vector_exit_bb
2535          else                  goto vector_header_bb
2536        vector_exit_bb:
2537
2538        guard_bb_3:
2539          if (skip_epilog) goto merge_bb_3
2540          else             goto epilog_preheader
2541
2542        merge_bb_1:
2543
2544        epilog_preheader:
2545      EPILOG:
2546        epilog_header_bb:
2547          epilog_body
2548          if (exit_epilog_cond) goto merge_bb_3
2549          else                  goto epilog_header_bb
2550
2551        merge_bb_3:
2552
2553    Note this function peels prolog and epilog only if it's necessary,
2554    as well as guards.
2555    This function returns the epilogue loop if a decision was made to vectorize
2556    it, otherwise NULL.
2557
2558    The analysis resulting in this epilogue loop's loop_vec_info was performed
2559    in the same vect_analyze_loop call as the main loop's.  At that time
2560    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2561    vectorization factors than the main loop.  This list is stored in the main
2562    loop's loop_vec_info in the 'epilogue_vinfos' member.  Everytime we decide to
2563    vectorize the epilogue loop for a lower vectorization factor,  the
2564    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2565    updated and linked to the epilogue loop.  This is later used to vectorize
2566    the epilogue.  The reason the loop_vec_info needs updating is that it was
2567    constructed based on the original main loop, and the epilogue loop is a
2568    copy of this loop, so all links pointing to statements in the original loop
2569    need updating.  Furthermore, these loop_vec_infos share the
2570    data_reference's records, which will also need to be updated.
2571
2572    TODO: Guard for prefer_scalar_loop should be emitted along with
2573    versioning conditions if loop versioning is needed.  */
2574
2575
2576 class loop *
2577 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2578                  tree *niters_vector, tree *step_vector,
2579                  tree *niters_vector_mult_vf_var, int th,
2580                  bool check_profitability, bool niters_no_overflow,
2581                  tree *advance)
2582 {
2583   edge e, guard_e;
2584   tree type = TREE_TYPE (niters), guard_cond;
2585   basic_block guard_bb, guard_to;
2586   profile_probability prob_prolog, prob_vector, prob_epilog;
2587   int estimated_vf;
2588   int prolog_peeling = 0;
2589   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2590   bool vect_epilogues_updated_niters = false;
2591   /* We currently do not support prolog peeling if the target alignment is not
2592      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2593      target alignment being constant.  */
2594   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2595   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2596     return NULL;
2597
2598   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2599     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2600
2601   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2602   poly_uint64 bound_epilog = 0;
2603   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2604       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2605     bound_epilog += vf - 1;
2606   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2607     bound_epilog += 1;
2608   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2609   poly_uint64 bound_scalar = bound_epilog;
2610
2611   if (!prolog_peeling && !epilog_peeling)
2612     return NULL;
2613
2614   /* Before doing any peeling make sure to reset debug binds outside of
2615      the loop refering to defs not in LC SSA.  */
2616   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2617   for (unsigned i = 0; i < loop->num_nodes; ++i)
2618     {
2619       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2620       imm_use_iterator ui;
2621       gimple *use_stmt;
2622       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2623            gsi_next (&gsi))
2624         {
2625           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2626             if (gimple_debug_bind_p (use_stmt)
2627                 && loop != gimple_bb (use_stmt)->loop_father
2628                 && !flow_loop_nested_p (loop,
2629                                         gimple_bb (use_stmt)->loop_father))
2630               {
2631                 gimple_debug_bind_reset_value (use_stmt);
2632                 update_stmt (use_stmt);
2633               }
2634         }
2635       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2636            gsi_next (&gsi))
2637         {
2638           ssa_op_iter op_iter;
2639           def_operand_p def_p;
2640           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2641             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2642               if (gimple_debug_bind_p (use_stmt)
2643                   && loop != gimple_bb (use_stmt)->loop_father
2644                   && !flow_loop_nested_p (loop,
2645                                           gimple_bb (use_stmt)->loop_father))
2646                 {
2647                   gimple_debug_bind_reset_value (use_stmt);
2648                   update_stmt (use_stmt);
2649                 }
2650         }
2651     }
2652
2653   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2654   estimated_vf = vect_vf_for_cost (loop_vinfo);
2655   if (estimated_vf == 2)
2656     estimated_vf = 3;
2657   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2658                         .apply_scale (estimated_vf - 1, estimated_vf);
2659
2660   class loop *prolog, *epilog = NULL;
2661   class loop *first_loop = loop;
2662   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2663
2664   /* We might have a queued need to update virtual SSA form.  As we
2665      delete the update SSA machinery below after doing a regular
2666      incremental SSA update during loop copying make sure we don't
2667      lose that fact.
2668      ???  Needing to update virtual SSA form by renaming is unfortunate
2669      but not all of the vectorizer code inserting new loads / stores
2670      properly assigns virtual operands to those statements.  */
2671   update_ssa (TODO_update_ssa_only_virtuals);
2672
2673   create_lcssa_for_virtual_phi (loop);
2674
2675   /* If we're vectorizing an epilogue loop, the update_ssa above will
2676      have ensured that the virtual operand is in SSA form throughout the
2677      vectorized main loop.  Normally it is possible to trace the updated
2678      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2679      back to scalar-stmt vuses, meaning that the effect of the SSA update
2680      remains local to the main loop.  However, there are rare cases in
2681      which the vectorized loop has vdefs even when the original scalar
2682      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2683      introduces clobbers of the temporary vector array, which in turn
2684      needs new vdefs.  If the scalar loop doesn't write to memory, these
2685      new vdefs will be the only ones in the vector loop.
2686
2687      In that case, update_ssa will have added a new virtual phi to the
2688      main loop, which previously didn't need one.  Ensure that we (locally)
2689      maintain LCSSA form for the virtual operand, just as we would have
2690      done if the virtual phi had existed from the outset.  This makes it
2691      easier to duplicate the scalar epilogue loop below.  */
2692   tree vop_to_rename = NULL_TREE;
2693   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2694     {
2695       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2696       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2697     }
2698
2699   if (MAY_HAVE_DEBUG_BIND_STMTS)
2700     {
2701       gcc_assert (!adjust_vec.exists ());
2702       adjust_vec.create (32);
2703     }
2704   initialize_original_copy_tables ();
2705
2706   /* Record the anchor bb at which the guard should be placed if the scalar
2707      loop might be preferred.  */
2708   basic_block anchor = loop_preheader_edge (loop)->src;
2709
2710   /* Generate the number of iterations for the prolog loop.  We do this here
2711      so that we can also get the upper bound on the number of iterations.  */
2712   tree niters_prolog;
2713   int bound_prolog = 0;
2714   if (prolog_peeling)
2715     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2716                                                   &bound_prolog);
2717   else
2718     niters_prolog = build_int_cst (type, 0);
2719
2720   loop_vec_info epilogue_vinfo = NULL;
2721   if (vect_epilogues)
2722     {
2723       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2724       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2725     }
2726
2727   tree niters_vector_mult_vf = NULL_TREE;
2728   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2729   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2730   edge update_e = NULL, skip_e = NULL;
2731   unsigned int lowest_vf = constant_lower_bound (vf);
2732   /* If we know the number of scalar iterations for the main loop we should
2733      check whether after the main loop there are enough iterations left over
2734      for the epilogue.  */
2735   if (vect_epilogues
2736       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2737       && prolog_peeling >= 0
2738       && known_eq (vf, lowest_vf))
2739     {
2740       unsigned HOST_WIDE_INT eiters
2741         = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2742            - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2743
2744       eiters -= prolog_peeling;
2745       eiters
2746         = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2747
2748       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2749         {
2750           delete epilogue_vinfo;
2751           epilogue_vinfo = NULL;
2752           if (loop_vinfo->epilogue_vinfos.length () == 0)
2753             {
2754               vect_epilogues = false;
2755               break;
2756             }
2757           epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2758           loop_vinfo->epilogue_vinfos.ordered_remove (0);
2759         }
2760       vect_epilogues_updated_niters = true;
2761     }
2762   /* Prolog loop may be skipped.  */
2763   bool skip_prolog = (prolog_peeling != 0);
2764   /* Skip this loop to epilog when there are not enough iterations to enter this
2765      vectorized loop.  If true we should perform runtime checks on the NITERS
2766      to check whether we should skip the current vectorized loop.  If we know
2767      the number of scalar iterations we may choose to add a runtime check if
2768      this number "maybe" smaller than the number of iterations required
2769      when we know the number of scalar iterations may potentially
2770      be smaller than the number of iterations required to enter this loop, for
2771      this we use the upper bounds on the prolog and epilog peeling.  When we
2772      don't know the number of iterations and don't require versioning it is
2773      because we have asserted that there are enough scalar iterations to enter
2774      the main loop, so this skip is not necessary.  When we are versioning then
2775      we only add such a skip if we have chosen to vectorize the epilogue.  */
2776   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2777                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2778                                   bound_prolog + bound_epilog)
2779                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2780                          || vect_epilogues));
2781   /* Epilog loop must be executed if the number of iterations for epilog
2782      loop is known at compile time, otherwise we need to add a check at
2783      the end of vector loop and skip to the end of epilog loop.  */
2784   bool skip_epilog = (prolog_peeling < 0
2785                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2786                       || !vf.is_constant ());
2787   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2788   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2789     skip_epilog = false;
2790
2791   if (skip_vector)
2792     {
2793       split_edge (loop_preheader_edge (loop));
2794
2795       /* Due to the order in which we peel prolog and epilog, we first
2796          propagate probability to the whole loop.  The purpose is to
2797          avoid adjusting probabilities of both prolog and vector loops
2798          separately.  Note in this case, the probability of epilog loop
2799          needs to be scaled back later.  */
2800       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2801       if (prob_vector.initialized_p ())
2802         {
2803           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2804           scale_loop_profile (loop, prob_vector, 0);
2805         }
2806     }
2807
2808   dump_user_location_t loop_loc = find_loop_location (loop);
2809   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2810   if (vect_epilogues)
2811     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2812        use the original scalar loop as remaining epilogue if necessary.  */
2813     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2814       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2815
2816   if (prolog_peeling)
2817     {
2818       e = loop_preheader_edge (loop);
2819       if (!slpeel_can_duplicate_loop_p (loop, e))
2820         {
2821           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2822                            "loop can't be duplicated to preheader edge.\n");
2823           gcc_unreachable ();
2824         }
2825       /* Peel prolog and put it on preheader edge of loop.  */
2826       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2827       if (!prolog)
2828         {
2829           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2830                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2831           gcc_unreachable ();
2832         }
2833       prolog->force_vectorize = false;
2834       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2835       first_loop = prolog;
2836       reset_original_copy_tables ();
2837
2838       /* Update the number of iterations for prolog loop.  */
2839       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2840       vect_set_loop_condition (prolog, NULL, niters_prolog,
2841                                step_prolog, NULL_TREE, false);
2842
2843       /* Skip the prolog loop.  */
2844       if (skip_prolog)
2845         {
2846           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2847                                     niters_prolog, build_int_cst (type, 0));
2848           guard_bb = loop_preheader_edge (prolog)->src;
2849           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2850           guard_to = split_edge (loop_preheader_edge (loop));
2851           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2852                                            guard_to, guard_bb,
2853                                            prob_prolog.invert (),
2854                                            irred_flag);
2855           e = EDGE_PRED (guard_to, 0);
2856           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2857           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2858
2859           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2860           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2861         }
2862
2863       /* Update init address of DRs.  */
2864       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2865       /* Update niters for vector loop.  */
2866       LOOP_VINFO_NITERS (loop_vinfo)
2867         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2868       LOOP_VINFO_NITERSM1 (loop_vinfo)
2869         = fold_build2 (MINUS_EXPR, type,
2870                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2871       bool new_var_p = false;
2872       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2873       /* It's guaranteed that vector loop bound before vectorization is at
2874          least VF, so set range information for newly generated var.  */
2875       if (new_var_p)
2876         set_range_info (niters, VR_RANGE,
2877                         wi::to_wide (build_int_cst (type, vf)),
2878                         wi::to_wide (TYPE_MAX_VALUE (type)));
2879
2880       /* Prolog iterates at most bound_prolog times, latch iterates at
2881          most bound_prolog - 1 times.  */
2882       record_niter_bound (prolog, bound_prolog - 1, false, true);
2883       delete_update_ssa ();
2884       adjust_vec_debug_stmts ();
2885       scev_reset ();
2886     }
2887
2888   if (epilog_peeling)
2889     {
2890       e = single_exit (loop);
2891       if (!slpeel_can_duplicate_loop_p (loop, e))
2892         {
2893           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2894                            "loop can't be duplicated to exit edge.\n");
2895           gcc_unreachable ();
2896         }
2897       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2898          said epilog then we should use a copy of the main loop as a starting
2899          point.  This loop may have already had some preliminary transformations
2900          to allow for more optimal vectorization, for example if-conversion.
2901          If we are not vectorizing the epilog then we should use the scalar loop
2902          as the transformations mentioned above make less or no sense when not
2903          vectorizing.  */
2904       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2905       if (vop_to_rename)
2906         {
2907           /* Vectorizing the main loop can sometimes introduce a vdef to
2908              a loop that previously didn't have one; see the comment above
2909              the definition of VOP_TO_RENAME for details.  The definition
2910              D that holds on E will then be different from the definition
2911              VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2912              rename VOP_TO_RENAME to D when copying the loop.
2913
2914              The virtual operand is in LCSSA form for the main loop,
2915              and no stmt between the main loop and E needs a vdef,
2916              so we know that D is provided by a phi rather than by a
2917              vdef on a normal gimple stmt.  */
2918           basic_block vdef_bb = e->src;
2919           gphi *vphi;
2920           while (!(vphi = get_virtual_phi (vdef_bb)))
2921             vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2922           gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2923           set_current_def (vop_to_rename, gimple_phi_result (vphi));
2924         }
2925       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2926       if (!epilog)
2927         {
2928           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2929                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2930           gcc_unreachable ();
2931         }
2932       epilog->force_vectorize = false;
2933       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2934
2935       /* Scalar version loop may be preferred.  In this case, add guard
2936          and skip to epilog.  Note this only happens when the number of
2937          iterations of loop is unknown at compile time, otherwise this
2938          won't be vectorized.  */
2939       if (skip_vector)
2940         {
2941           /* Additional epilogue iteration is peeled if gap exists.  */
2942           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2943                                                 bound_prolog, bound_epilog,
2944                                                 th, &bound_scalar,
2945                                                 check_profitability);
2946           /* Build guard against NITERSM1 since NITERS may overflow.  */
2947           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2948           guard_bb = anchor;
2949           guard_to = split_edge (loop_preheader_edge (epilog));
2950           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2951                                            guard_to, guard_bb,
2952                                            prob_vector.invert (),
2953                                            irred_flag);
2954           skip_e = guard_e;
2955           e = EDGE_PRED (guard_to, 0);
2956           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2957           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2958
2959           /* Simply propagate profile info from guard_bb to guard_to which is
2960              a merge point of control flow.  */
2961           guard_to->count = guard_bb->count;
2962
2963           /* Scale probability of epilog loop back.
2964              FIXME: We should avoid scaling down and back up.  Profile may
2965              get lost if we scale down to 0.  */
2966           basic_block *bbs = get_loop_body (epilog);
2967           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2968             bbs[i]->count = bbs[i]->count.apply_scale
2969                                  (bbs[i]->count,
2970                                   bbs[i]->count.apply_probability
2971                                     (prob_vector));
2972           free (bbs);
2973         }
2974
2975       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2976       /* If loop is peeled for non-zero constant times, now niters refers to
2977          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2978          overflows.  */
2979       niters_no_overflow |= (prolog_peeling > 0);
2980       vect_gen_vector_loop_niters (loop_vinfo, niters,
2981                                    niters_vector, step_vector,
2982                                    niters_no_overflow);
2983       if (!integer_onep (*step_vector))
2984         {
2985           /* On exit from the loop we will have an easy way of calcalating
2986              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2987              until then.  */
2988           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2989           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2990           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2991         }
2992       else
2993         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2994                                              &niters_vector_mult_vf);
2995       /* Update IVs of original loop as if they were advanced by
2996          niters_vector_mult_vf steps.  */
2997       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2998       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2999       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3000                                         update_e);
3001
3002       if (skip_epilog)
3003         {
3004           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3005                                     niters, niters_vector_mult_vf);
3006           guard_bb = single_exit (loop)->dest;
3007           guard_to = split_edge (single_exit (epilog));
3008           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3009                                            skip_vector ? anchor : guard_bb,
3010                                            prob_epilog.invert (),
3011                                            irred_flag);
3012           if (vect_epilogues)
3013             epilogue_vinfo->skip_this_loop_edge = guard_e;
3014           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3015                                               single_exit (epilog));
3016           /* Only need to handle basic block before epilog loop if it's not
3017              the guard_bb, which is the case when skip_vector is true.  */
3018           if (guard_bb != bb_before_epilog)
3019             {
3020               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3021
3022               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3023             }
3024           scale_loop_profile (epilog, prob_epilog, 0);
3025         }
3026       else
3027         slpeel_update_phi_nodes_for_lcssa (epilog);
3028
3029       unsigned HOST_WIDE_INT bound;
3030       if (bound_scalar.is_constant (&bound))
3031         {
3032           gcc_assert (bound != 0);
3033           /* -1 to convert loop iterations to latch iterations.  */
3034           record_niter_bound (epilog, bound - 1, false, true);
3035         }
3036
3037       delete_update_ssa ();
3038       adjust_vec_debug_stmts ();
3039       scev_reset ();
3040     }
3041
3042   if (vect_epilogues)
3043     {
3044       epilog->aux = epilogue_vinfo;
3045       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3046
3047       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3048
3049       /* We now must calculate the number of NITERS performed by the previous
3050          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3051       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3052                                  niters_prolog, niters_vector_mult_vf);
3053
3054       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3055          determine whether we are coming from the previous vectorized loop
3056          using the update_e edge or the skip_vector basic block using the
3057          skip_e edge.  */
3058       if (skip_vector)
3059         {
3060           gcc_assert (update_e != NULL
3061                       && skip_e != NULL
3062                       && !vect_epilogues_updated_niters);
3063           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3064                                            update_e->dest);
3065           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3066           gimple *stmt = gimple_build_assign (new_ssa, niters);
3067           gimple_stmt_iterator gsi;
3068           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3069               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3070             {
3071               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3072               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3073             }
3074           else
3075             {
3076               gsi = gsi_last_bb (update_e->src);
3077               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3078             }
3079
3080           niters = new_ssa;
3081           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3082           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3083                        UNKNOWN_LOCATION);
3084           niters = PHI_RESULT (new_phi);
3085           epilogue_vinfo->main_loop_edge = update_e;
3086           epilogue_vinfo->skip_main_loop_edge = skip_e;
3087         }
3088
3089       /* Set ADVANCE to the number of iterations performed by the previous
3090          loop and its prologue.  */
3091       *advance = niters;
3092
3093       if (!vect_epilogues_updated_niters)
3094         {
3095           /* Subtract the number of iterations performed by the vectorized loop
3096              from the number of total iterations.  */
3097           tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3098                                               before_loop_niters,
3099                                               niters);
3100
3101           LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3102           LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3103             = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3104                            epilogue_niters,
3105                            build_one_cst (TREE_TYPE (epilogue_niters)));
3106
3107           /* Decide what to do if the number of epilogue iterations is not
3108              a multiple of the epilogue loop's vectorization factor.
3109              We should have rejected the loop during the analysis phase
3110              if this fails.  */
3111           if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3112                                                            true))
3113             gcc_unreachable ();
3114         }
3115     }
3116
3117   adjust_vec.release ();
3118   free_original_copy_tables ();
3119
3120   return vect_epilogues ? epilog : NULL;
3121 }
3122
3123 /* Function vect_create_cond_for_niters_checks.
3124
3125    Create a conditional expression that represents the run-time checks for
3126    loop's niter.  The loop is guaranteed to terminate if the run-time
3127    checks hold.
3128
3129    Input:
3130    COND_EXPR  - input conditional expression.  New conditions will be chained
3131                 with logical AND operation.  If it is NULL, then the function
3132                 is used to return the number of alias checks.
3133    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3134                 to be checked.
3135
3136    Output:
3137    COND_EXPR - conditional expression.
3138
3139    The returned COND_EXPR is the conditional expression to be used in the
3140    if statement that controls which version of the loop gets executed at
3141    runtime.  */
3142
3143 static void
3144 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3145 {
3146   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3147
3148   if (*cond_expr)
3149     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3150                               *cond_expr, part_cond_expr);
3151   else
3152     *cond_expr = part_cond_expr;
3153 }
3154
3155 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3156    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3157
3158 static void
3159 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3160 {
3161   if (*cond_expr)
3162     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3163                               *cond_expr, part_cond_expr);
3164   else
3165     *cond_expr = part_cond_expr;
3166 }
3167
3168 /* Function vect_create_cond_for_align_checks.
3169
3170    Create a conditional expression that represents the alignment checks for
3171    all of data references (array element references) whose alignment must be
3172    checked at runtime.
3173
3174    Input:
3175    COND_EXPR  - input conditional expression.  New conditions will be chained
3176                 with logical AND operation.
3177    LOOP_VINFO - two fields of the loop information are used.
3178                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3179                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3180
3181    Output:
3182    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3183                          expression.
3184    The returned value is the conditional expression to be used in the if
3185    statement that controls which version of the loop gets executed at runtime.
3186
3187    The algorithm makes two assumptions:
3188      1) The number of bytes "n" in a vector is a power of 2.
3189      2) An address "a" is aligned if a%n is zero and that this
3190         test can be done as a&(n-1) == 0.  For example, for 16
3191         byte vectors the test is a&0xf == 0.  */
3192
3193 static void
3194 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3195                                    tree *cond_expr,
3196                                    gimple_seq *cond_expr_stmt_list)
3197 {
3198   const vec<stmt_vec_info> &may_misalign_stmts
3199     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3200   stmt_vec_info stmt_info;
3201   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3202   tree mask_cst;
3203   unsigned int i;
3204   tree int_ptrsize_type;
3205   char tmp_name[20];
3206   tree or_tmp_name = NULL_TREE;
3207   tree and_tmp_name;
3208   gimple *and_stmt;
3209   tree ptrsize_zero;
3210   tree part_cond_expr;
3211
3212   /* Check that mask is one less than a power of 2, i.e., mask is
3213      all zeros followed by all ones.  */
3214   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3215
3216   int_ptrsize_type = signed_type_for (ptr_type_node);
3217
3218   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3219      of the first vector of the i'th data reference. */
3220
3221   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3222     {
3223       gimple_seq new_stmt_list = NULL;
3224       tree addr_base;
3225       tree addr_tmp_name;
3226       tree new_or_tmp_name;
3227       gimple *addr_stmt, *or_stmt;
3228       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3229       bool negative = tree_int_cst_compare
3230         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3231       tree offset = negative
3232         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3233                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3234         : size_zero_node;
3235
3236       /* create: addr_tmp = (int)(address_of_first_vector) */
3237       addr_base =
3238         vect_create_addr_base_for_vector_ref (loop_vinfo,
3239                                               stmt_info, &new_stmt_list,
3240                                               offset);
3241       if (new_stmt_list != NULL)
3242         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3243
3244       sprintf (tmp_name, "addr2int%d", i);
3245       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3246       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3247       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3248
3249       /* The addresses are OR together.  */
3250
3251       if (or_tmp_name != NULL_TREE)
3252         {
3253           /* create: or_tmp = or_tmp | addr_tmp */
3254           sprintf (tmp_name, "orptrs%d", i);
3255           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3256           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3257                                          or_tmp_name, addr_tmp_name);
3258           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3259           or_tmp_name = new_or_tmp_name;
3260         }
3261       else
3262         or_tmp_name = addr_tmp_name;
3263
3264     } /* end for i */
3265
3266   mask_cst = build_int_cst (int_ptrsize_type, mask);
3267
3268   /* create: and_tmp = or_tmp & mask  */
3269   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3270
3271   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3272                                   or_tmp_name, mask_cst);
3273   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3274
3275   /* Make and_tmp the left operand of the conditional test against zero.
3276      if and_tmp has a nonzero bit then some address is unaligned.  */
3277   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3278   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3279                                 and_tmp_name, ptrsize_zero);
3280   chain_cond_expr (cond_expr, part_cond_expr);
3281 }
3282
3283 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3284    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3285    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3286    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3287
3288 static void
3289 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3290 {
3291   const vec<vec_object_pair> &pairs
3292     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3293   unsigned int i;
3294   vec_object_pair *pair;
3295   FOR_EACH_VEC_ELT (pairs, i, pair)
3296     {
3297       tree addr1 = build_fold_addr_expr (pair->first);
3298       tree addr2 = build_fold_addr_expr (pair->second);
3299       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3300                                          addr1, addr2);
3301       chain_cond_expr (cond_expr, part_cond_expr);
3302     }
3303 }
3304
3305 /* Create an expression that is true when all lower-bound conditions for
3306    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3307
3308 static void
3309 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3310 {
3311   const vec<vec_lower_bound> &lower_bounds
3312     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3313   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3314     {
3315       tree expr = lower_bounds[i].expr;
3316       tree type = unsigned_type_for (TREE_TYPE (expr));
3317       expr = fold_convert (type, expr);
3318       poly_uint64 bound = lower_bounds[i].min_value;
3319       if (!lower_bounds[i].unsigned_p)
3320         {
3321           expr = fold_build2 (PLUS_EXPR, type, expr,
3322                               build_int_cstu (type, bound - 1));
3323           bound += bound - 1;
3324         }
3325       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3326                                          build_int_cstu (type, bound));
3327       chain_cond_expr (cond_expr, part_cond_expr);
3328     }
3329 }
3330
3331 /* Function vect_create_cond_for_alias_checks.
3332
3333    Create a conditional expression that represents the run-time checks for
3334    overlapping of address ranges represented by a list of data references
3335    relations passed as input.
3336
3337    Input:
3338    COND_EXPR  - input conditional expression.  New conditions will be chained
3339                 with logical AND operation.  If it is NULL, then the function
3340                 is used to return the number of alias checks.
3341    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3342                 to be checked.
3343
3344    Output:
3345    COND_EXPR - conditional expression.
3346
3347    The returned COND_EXPR is the conditional expression to be used in the if
3348    statement that controls which version of the loop gets executed at runtime.
3349 */
3350
3351 void
3352 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3353 {
3354   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3355     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3356
3357   if (comp_alias_ddrs.is_empty ())
3358     return;
3359
3360   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3361                                &comp_alias_ddrs, cond_expr);
3362   if (dump_enabled_p ())
3363     dump_printf_loc (MSG_NOTE, vect_location,
3364                      "created %u versioning for alias checks.\n",
3365                      comp_alias_ddrs.length ());
3366 }
3367
3368
3369 /* Function vect_loop_versioning.
3370
3371    If the loop has data references that may or may not be aligned or/and
3372    has data reference relations whose independence was not proven then
3373    two versions of the loop need to be generated, one which is vectorized
3374    and one which isn't.  A test is then generated to control which of the
3375    loops is executed.  The test checks for the alignment of all of the
3376    data references that may or may not be aligned.  An additional
3377    sequence of runtime tests is generated for each pairs of DDRs whose
3378    independence was not proven.  The vectorized version of loop is
3379    executed only if both alias and alignment tests are passed.
3380
3381    The test generated to check which version of loop is executed
3382    is modified to also check for profitability as indicated by the
3383    cost model threshold TH.
3384
3385    The versioning precondition(s) are placed in *COND_EXPR and
3386    *COND_EXPR_STMT_LIST.  */
3387
3388 class loop *
3389 vect_loop_versioning (loop_vec_info loop_vinfo,
3390                       gimple *loop_vectorized_call)
3391 {
3392   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3393   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3394   basic_block condition_bb;
3395   gphi_iterator gsi;
3396   gimple_stmt_iterator cond_exp_gsi;
3397   basic_block merge_bb;
3398   basic_block new_exit_bb;
3399   edge new_exit_e, e;
3400   gphi *orig_phi, *new_phi;
3401   tree cond_expr = NULL_TREE;
3402   gimple_seq cond_expr_stmt_list = NULL;
3403   tree arg;
3404   profile_probability prob = profile_probability::likely ();
3405   gimple_seq gimplify_stmt_list = NULL;
3406   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3407   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3408   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3409   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3410   poly_uint64 versioning_threshold
3411     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3412   tree version_simd_if_cond
3413     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3414   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3415
3416   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3417       && !ordered_p (th, versioning_threshold))
3418     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3419                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3420                                             th - 1));
3421   if (maybe_ne (versioning_threshold, 0U))
3422     {
3423       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3424                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3425                                               versioning_threshold - 1));
3426       if (cond_expr)
3427         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3428                                  expr, cond_expr);
3429       else
3430         cond_expr = expr;
3431     }
3432
3433   if (version_niter)
3434     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3435
3436   if (cond_expr)
3437     cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3438                                         &cond_expr_stmt_list,
3439                                         is_gimple_condexpr, NULL_TREE);
3440
3441   if (version_align)
3442     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3443                                        &cond_expr_stmt_list);
3444
3445   if (version_alias)
3446     {
3447       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3448       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3449       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3450     }
3451
3452   if (version_simd_if_cond)
3453     {
3454       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3455       if (flag_checking)
3456         if (basic_block bb
3457             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3458           gcc_assert (bb != loop->header
3459                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3460                       && (scalar_loop == NULL
3461                           || (bb != scalar_loop->header
3462                               && dominated_by_p (CDI_DOMINATORS,
3463                                                  scalar_loop->header, bb))));
3464       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3465       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3466                             version_simd_if_cond, zero);
3467       if (cond_expr)
3468         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3469                                  c, cond_expr);
3470       else
3471         cond_expr = c;
3472       if (dump_enabled_p ())
3473         dump_printf_loc (MSG_NOTE, vect_location,
3474                          "created versioning for simd if condition check.\n");
3475     }
3476
3477   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3478                                       &gimplify_stmt_list,
3479                                       is_gimple_condexpr, NULL_TREE);
3480   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3481
3482   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3483      invariant in.  */
3484   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3485   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3486        !gsi_end_p (gsi); gsi_next (&gsi))
3487     {
3488       gimple *stmt = gsi_stmt (gsi);
3489       update_stmt (stmt);
3490       ssa_op_iter iter;
3491       use_operand_p use_p;
3492       basic_block def_bb;
3493       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3494         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3495             && flow_bb_inside_loop_p (outermost, def_bb))
3496           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3497     }
3498
3499   /* Search for the outermost loop we can version.  Avoid versioning of
3500      non-perfect nests but allow if-conversion versioned loops inside.  */
3501   class loop *loop_to_version = loop;
3502   if (flow_loop_nested_p (outermost, loop))
3503     {
3504       if (dump_enabled_p ())
3505         dump_printf_loc (MSG_NOTE, vect_location,
3506                          "trying to apply versioning to outer loop %d\n",
3507                          outermost->num);
3508       if (outermost->num == 0)
3509         outermost = superloop_at_depth (loop, 1);
3510       /* And avoid applying versioning on non-perfect nests.  */
3511       while (loop_to_version != outermost
3512              && single_exit (loop_outer (loop_to_version))
3513              && (!loop_outer (loop_to_version)->inner->next
3514                  || vect_loop_vectorized_call (loop_to_version))
3515              && (!loop_outer (loop_to_version)->inner->next
3516                  || !loop_outer (loop_to_version)->inner->next->next))
3517         loop_to_version = loop_outer (loop_to_version);
3518     }
3519
3520   /* Apply versioning.  If there is already a scalar version created by
3521      if-conversion re-use that.  Note we cannot re-use the copy of
3522      an if-converted outer-loop when vectorizing the inner loop only.  */
3523   gcond *cond;
3524   if ((!loop_to_version->inner || loop == loop_to_version)
3525       && loop_vectorized_call)
3526     {
3527       gcc_assert (scalar_loop);
3528       condition_bb = gimple_bb (loop_vectorized_call);
3529       cond = as_a <gcond *> (last_stmt (condition_bb));
3530       gimple_cond_set_condition_from_tree (cond, cond_expr);
3531       update_stmt (cond);
3532
3533       if (cond_expr_stmt_list)
3534         {
3535           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3536           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3537                                  GSI_SAME_STMT);
3538         }
3539
3540       /* if-conversion uses profile_probability::always () for both paths,
3541          reset the paths probabilities appropriately.  */
3542       edge te, fe;
3543       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3544       te->probability = prob;
3545       fe->probability = prob.invert ();
3546       /* We can scale loops counts immediately but have to postpone
3547          scaling the scalar loop because we re-use it during peeling.  */
3548       scale_loop_frequencies (loop_to_version, te->probability);
3549       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3550
3551       nloop = scalar_loop;
3552       if (dump_enabled_p ())
3553         dump_printf_loc (MSG_NOTE, vect_location,
3554                          "reusing %sloop version created by if conversion\n",
3555                          loop_to_version != loop ? "outer " : "");
3556     }
3557   else
3558     {
3559       if (loop_to_version != loop
3560           && dump_enabled_p ())
3561         dump_printf_loc (MSG_NOTE, vect_location,
3562                          "applying loop versioning to outer loop %d\n",
3563                          loop_to_version->num);
3564
3565       initialize_original_copy_tables ();
3566       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3567                             prob, prob.invert (), prob, prob.invert (), true);
3568       gcc_assert (nloop);
3569       nloop = get_loop_copy (loop);
3570
3571       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3572          reap those otherwise;  they also refer to the original
3573          loops.  */
3574       class loop *l = loop;
3575       while (gimple *call = vect_loop_vectorized_call (l))
3576         {
3577           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3578           fold_loop_internal_call (call, boolean_false_node);
3579           l = loop_outer (l);
3580         }
3581       free_original_copy_tables ();
3582
3583       if (cond_expr_stmt_list)
3584         {
3585           cond_exp_gsi = gsi_last_bb (condition_bb);
3586           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3587                                  GSI_SAME_STMT);
3588         }
3589
3590       /* Loop versioning violates an assumption we try to maintain during
3591          vectorization - that the loop exit block has a single predecessor.
3592          After versioning, the exit block of both loop versions is the same
3593          basic block (i.e. it has two predecessors). Just in order to simplify
3594          following transformations in the vectorizer, we fix this situation
3595          here by adding a new (empty) block on the exit-edge of the loop,
3596          with the proper loop-exit phis to maintain loop-closed-form.
3597          If loop versioning wasn't done from loop, but scalar_loop instead,
3598          merge_bb will have already just a single successor.  */
3599
3600       merge_bb = single_exit (loop_to_version)->dest;
3601       if (EDGE_COUNT (merge_bb->preds) >= 2)
3602         {
3603           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3604           new_exit_bb = split_edge (single_exit (loop_to_version));
3605           new_exit_e = single_exit (loop_to_version);
3606           e = EDGE_SUCC (new_exit_bb, 0);
3607
3608           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3609                gsi_next (&gsi))
3610             {
3611               tree new_res;
3612               orig_phi = gsi.phi ();
3613               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3614               new_phi = create_phi_node (new_res, new_exit_bb);
3615               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3616               add_phi_arg (new_phi, arg, new_exit_e,
3617                            gimple_phi_arg_location_from_edge (orig_phi, e));
3618               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3619             }
3620         }
3621
3622       update_ssa (TODO_update_ssa);
3623     }
3624
3625   if (version_niter)
3626     {
3627       /* The versioned loop could be infinite, we need to clear existing
3628          niter information which is copied from the original loop.  */
3629       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3630       vect_free_loop_info_assumptions (nloop);
3631     }
3632
3633   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3634       && dump_enabled_p ())
3635     {
3636       if (version_alias)
3637         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3638                          vect_location,
3639                          "loop versioned for vectorization because of "
3640                          "possible aliasing\n");
3641       if (version_align)
3642         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3643                          vect_location,
3644                          "loop versioned for vectorization to enhance "
3645                          "alignment\n");
3646
3647     }
3648
3649   return nloop;
3650 }