gcc/tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
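/* For illustration only (vector sizes assumed, not taken from the caller):
   with 8-element vectors, OFFSET 0 gives BASE = 0 and the series
   { 0, 8, 1, 9, 2, 10, ... }, which interleaves the low halves of the two
   input vectors, while OFFSET 1 gives BASE = 4 and { 4, 12, 5, 13, 6, 14, ... },
   interleaving the high halves.  Either way the permutation is only reported
   as supported if the target accepts it as a constant VEC_PERM_EXPR.  */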
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
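/* A rough sketch of the two strategies above, with mask widths chosen purely
   for illustration.  Unpacking splits each source mask S into two destination
   masks:

       D0 = VEC_UNPACK_LO_EXPR <S>
       D1 = VEC_UNPACK_HI_EXPR <S>

   so that each destination mask governs one half of the scalars governed by S
   (any excess bits being VIEW_CONVERTed away).  The interleave fallback
   instead duplicates each bit of S in place:

       D = VEC_PERM_EXPR <S, S, { 0, N, 1, N+1, ... }>

   which applies when the destination mask type equals the source type but
   twice as many mask bits are needed per scalar iteration.  */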
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGC->max_nscalars_per_iter * RGC->factor
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGC->max_nscalars_per_iter * RGC->factor
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks or zero
415 lengths for RGC.
417 Note: the cost of the code generated by this function is modeled
418 by vect_estimate_min_profitable_iters, so changes here may need
419 corresponding changes there. */
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 gimple_seq *preheader_seq,
424 gimple_stmt_iterator loop_cond_gsi,
425 rgroup_controls *rgc, tree niters,
426 tree niters_skip, bool might_wrap_p)
428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
429 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
430 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
432 tree ctrl_type = rgc->type;
433 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
434 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
435 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
436 tree length_limit = NULL_TREE;
437 /* For length, we need length_limit to ensure length in range. */
438 if (!use_masks_p)
439 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
441 /* Calculate the maximum number of item values that the rgroup
442 handles in total, the number that it handles for each iteration
443 of the vector loop, and the number that it should skip during the
444 first iteration of the vector loop. */
445 tree nitems_total = niters;
446 tree nitems_step = build_int_cst (iv_type, vf);
447 tree nitems_skip = niters_skip;
448 if (nitems_per_iter != 1)
450 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
451 these multiplications don't overflow. */
452 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
453 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
454 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
455 nitems_total, compare_factor);
456 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
457 nitems_step, iv_factor);
458 if (nitems_skip)
459 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
460 nitems_skip, compare_factor);
463 /* Create an induction variable that counts the number of items
464 processed. */
465 tree index_before_incr, index_after_incr;
466 gimple_stmt_iterator incr_gsi;
467 bool insert_after;
468 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
469 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
470 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
472 tree zero_index = build_int_cst (compare_type, 0);
473 tree test_index, test_limit, first_limit;
474 gimple_stmt_iterator *test_gsi;
475 if (might_wrap_p)
477 /* In principle the loop should stop iterating once the incremented
478 IV reaches a value greater than or equal to:
480 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
482 However, there's no guarantee that this addition doesn't overflow
483 the comparison type, or that the IV hits a value above it before
484 wrapping around. We therefore adjust the limit down by one
485 IV step:
487 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
488 -[infinite-prec] NITEMS_STEP
490 and compare the IV against this limit _before_ incrementing it.
491 Since the comparison type is unsigned, we actually want the
492 subtraction to saturate at zero:
494 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
495 -[sat] NITEMS_STEP
497 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
499 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
501 where the rightmost subtraction can be done directly in
502 COMPARE_TYPE. */
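/* Worked example, with numbers chosen only for illustration: if NITEMS_TOTAL
   is 10, NITEMS_STEP is 16 and there is no skip amount, ADJUST is 16 and the
   limit becomes MAX (10, 16) - 16 = 0.  The MAX makes the subtraction
   saturate at zero in the unsigned COMPARE_TYPE, and comparing the
   pre-increment IV (0) against 0 yields an all-false mask for the second
   iteration, so the loop body runs exactly once, as required for 10 items.  */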
503 test_index = index_before_incr;
504 tree adjust = gimple_convert (preheader_seq, compare_type,
505 nitems_step);
506 if (nitems_skip)
507 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
508 adjust, nitems_skip);
509 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
510 nitems_total, adjust);
511 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
512 test_limit, adjust);
513 test_gsi = &incr_gsi;
515 /* Get a safe limit for the first iteration. */
516 if (nitems_skip)
518 /* The first vector iteration can handle at most NITEMS_STEP
519 items. NITEMS_STEP <= CONST_LIMIT, and adding
520 NITEMS_SKIP to that cannot overflow. */
521 tree const_limit = build_int_cst (compare_type,
522 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
523 * nitems_per_iter);
524 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
525 nitems_total, const_limit);
526 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
527 first_limit, nitems_skip);
529 else
530 /* For the first iteration it doesn't matter whether the IV hits
531 a value above NITEMS_TOTAL. That only matters for the latch
532 condition. */
533 first_limit = nitems_total;
535 else
537 /* Test the incremented IV, which will always hit a value above
538 the bound before wrapping. */
539 test_index = index_after_incr;
540 test_limit = nitems_total;
541 if (nitems_skip)
542 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
543 test_limit, nitems_skip);
544 test_gsi = &loop_cond_gsi;
546 first_limit = test_limit;
549 /* Convert the IV value to the comparison type (either a no-op or
550 a demotion). */
551 gimple_seq test_seq = NULL;
552 test_index = gimple_convert (&test_seq, compare_type, test_index);
553 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
555 /* Provide a definition of each control in the group. */
556 tree next_ctrl = NULL_TREE;
557 tree ctrl;
558 unsigned int i;
559 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
561 /* Previous controls will cover BIAS items. This control covers the
562 next batch. */
563 poly_uint64 bias = nitems_per_ctrl * i;
564 tree bias_tree = build_int_cst (compare_type, bias);
566 /* See whether the first iteration of the vector loop is known
567 to have a full control. */
568 poly_uint64 const_limit;
569 bool first_iteration_full
570 = (poly_int_tree_p (first_limit, &const_limit)
571 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
573 /* Rather than have a new IV that starts at BIAS and goes up to
574 TEST_LIMIT, prefer to use the same 0-based IV for each control
575 and adjust the bound down by BIAS. */
576 tree this_test_limit = test_limit;
577 if (i != 0)
579 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
580 compare_type, this_test_limit,
581 bias_tree);
582 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
583 compare_type, this_test_limit,
584 bias_tree);
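/* For instance (sizes assumed for illustration), with NITEMS_PER_CTRL equal
   to 8 the control for I = 1 has BIAS = 8 and its limit becomes
   MAX (TEST_LIMIT, 8) - 8; the MAX keeps the unsigned subtraction from
   wrapping when fewer than 8 items remain, in which case this control simply
   ends up covering nothing.  */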
587 /* Create the initial control. First include all items that
588 are within the loop limit. */
589 tree init_ctrl = NULL_TREE;
590 if (!first_iteration_full)
592 tree start, end;
593 if (first_limit == test_limit)
595 /* Use a natural test between zero (the initial IV value)
596 and the loop limit. The "else" block would be valid too,
597 but this choice can avoid the need to load BIAS_TREE into
598 a register. */
599 start = zero_index;
600 end = this_test_limit;
602 else
604 /* FIRST_LIMIT is the maximum number of items handled by the
605 first iteration of the vector loop. Test the portion
606 associated with this control. */
607 start = bias_tree;
608 end = first_limit;
611 if (use_masks_p)
613 init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
614 gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
615 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
617 else
619 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
620 gimple_seq seq = vect_gen_len (init_ctrl, start,
621 end, length_limit);
622 gimple_seq_add_seq (preheader_seq, seq);
626 /* Now AND out the bits that are within the number of skipped
627 items. */
628 poly_uint64 const_skip;
629 if (nitems_skip
630 && !(poly_int_tree_p (nitems_skip, &const_skip)
631 && known_le (const_skip, bias)))
633 gcc_assert (use_masks_p);
634 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
635 bias_tree, nitems_skip);
636 if (init_ctrl)
637 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
638 init_ctrl, unskipped_mask);
639 else
640 init_ctrl = unskipped_mask;
643 if (!init_ctrl)
645 /* First iteration is full. */
646 if (use_masks_p)
647 init_ctrl = build_minus_one_cst (ctrl_type);
648 else
649 init_ctrl = length_limit;
652 /* Get the control value for the next iteration of the loop. */
653 if (use_masks_p)
655 next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
656 gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
657 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
659 else
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
669 return next_ctrl;
672 /* Set up the iteration condition and rgroup controls for LOOP, given
673 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
674 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
675 the number of iterations of the original scalar loop that should be
676 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
677 for vect_set_loop_condition.
679 Insert the branch-back condition before LOOP_COND_GSI and return the
680 final gcond. */
682 static gcond *
683 vect_set_loop_condition_partial_vectors (class loop *loop,
684 loop_vec_info loop_vinfo, tree niters,
685 tree final_iv, bool niters_maybe_zero,
686 gimple_stmt_iterator loop_cond_gsi)
688 gimple_seq preheader_seq = NULL;
689 gimple_seq header_seq = NULL;
691 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
692 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
693 unsigned int compare_precision = TYPE_PRECISION (compare_type);
694 tree orig_niters = niters;
696 /* Type of the initial value of NITERS. */
697 tree ni_actual_type = TREE_TYPE (niters);
698 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
699 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
701 /* Convert NITERS to the same size as the compare. */
702 if (compare_precision > ni_actual_precision
703 && niters_maybe_zero)
705 /* We know that there is always at least one iteration, so if the
706 count is zero then it must have wrapped. Cope with this by
707 subtracting 1 before the conversion and adding 1 to the result. */
708 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
709 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
710 niters, build_minus_one_cst (ni_actual_type));
711 niters = gimple_convert (&preheader_seq, compare_type, niters);
712 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
713 niters, build_one_cst (compare_type));
715 else
716 niters = gimple_convert (&preheader_seq, compare_type, niters);
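/* Worked example, with an 8-bit NITERS type assumed purely for illustration:
   a value of 0 then really means 256 iterations.  Widening 0 directly would
   give 0, but (0 - 1) wraps to 255 in the narrow type, widens to 255, and
   adding 1 back yields the intended 256 in COMPARE_TYPE.  */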
718 /* Iterate over all the rgroups and fill in their controls. We could use
719 the first control from any rgroup for the loop condition; here we
720 arbitrarily pick the last. */
721 tree test_ctrl = NULL_TREE;
722 rgroup_controls *rgc;
723 unsigned int i;
724 auto_vec<rgroup_controls> *controls = use_masks_p
725 ? &LOOP_VINFO_MASKS (loop_vinfo)
726 : &LOOP_VINFO_LENS (loop_vinfo);
727 FOR_EACH_VEC_ELT (*controls, i, rgc)
728 if (!rgc->controls.is_empty ())
730 /* First try using permutes. This adds a single vector
731 instruction to the loop for each mask, but needs no extra
732 loop invariants or IVs. */
733 unsigned int nmasks = i + 1;
734 if (use_masks_p && (nmasks & 1) == 0)
736 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
737 if (!half_rgc->controls.is_empty ()
738 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
739 continue;
742 /* See whether zero-based IV would ever generate all-false masks
743 or zero length before wrapping around. */
744 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
746 /* Set up all controls for this group. */
747 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
748 &preheader_seq,
749 loop_cond_gsi, rgc,
750 niters, niters_skip,
751 might_wrap_p);
754 /* Emit all accumulated statements. */
755 add_preheader_seq (loop, preheader_seq);
756 add_header_seq (loop, header_seq);
758 /* Get a boolean result that tells us whether to iterate. */
759 edge exit_edge = single_exit (loop);
760 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
761 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
762 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
763 NULL_TREE, NULL_TREE);
764 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
766 /* The loop iterates (NITERS - 1) / VF + 1 times.
767 Subtract one from this to get the latch count. */
768 tree step = build_int_cst (compare_type,
769 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
770 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
771 build_minus_one_cst (compare_type));
772 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
773 niters_minus_one, step);
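/* For example (numbers chosen for illustration only), NITERS = 10 with
   VF = 4 gives (10 - 1) / 4 + 1 = 3 vector iterations, handling 4, 4 and 2
   scalar iterations respectively, so the latch count recorded here is 2.  */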
775 if (final_iv)
777 gassign *assign = gimple_build_assign (final_iv, orig_niters);
778 gsi_insert_on_edge_immediate (single_exit (loop), assign);
781 return cond_stmt;
784 /* Like vect_set_loop_condition, but handle the case in which the vector
785 loop handles exactly VF scalars per iteration. */
787 static gcond *
788 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
789 tree final_iv, bool niters_maybe_zero,
790 gimple_stmt_iterator loop_cond_gsi)
792 tree indx_before_incr, indx_after_incr;
793 gcond *cond_stmt;
794 gcond *orig_cond;
795 edge pe = loop_preheader_edge (loop);
796 edge exit_edge = single_exit (loop);
797 gimple_stmt_iterator incr_gsi;
798 bool insert_after;
799 enum tree_code code;
800 tree niters_type = TREE_TYPE (niters);
802 orig_cond = get_loop_exit_condition (loop);
803 gcc_assert (orig_cond);
804 loop_cond_gsi = gsi_for_stmt (orig_cond);
806 tree init, limit;
807 if (!niters_maybe_zero && integer_onep (step))
809 /* In this case we can use a simple 0-based IV:
812 x = 0;
816 x += 1;
818 while (x < NITERS); */
819 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
820 init = build_zero_cst (niters_type);
821 limit = niters;
823 else
825 /* The following works for all values of NITERS except 0:
828 x = 0;
832 x += STEP;
834 while (x <= NITERS - STEP);
836 so that the loop continues to iterate if x + STEP - 1 < NITERS
837 but stops if x + STEP - 1 >= NITERS.
839 However, if NITERS is zero, x never hits a value above NITERS - STEP
840 before wrapping around. There are two obvious ways of dealing with
841 this:
843 - start at STEP - 1 and compare x before incrementing it
844 - start at -1 and compare x after incrementing it
846 The latter is simpler and is what we use. The loop in this case
847 looks like:
850 x = -1;
854 x += STEP;
856 while (x < NITERS - STEP);
858 In both cases the loop limit is NITERS - STEP. */
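/* Concrete illustration, with values chosen for exposition: for NITERS = 12
   and STEP = 4 the limit is 8.  In case B, x steps through 4, 8, 12 and the
   loop exits once x > 8, i.e. after 3 iterations.  In case C, x starts at -1,
   so the post-increment values are 3, 7, 11 and the loop exits once x < 8 no
   longer holds, again after 3 iterations; when NITERS is 0 the limit wraps to
   the type maximum minus 3, giving the (1 << M) / STEP iterations described
   above.  */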
859 gimple_seq seq = NULL;
860 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
861 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
862 if (seq)
864 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
865 gcc_assert (!new_bb);
867 if (niters_maybe_zero)
869 /* Case C. */
870 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
871 init = build_all_ones_cst (niters_type);
873 else
875 /* Case B. */
876 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
877 init = build_zero_cst (niters_type);
881 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
882 create_iv (init, step, NULL_TREE, loop,
883 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
884 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
885 true, NULL_TREE, true,
886 GSI_SAME_STMT);
887 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
888 true, GSI_SAME_STMT);
890 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
891 NULL_TREE);
893 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
895 /* Record the number of latch iterations. */
896 if (limit == niters)
897 /* Case A: the loop iterates NITERS times. Subtract one to get the
898 latch count. */
899 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
900 build_int_cst (niters_type, 1));
901 else
902 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
903 Subtract one from this to get the latch count. */
904 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
905 limit, step);
907 if (final_iv)
909 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
910 indx_after_incr, init);
911 gsi_insert_on_edge_immediate (single_exit (loop), assign);
914 return cond_stmt;
917 /* If we're using fully-masked loops, make LOOP iterate:
919 N == (NITERS - 1) / STEP + 1
921 times. When NITERS is zero, this is equivalent to making the loop
922 execute (1 << M) / STEP times, where M is the precision of NITERS.
923 NITERS_MAYBE_ZERO is true if this last case might occur.
925 If we're not using fully-masked loops, make LOOP iterate:
927 N == (NITERS - STEP) / STEP + 1
929 times, where NITERS is known to be outside the range [1, STEP - 1].
930 This is equivalent to making the loop execute NITERS / STEP times
931 when NITERS is nonzero and (1 << M) / STEP times otherwise.
932 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
934 If FINAL_IV is nonnull, it is an SSA name that should be set to
935 N * STEP on exit from the loop.
937 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
939 void
940 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
941 tree niters, tree step, tree final_iv,
942 bool niters_maybe_zero)
944 gcond *cond_stmt;
945 gcond *orig_cond = get_loop_exit_condition (loop);
946 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
948 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
949 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
950 niters, final_iv,
951 niters_maybe_zero,
952 loop_cond_gsi);
953 else
954 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
955 niters_maybe_zero,
956 loop_cond_gsi);
958 /* Remove old loop exit test. */
959 stmt_vec_info orig_cond_info;
960 if (loop_vinfo
961 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
962 loop_vinfo->remove_stmt (orig_cond_info);
963 else
964 gsi_remove (&loop_cond_gsi, true);
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
968 cond_stmt);
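/* Usage note (informal): callers pass STEP equal to the vectorization factor
   for a normal vector loop, so with NITERS = 12 and STEP = 4 (values assumed
   for illustration) the loop set up here executes 3 times, and if FINAL_IV is
   given it ends up holding 12, i.e. N * STEP.  */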
971 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 972 For all PHI arguments in FROM->dest and TO->dest from those
 973 edges, ensure that the current_def of each TO->dest PHI argument
 974 is set to that of the corresponding FROM->dest argument.
976 static void
977 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
979 gimple_stmt_iterator gsi_from, gsi_to;
981 for (gsi_from = gsi_start_phis (from->dest),
982 gsi_to = gsi_start_phis (to->dest);
983 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
985 gimple *from_phi = gsi_stmt (gsi_from);
986 gimple *to_phi = gsi_stmt (gsi_to);
987 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
988 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
989 if (virtual_operand_p (from_arg))
991 gsi_next (&gsi_from);
992 continue;
994 if (virtual_operand_p (to_arg))
996 gsi_next (&gsi_to);
997 continue;
999 if (TREE_CODE (from_arg) != SSA_NAME)
1000 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1001 else if (TREE_CODE (to_arg) == SSA_NAME
1002 && from_arg != to_arg)
1004 if (get_current_def (to_arg) == NULL_TREE)
1006 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1007 TREE_TYPE (get_current_def
1008 (from_arg))));
1009 set_current_def (to_arg, get_current_def (from_arg));
1012 gsi_next (&gsi_from);
1013 gsi_next (&gsi_to);
1016 gphi *from_phi = get_virtual_phi (from->dest);
1017 gphi *to_phi = get_virtual_phi (to->dest);
1018 if (from_phi)
1019 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1020 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1024 /* Given LOOP this function generates a new copy of it and puts it
1025 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1026 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1027 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1028 entry or exit of LOOP. */
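/* A simplified sketch of the result, ignoring the temporary forwarder blocks
   that the function creates and then removes:

       peeling at the exit edge          peeling at the entry edge

           preheader                         preheader
               |                                 |
             LOOP                            LOOP copy
               |                                 |
           LOOP copy                           LOOP
               |                                 |
         original exit                     original exit           */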
1030 class loop *
1031 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1032 class loop *scalar_loop, edge e)
1034 class loop *new_loop;
1035 basic_block *new_bbs, *bbs, *pbbs;
1036 bool at_exit;
1037 bool was_imm_dom;
1038 basic_block exit_dest;
1039 edge exit, new_exit;
1040 bool duplicate_outer_loop = false;
1042 exit = single_exit (loop);
1043 at_exit = (e == exit);
1044 if (!at_exit && e != loop_preheader_edge (loop))
1045 return NULL;
1047 if (scalar_loop == NULL)
1048 scalar_loop = loop;
1050 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1051 pbbs = bbs + 1;
1052 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1053 /* Allow duplication of outer loops. */
1054 if (scalar_loop->inner)
1055 duplicate_outer_loop = true;
1056 /* Check whether duplication is possible. */
1057 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1059 free (bbs);
1060 return NULL;
1063 /* Generate new loop structure. */
1064 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1065 duplicate_subloops (scalar_loop, new_loop);
1067 exit_dest = exit->dest;
1068 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1069 exit_dest) == loop->header ?
1070 true : false);
1072 /* Also copy the pre-header, this avoids jumping through hoops to
1073 duplicate the loop entry PHI arguments. Create an empty
1074 pre-header unconditionally for this. */
1075 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1076 edge entry_e = single_pred_edge (preheader);
1077 bbs[0] = preheader;
1078 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1080 exit = single_exit (scalar_loop);
1081 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1082 &exit, 1, &new_exit, NULL,
1083 at_exit ? loop->latch : e->src, true);
1084 exit = single_exit (loop);
1085 basic_block new_preheader = new_bbs[0];
1087 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1089 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1090 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1091 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1093 if (scalar_loop != loop)
1095 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1096 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1097 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1098 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1099 header) to have current_def set, so copy them over. */
1100 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1101 exit);
 1102 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch, 0),
 1104 EDGE_SUCC (loop->latch, 0));
1107 if (at_exit) /* Add the loop copy at exit. */
1109 if (scalar_loop != loop)
1111 gphi_iterator gsi;
1112 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1114 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1115 gsi_next (&gsi))
1117 gphi *phi = gsi.phi ();
1118 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1119 location_t orig_locus
1120 = gimple_phi_arg_location_from_edge (phi, e);
1122 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1125 redirect_edge_and_branch_force (e, new_preheader);
1126 flush_pending_stmts (e);
1127 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1128 if (was_imm_dom || duplicate_outer_loop)
1129 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1131 /* And remove the non-necessary forwarder again. Keep the other
1132 one so we have a proper pre-header for the loop at the exit edge. */
1133 redirect_edge_pred (single_succ_edge (preheader),
1134 single_pred (preheader));
1135 delete_basic_block (preheader);
1136 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1137 loop_preheader_edge (scalar_loop)->src);
1139 else /* Add the copy at entry. */
1141 if (scalar_loop != loop)
1143 /* Remove the non-necessary forwarder of scalar_loop again. */
1144 redirect_edge_pred (single_succ_edge (preheader),
1145 single_pred (preheader));
1146 delete_basic_block (preheader);
1147 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1148 loop_preheader_edge (scalar_loop)->src);
1149 preheader = split_edge (loop_preheader_edge (loop));
1150 entry_e = single_pred_edge (preheader);
1153 redirect_edge_and_branch_force (entry_e, new_preheader);
1154 flush_pending_stmts (entry_e);
1155 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1157 redirect_edge_and_branch_force (new_exit, preheader);
1158 flush_pending_stmts (new_exit);
1159 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1161 /* And remove the non-necessary forwarder again. Keep the other
1162 one so we have a proper pre-header for the loop at the exit edge. */
1163 redirect_edge_pred (single_succ_edge (new_preheader),
1164 single_pred (new_preheader));
1165 delete_basic_block (new_preheader);
1166 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1167 loop_preheader_edge (new_loop)->src);
1170 if (scalar_loop != loop)
1172 /* Update new_loop->header PHIs, so that on the preheader
1173 edge they are the ones from loop rather than scalar_loop. */
1174 gphi_iterator gsi_orig, gsi_new;
1175 edge orig_e = loop_preheader_edge (loop);
1176 edge new_e = loop_preheader_edge (new_loop);
1178 for (gsi_orig = gsi_start_phis (loop->header),
1179 gsi_new = gsi_start_phis (new_loop->header);
1180 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1181 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1183 gphi *orig_phi = gsi_orig.phi ();
1184 gphi *new_phi = gsi_new.phi ();
1185 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1186 location_t orig_locus
1187 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1189 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1193 free (new_bbs);
1194 free (bbs);
1196 checking_verify_dominators (CDI_DOMINATORS);
1198 return new_loop;
1202 /* Given the condition expression COND, put it as the last statement of
1203 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1204 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1205 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1206 new edge as irreducible if IRREDUCIBLE_P is true. */
1208 static edge
1209 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1210 basic_block guard_to, basic_block dom_bb,
1211 profile_probability probability, bool irreducible_p)
1213 gimple_stmt_iterator gsi;
1214 edge new_e, enter_e;
1215 gcond *cond_stmt;
1216 gimple_seq gimplify_stmt_list = NULL;
1218 enter_e = EDGE_SUCC (guard_bb, 0);
1219 enter_e->flags &= ~EDGE_FALLTHRU;
1220 enter_e->flags |= EDGE_FALSE_VALUE;
1221 gsi = gsi_last_bb (guard_bb);
1223 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1224 NULL_TREE);
1225 if (gimplify_stmt_list)
1226 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1228 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1229 gsi = gsi_last_bb (guard_bb);
1230 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1232 /* Add new edge to connect guard block to the merge/loop-exit block. */
1233 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1235 new_e->probability = probability;
1236 if (irreducible_p)
1237 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1239 enter_e->probability = probability.invert ();
1240 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1242 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1243 if (enter_e->dest->loop_father->header == enter_e->dest)
1244 split_edge (enter_e);
1246 return new_e;
1250 /* This function verifies that the following restrictions apply to LOOP:
 1251 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
 1252 for an innermost loop, or of 5 basic blocks for an outer loop.
1253 (2) it is single entry, single exit
1254 (3) its exit condition is the last stmt in the header
1255 (4) E is the entry/exit edge of LOOP.
1258 bool
1259 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1261 edge exit_e = single_exit (loop);
1262 edge entry_e = loop_preheader_edge (loop);
1263 gcond *orig_cond = get_loop_exit_condition (loop);
1264 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1265 unsigned int num_bb = loop->inner? 5 : 2;
1267 /* All loops have an outer scope; the only case loop->outer is NULL is for
1268 the function itself. */
1269 if (!loop_outer (loop)
1270 || loop->num_nodes != num_bb
1271 || !empty_block_p (loop->latch)
1272 || !single_exit (loop)
1273 /* Verify that new loop exit condition can be trivially modified. */
1274 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1275 || (e != exit_e && e != entry_e))
1276 return false;
1278 return true;
1281 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1282 in the exit bb and rename all the uses after the loop. This simplifies
1283 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1284 (but normally loop closed SSA form doesn't require virtual PHIs to be
 1285 in the same form). Doing this early simplifies checking which
 1286 uses should be renamed.
1288 If we create a new phi after the loop, return the definition that
1289 applies on entry to the loop, otherwise return null. */
1291 static tree
1292 create_lcssa_for_virtual_phi (class loop *loop)
1294 gphi_iterator gsi;
1295 edge exit_e = single_exit (loop);
1297 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1298 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1300 gphi *phi = gsi.phi ();
1301 for (gsi = gsi_start_phis (exit_e->dest);
1302 !gsi_end_p (gsi); gsi_next (&gsi))
1303 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1304 break;
1305 if (gsi_end_p (gsi))
1307 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1308 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1309 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1310 imm_use_iterator imm_iter;
1311 gimple *stmt;
1312 use_operand_p use_p;
1314 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1315 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1316 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1317 gimple_phi_set_result (new_phi, new_vop);
1318 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1319 if (stmt != new_phi
1320 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1321 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1322 SET_USE (use_p, new_vop);
1324 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1326 break;
1328 return NULL_TREE;
1331 /* Function vect_get_loop_location.
1333 Extract the location of the loop in the source code.
1334 If the loop is not well formed for vectorization, an estimated
1335 location is calculated.
 1336 Return the loop location if found and an empty location if not. */
1338 dump_user_location_t
1339 find_loop_location (class loop *loop)
1341 gimple *stmt = NULL;
1342 basic_block bb;
1343 gimple_stmt_iterator si;
1345 if (!loop)
1346 return dump_user_location_t ();
1348 stmt = get_loop_exit_condition (loop);
1350 if (stmt
1351 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1352 return stmt;
1354 /* If we got here the loop is probably not "well formed",
1355 try to estimate the loop location */
1357 if (!loop->header)
1358 return dump_user_location_t ();
1360 bb = loop->header;
1362 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1364 stmt = gsi_stmt (si);
1365 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1366 return stmt;
1369 return dump_user_location_t ();
1372 /* Return true if the phi described by STMT_INFO defines an IV of the
1373 loop to be vectorized. */
1375 static bool
1376 iv_phi_p (stmt_vec_info stmt_info)
1378 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1379 if (virtual_operand_p (PHI_RESULT (phi)))
1380 return false;
1382 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1383 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1384 return false;
1386 return true;
1389 /* Function vect_can_advance_ivs_p
1391 In case the number of iterations that LOOP iterates is unknown at compile
1392 time, an epilog loop will be generated, and the loop induction variables
1393 (IVs) will be "advanced" to the value they are supposed to take just before
1394 the epilog loop. Here we check that the access function of the loop IVs
1395 and the expression that represents the loop bound are simple enough.
1396 These restrictions will be relaxed in the future. */
1398 bool
1399 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 basic_block bb = loop->header;
1403 gphi_iterator gsi;
1405 /* Analyze phi functions of the loop header. */
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1409 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1411 tree evolution_part;
1413 gphi *phi = gsi.phi ();
1414 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1417 phi_info->stmt);
1419 /* Skip virtual phi's. The data dependences that are associated with
1420 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1422 Skip reduction phis. */
1423 if (!iv_phi_p (phi_info))
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "reduc or virtual phi. skip.\n");
1428 continue;
1431 /* Analyze the evolution function. */
1433 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1434 if (evolution_part == NULL_TREE)
1436 if (dump_enabled_p ())
1437 dump_printf (MSG_MISSED_OPTIMIZATION,
1438 "No access function or evolution.\n");
1439 return false;
1442 /* FORNOW: We do not transform initial conditions of IVs
1443 which evolution functions are not invariants in the loop. */
1445 if (!expr_invariant_in_loop_p (loop, evolution_part))
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449 "evolution not invariant in loop.\n");
1450 return false;
1453 /* FORNOW: We do not transform initial conditions of IVs
1454 which evolution functions are a polynomial of degree >= 2. */
1456 if (tree_is_chrec (evolution_part))
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1460 "evolution is chrec.\n");
1461 return false;
1465 return true;
1469 /* Function vect_update_ivs_after_vectorizer.
1471 "Advance" the induction variables of LOOP to the value they should take
1472 after the execution of LOOP. This is currently necessary because the
1473 vectorizer does not handle induction variables that are used after the
1474 loop. Such a situation occurs when the last iterations of LOOP are
1475 peeled, because:
1476 1. We introduced new uses after LOOP for IVs that were not originally used
1477 after LOOP: the IVs of LOOP are now used by an epilog loop.
1478 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1479 times, whereas the loop IVs should be bumped N times.
1481 Input:
1482 - LOOP - a loop that is going to be vectorized. The last few iterations
1483 of LOOP were peeled.
1484 - NITERS - the number of iterations that LOOP executes (before it is
1485 vectorized). i.e, the number of times the ivs should be bumped.
1486 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1487 coming out from LOOP on which there are uses of the LOOP ivs
1488 (this is the path from LOOP->exit to epilog_loop->preheader).
1490 The new definitions of the ivs are placed in LOOP->exit.
1491 The phi args associated with the edge UPDATE_E in the bb
1492 UPDATE_E->dest are updated accordingly.
1494 Assumption 1: Like the rest of the vectorizer, this function assumes
1495 a single loop exit that has a single predecessor.
1497 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1498 organized in the same order.
1500 Assumption 3: The access function of the ivs is simple enough (see
1501 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1503 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1504 coming out of LOOP on which the ivs of LOOP are used (this is the path
1505 that leads to the epilog loop; other paths skip the epilog loop). This
1506 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1507 needs to have its phis updated.
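/* A small example of the intended effect, with values assumed purely for
   illustration: for an IV with initial value INIT and loop-invariant step S,
   the value made available to the epilog is

       ni = INIT + NITERS * S

   e.g. INIT = 0, S = 4, NITERS = 20 gives 80, exactly what the scalar loop
   would have left in the IV after 20 iterations, even though the vector loop
   itself executed only about NITERS / VF times.  */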
1510 static void
1511 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1512 tree niters, edge update_e)
1514 gphi_iterator gsi, gsi1;
1515 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1516 basic_block update_bb = update_e->dest;
1517 basic_block exit_bb = single_exit (loop)->dest;
1519 /* Make sure there exists a single-predecessor exit bb: */
1520 gcc_assert (single_pred_p (exit_bb));
1521 gcc_assert (single_succ_edge (exit_bb) == update_e);
1523 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1524 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1525 gsi_next (&gsi), gsi_next (&gsi1))
1527 tree init_expr;
1528 tree step_expr, off;
1529 tree type;
1530 tree var, ni, ni_name;
1531 gimple_stmt_iterator last_gsi;
1533 gphi *phi = gsi.phi ();
1534 gphi *phi1 = gsi1.phi ();
1535 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1536 if (dump_enabled_p ())
1537 dump_printf_loc (MSG_NOTE, vect_location,
1538 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1540 /* Skip reduction and virtual phis. */
1541 if (!iv_phi_p (phi_info))
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_NOTE, vect_location,
1545 "reduc or virtual phi. skip.\n");
1546 continue;
1549 type = TREE_TYPE (gimple_phi_result (phi));
1550 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1551 step_expr = unshare_expr (step_expr);
1553 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1554 of degree >= 2 or exponential. */
1555 gcc_assert (!tree_is_chrec (step_expr));
1557 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1559 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1560 fold_convert (TREE_TYPE (step_expr), niters),
1561 step_expr);
1562 if (POINTER_TYPE_P (type))
1563 ni = fold_build_pointer_plus (init_expr, off);
1564 else
1565 ni = fold_build2 (PLUS_EXPR, type,
1566 init_expr, fold_convert (type, off));
1568 var = create_tmp_var (type, "tmp");
1570 last_gsi = gsi_last_bb (exit_bb);
1571 gimple_seq new_stmts = NULL;
1572 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1573 /* Exit_bb shouldn't be empty. */
1574 if (!gsi_end_p (last_gsi))
1575 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1576 else
1577 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1579 /* Fix phi expressions in the successor bb. */
1580 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1584 /* Return a gimple value containing the misalignment (measured in vector
1585 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1586 it is away from a perfectly aligned address. Add any new statements
1587 to SEQ. */
1589 static tree
1590 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1592 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1593 stmt_vec_info stmt_info = dr_info->stmt;
1594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1596 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1597 unsigned HOST_WIDE_INT target_align_c;
1598 tree target_align_minus_1;
1600 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1601 size_zero_node) < 0;
1602 tree offset = (negative
1603 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1604 : size_zero_node);
1605 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1606 stmt_info, seq,
1607 offset);
1608 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1609 if (target_align.is_constant (&target_align_c))
1610 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1611 else
1613 tree vla = build_int_cst (type, target_align);
1614 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1615 fold_build2 (MINUS_EXPR, type,
1616 build_int_cst (type, 0), vla));
1617 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1618 build_int_cst (type, 1));
1621 HOST_WIDE_INT elem_size
1622 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1623 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1625 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1626 tree int_start_addr = fold_convert (type, start_addr);
1627 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1628 target_align_minus_1);
1630 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1631 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1632 elem_size_log);
1634 return misalign_in_elems;
1637 /* Function vect_gen_prolog_loop_niters
1639 Generate the number of iterations which should be peeled as prolog for the
1640 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1641 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1642 As a result, after the execution of this loop, the data reference DR will
1643 refer to an aligned location. The following computation is generated:
1645 If the misalignment of DR is known at compile time:
1646 addr_mis = int mis = DR_MISALIGNMENT (dr);
1647 Else, compute address misalignment in bytes:
1648 addr_mis = addr & (target_align - 1)
1650 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1652 (elem_size = element type size; an element is the scalar element whose type
1653 is the inner type of the vectype)
 1655 The computations will be emitted at the end of BB. We also compute and
 1656 store the upper bound (inclusive) of the result in BOUND.
1658 When the step of the data-ref in the loop is not 1 (as in interleaved data
1659 and SLP), the number of iterations of the prolog must be divided by the step
1660 (which is equal to the size of interleaved group).
1662 The above formulas assume that VF == number of elements in the vector. This
1663 may not hold when there are multiple-types in the loop.
1664 In this case, for some data-references in the loop the VF does not represent
1665 the number of elements that fit in the vector. Therefore, instead of VF we
1666 use TYPE_VECTOR_SUBPARTS. */
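/* Worked example, with numbers assumed only for illustration: for 4-byte
   elements, a 16-byte target alignment (4 elements per vector) and a start
   address that is misaligned by 8 bytes, addr_mis / elem_size = 2 and
   ((4 - 2) & 3) / 1 = 2, i.e. peeling two scalar iterations advances the
   access by 8 bytes onto a 16-byte boundary.  */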
1668 static tree
1669 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1670 basic_block bb, int *bound)
1672 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1673 tree var;
1674 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1675 gimple_seq stmts = NULL, new_stmts = NULL;
1676 tree iters, iters_name;
1677 stmt_vec_info stmt_info = dr_info->stmt;
1678 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1679 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1681 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1683 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "known peeling = %d.\n", npeel);
1689 iters = build_int_cst (niters_type, npeel);
1690 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1692 else
1694 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1695 tree type = TREE_TYPE (misalign_in_elems);
1696 HOST_WIDE_INT elem_size
1697 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1698 /* We only do prolog peeling if the target alignment is known at compile
1699 time. */
1700 poly_uint64 align_in_elems =
1701 exact_div (target_align, elem_size);
1702 tree align_in_elems_minus_1 =
1703 build_int_cst (type, align_in_elems - 1);
1704 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1706 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1707 & (align_in_elems - 1)). */
1708 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1709 size_zero_node) < 0;
1710 if (negative)
1711 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1712 align_in_elems_tree);
1713 else
1714 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1715 misalign_in_elems);
1716 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1717 iters = fold_convert (niters_type, iters);
1718 unsigned HOST_WIDE_INT align_in_elems_c;
1719 if (align_in_elems.is_constant (&align_in_elems_c))
1720 *bound = align_in_elems_c - 1;
1721 else
1722 *bound = -1;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "niters for prolog loop: %T\n", iters);
1729 var = create_tmp_var (niters_type, "prolog_loop_niters");
1730 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1732 if (new_stmts)
1733 gimple_seq_add_seq (&stmts, new_stmts);
1734 if (stmts)
1736 gcc_assert (single_succ_p (bb));
1737 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1738 if (gsi_end_p (gsi))
1739 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1740 else
1741 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1743 return iters_name;
1747 /* Function vect_update_init_of_dr
1749 If CODE is PLUS, the vector loop starts NITERS iterations after the
1750 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1751 iterations before the scalar one (using masking to skip inactive
1752 elements). This function updates the information recorded in DR to
1753 account for the difference. Specifically, it updates the OFFSET
1754 field of DR_INFO. */
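/* For instance (hypothetical values): with NITERS = 2 and DR_STEP = 8 bytes,
   a PLUS_EXPR CODE adds 2 * 8 = 16 bytes to DR_INFO's offset, so the vector
   loop's first access starts where the peeled scalar iterations left off;
   MINUS_EXPR subtracts the same amount instead.  */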
1756 static void
1757 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1759 struct data_reference *dr = dr_info->dr;
1760 tree offset = dr_info->offset;
1761 if (!offset)
1762 offset = build_zero_cst (sizetype);
1764 niters = fold_build2 (MULT_EXPR, sizetype,
1765 fold_convert (sizetype, niters),
1766 fold_convert (sizetype, DR_STEP (dr)));
1767 offset = fold_build2 (code, sizetype,
1768 fold_convert (sizetype, offset), niters);
1769 dr_info->offset = offset;
1773 /* Function vect_update_inits_of_drs
1775 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1776 CODE and NITERS are as for vect_update_init_of_dr. */
1778 void
1779 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1780 tree_code code)
1782 unsigned int i;
1783 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1784 struct data_reference *dr;
1786 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1788 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1789 here, but since we might use these niters to update the epilogue's niters
1790 and data references we can't insert them here as this definition might not
1791 always dominate its uses. */
1792 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1793 niters = fold_convert (sizetype, niters);
1795 FOR_EACH_VEC_ELT (datarefs, i, dr)
1797 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1798 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1799 vect_update_init_of_dr (dr_info, niters, code);
1803 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1804 by masking. This involves calculating the number of iterations to
1805 be peeled and then aligning all memory references appropriately. */
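/* As an illustrative sketch of the code below (numbers chosen arbitrarily):
   with VF = 4 and a known peeling-for-alignment count of 3,
   misalign_in_elems = 4 - 3 = 1, so the data references are moved back by
   one element (MINUS_EXPR), the loop then starts at the aligned address,
   and LOOP_VINFO_MASK_SKIP_NITERS records that the first lane of the first
   iteration must be masked off.  */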
1807 void
1808 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1810 tree misalign_in_elems;
1811 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1813 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1815 /* From the information recorded in LOOP_VINFO get the number of iterations
1816 that need to be skipped via masking. */
1817 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1819 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1820 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1821 misalign_in_elems = build_int_cst (type, misalign);
1823 else
1825 gimple_seq seq1 = NULL, seq2 = NULL;
1826 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1827 misalign_in_elems = fold_convert (type, misalign_in_elems);
1828 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1829 &seq2, true, NULL_TREE);
1830 gimple_seq_add_seq (&seq1, seq2);
1831 if (seq1)
1833 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1834 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1835 gcc_assert (!new_bb);
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "misalignment for fully-masked loop: %T\n",
1842 misalign_in_elems);
1844 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1846 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1849 /* This function builds ni_name = number of iterations. Statements
1850 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1851 it to TRUE if new ssa_var is generated. */
1853 tree
1854 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1856 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1857 if (TREE_CODE (ni) == INTEGER_CST)
1858 return ni;
1859 else
1861 tree ni_name, var;
1862 gimple_seq stmts = NULL;
1863 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1865 var = create_tmp_var (TREE_TYPE (ni), "niters");
1866 ni_name = force_gimple_operand (ni, &stmts, false, var);
1867 if (stmts)
1869 gsi_insert_seq_on_edge_immediate (pe, stmts);
1870 if (new_var_p != NULL)
1871 *new_var_p = true;
1874 return ni_name;
1878 /* Calculate the number of iterations above which the vectorized loop will be
1879 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1880 of the prolog loop. If it's an integer constant, its value is also passed
1881 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1882 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1883 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1884 threshold below which the scalar (rather than vectorized) loop will be
1885 executed. This function stores the upper bound (inclusive) of the result
1886 in BOUND_SCALAR. */
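/* A hypothetical example of the profitability clamp below: with TH = 8
   (so at most 8 - 1 = 7 scalar iterations are worthwhile), a constant prolog
   of 3 iterations and BOUND_EPILOG = 4, the result is MAX (3 + 4, 7) = 7.
   If the prolog count is only known at runtime but BOUND_PROLOG = 3, then
   7 >= 3 + 4 still lets us return the constant 7 instead of emitting a
   runtime MAX_EXPR.  */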
1888 static tree
1889 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1890 int bound_prolog, poly_int64 bound_epilog, int th,
1891 poly_uint64 *bound_scalar,
1892 bool check_profitability)
1894 tree type = TREE_TYPE (niters_prolog);
1895 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1896 build_int_cst (type, bound_epilog));
1898 *bound_scalar = bound_prolog + bound_epilog;
1899 if (check_profitability)
1901 /* TH indicates the minimum niters of vectorized loop, while we
1902 compute the maximum niters of scalar loop. */
1903 th--;
1904 /* Peeling for constant times. */
1905 if (int_niters_prolog >= 0)
1907 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1908 return build_int_cst (type, *bound_scalar);
1910 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1911 and BOUND_EPILOG are inclusive upper bounds. */
1912 if (known_ge (th, bound_prolog + bound_epilog))
1914 *bound_scalar = th;
1915 return build_int_cst (type, th);
1917 /* Need to do runtime comparison. */
1918 else if (maybe_gt (th, bound_epilog))
1920 *bound_scalar = upper_bound (*bound_scalar, th);
1921 return fold_build2 (MAX_EXPR, type,
1922 build_int_cst (type, th), niters);
1925 return niters;
1928 /* NITERS is the number of times that the original scalar loop executes
1929 after peeling. Work out the maximum number of iterations N that can
1930 be handled by the vectorized form of the loop and then either:
1932 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1934 niters_vector = N
1936 b) set *STEP_VECTOR_PTR to one and generate:
1938 niters_vector = N / vf
1940 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1941 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1942 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
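/* For example (illustrative numbers only): with NITERS = 10 and a constant
   VF = 4 without partial vectors, case (b) applies and we emit either
   niters_vector = 10 >> 2 = 2 when NITERS cannot overflow, or the
   overflow-safe ((10 - 4) >> 2) + 1 = 2 otherwise; with a variable VF or
   partial vectors, case (a) applies and the loop instead steps by VF.  */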
1944 void
1945 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1946 tree *niters_vector_ptr, tree *step_vector_ptr,
1947 bool niters_no_overflow)
1949 tree ni_minus_gap, var;
1950 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1951 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1952 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1953 tree log_vf = NULL_TREE;
1955 /* If epilogue loop is required because of data accesses with gaps, we
1956 subtract one iteration from the total number of iterations here for
1957 correct calculation of RATIO. */
1958 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1960 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1961 build_one_cst (type));
1962 if (!is_gimple_val (ni_minus_gap))
1964 var = create_tmp_var (type, "ni_gap");
1965 gimple *stmts = NULL;
1966 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1967 true, var);
1968 gsi_insert_seq_on_edge_immediate (pe, stmts);
1971 else
1972 ni_minus_gap = niters;
1974 unsigned HOST_WIDE_INT const_vf;
1975 if (vf.is_constant (&const_vf)
1976 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1978 /* Create: niters >> log2(vf) */
1979 /* If it's known that niters == number of latch executions + 1 doesn't
1980 overflow, we can generate niters >> log2(vf); otherwise we generate
1981 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1982 will be at least one. */
1983 log_vf = build_int_cst (type, exact_log2 (const_vf));
1984 if (niters_no_overflow)
1985 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1986 else
1987 niters_vector
1988 = fold_build2 (PLUS_EXPR, type,
1989 fold_build2 (RSHIFT_EXPR, type,
1990 fold_build2 (MINUS_EXPR, type,
1991 ni_minus_gap,
1992 build_int_cst (type, vf)),
1993 log_vf),
1994 build_int_cst (type, 1));
1995 step_vector = build_one_cst (type);
1997 else
1999 niters_vector = ni_minus_gap;
2000 step_vector = build_int_cst (type, vf);
2003 if (!is_gimple_val (niters_vector))
2005 var = create_tmp_var (type, "bnd");
2006 gimple_seq stmts = NULL;
2007 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2008 gsi_insert_seq_on_edge_immediate (pe, stmts);
2009 /* The peeling algorithm guarantees that the vector loop bound is at least ONE,
2010 so we set range information to make the niters analyzer's life easier. */
2011 if (stmts != NULL && log_vf)
2012 set_range_info (niters_vector, VR_RANGE,
2013 wi::to_wide (build_int_cst (type, 1)),
2014 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
2015 TYPE_MAX_VALUE (type),
2016 log_vf)));
2018 *niters_vector_ptr = niters_vector;
2019 *step_vector_ptr = step_vector;
2021 return;
2024 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2025 loop specified by LOOP_VINFO after vectorization, compute the number
2026 of iterations before vectorization (niters_vector * vf) and store it
2027 to NITERS_VECTOR_MULT_VF_PTR. */
2029 static void
2030 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2031 tree niters_vector,
2032 tree *niters_vector_mult_vf_ptr)
2034 /* We should be using a step_vector of VF if VF is variable. */
2035 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2036 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2037 tree type = TREE_TYPE (niters_vector);
2038 tree log_vf = build_int_cst (type, exact_log2 (vf));
2039 basic_block exit_bb = single_exit (loop)->dest;
2041 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2042 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2043 niters_vector, log_vf);
2044 if (!is_gimple_val (niters_vector_mult_vf))
2046 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2047 gimple_seq stmts = NULL;
2048 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2049 &stmts, true, var);
2050 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2051 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2053 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2056 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP;
2057 this function searches for the corresponding lcssa phi node in the exit
2058 bb of LOOP. If it is found, return the phi result; otherwise return
2059 NULL. */
2061 static tree
2062 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2063 gphi *lcssa_phi)
2065 gphi_iterator gsi;
2066 edge e = single_exit (loop);
2068 gcc_assert (single_pred_p (e->dest));
2069 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2071 gphi *phi = gsi.phi ();
2072 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2073 PHI_ARG_DEF (lcssa_phi, 0), 0))
2074 return PHI_RESULT (phi);
2076 return NULL_TREE;
2079 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2080 from SECOND/FIRST and puts it at the original loop's preheader/exit
2081 edge; the two loops are arranged as below:
2083 preheader_a:
2084 first_loop:
2085 header_a:
2086 i_1 = PHI<i_0, i_2>;
2088 i_2 = i_1 + 1;
2089 if (cond_a)
2090 goto latch_a;
2091 else
2092 goto between_bb;
2093 latch_a:
2094 goto header_a;
2096 between_bb:
2097 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2099 second_loop:
2100 header_b:
2101 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2102 or with i_2 if no LCSSA phi is created
2103 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2105 i_4 = i_3 + 1;
2106 if (cond_b)
2107 goto latch_b;
2108 else
2109 goto exit_bb;
2110 latch_b:
2111 goto header_b;
2113 exit_bb:
2115 This function creates loop closed SSA for the first loop and updates the
2116 second loop's PHI nodes by replacing the argument on the incoming edge with the
2117 result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2118 is false, loop closed ssa phis will only be created for non-iv phis of
2119 the first loop.
2121 This function assumes the exit bb of the first loop is the preheader bb of the
2122 second loop, i.e., between_bb in the example code. With PHIs updated,
2123 the second loop will execute the remaining iterations of the first. */
2125 static void
2126 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2127 class loop *first, class loop *second,
2128 bool create_lcssa_for_iv_phis)
2130 gphi_iterator gsi_update, gsi_orig;
2131 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2133 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2134 edge second_preheader_e = loop_preheader_edge (second);
2135 basic_block between_bb = single_exit (first)->dest;
2137 gcc_assert (between_bb == second_preheader_e->src);
2138 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2139 /* Either the first loop or the second is the loop to be vectorized. */
2140 gcc_assert (loop == first || loop == second);
2142 for (gsi_orig = gsi_start_phis (first->header),
2143 gsi_update = gsi_start_phis (second->header);
2144 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2145 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2147 gphi *orig_phi = gsi_orig.phi ();
2148 gphi *update_phi = gsi_update.phi ();
2150 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2151 /* Generate lcssa PHI node for the first loop. */
2152 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2153 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2154 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2156 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2157 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2158 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2159 arg = new_res;
2162 /* Update PHI node in the second loop by replacing arg on the loop's
2163 incoming edge. */
2164 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2167 /* For epilogue peeling we have to make sure to copy all LC PHIs
2168 for correct vectorization of live stmts. */
2169 if (loop == first)
2171 basic_block orig_exit = single_exit (second)->dest;
2172 for (gsi_orig = gsi_start_phis (orig_exit);
2173 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2175 gphi *orig_phi = gsi_orig.phi ();
2176 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2177 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2178 continue;
2180 /* Already created in the above loop. */
2181 if (find_guard_arg (first, second, orig_phi))
2182 continue;
2184 tree new_res = copy_ssa_name (orig_arg);
2185 gphi *lcphi = create_phi_node (new_res, between_bb);
2186 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2191 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2192 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2193 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2194 appear like below:
2196 guard_bb:
2197 if (cond)
2198 goto merge_bb;
2199 else
2200 goto skip_loop;
2202 skip_loop:
2203 header_a:
2204 i_1 = PHI<i_0, i_2>;
2206 i_2 = i_1 + 1;
2207 if (cond_a)
2208 goto latch_a;
2209 else
2210 goto exit_a;
2211 latch_a:
2212 goto header_a;
2214 exit_a:
2215 i_5 = PHI<i_2>;
2217 merge_bb:
2218 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2220 update_loop:
2221 header_b:
2222 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2224 i_4 = i_3 + 1;
2225 if (cond_b)
2226 goto latch_b;
2227 else
2228 goto exit_bb;
2229 latch_b:
2230 goto header_b;
2232 exit_bb:
2234 This function creates PHI nodes at merge_bb and replaces the use of i_5
2235 in the update_loop's PHI node with the new PHI's result. */
2237 static void
2238 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2239 class loop *update_loop,
2240 edge guard_edge, edge merge_edge)
2242 location_t merge_loc, guard_loc;
2243 edge orig_e = loop_preheader_edge (skip_loop);
2244 edge update_e = loop_preheader_edge (update_loop);
2245 gphi_iterator gsi_orig, gsi_update;
2247 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2248 gsi_update = gsi_start_phis (update_loop->header));
2249 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2250 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2252 gphi *orig_phi = gsi_orig.phi ();
2253 gphi *update_phi = gsi_update.phi ();
2255 /* Generate new phi node at merge bb of the guard. */
2256 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2257 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2259 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2260 args in NEW_PHI for these edges. */
2261 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2262 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2263 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2264 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2265 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2266 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2268 /* Update phi in UPDATE_PHI. */
2269 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2273 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2274 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2275 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2276 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2277 The CFG looks like:
2279 loop:
2280 header_a:
2281 i_1 = PHI<i_0, i_2>;
2283 i_2 = i_1 + 1;
2284 if (cond_a)
2285 goto latch_a;
2286 else
2287 goto exit_a;
2288 latch_a:
2289 goto header_a;
2291 exit_a:
2293 guard_bb:
2294 if (cond)
2295 goto merge_bb;
2296 else
2297 goto epilog_loop;
2299 ;; fall_through_bb
2301 epilog_loop:
2302 header_b:
2303 i_3 = PHI<i_2, i_4>;
2305 i_4 = i_3 + 1;
2306 if (cond_b)
2307 goto latch_b;
2308 else
2309 goto merge_bb;
2310 latch_b:
2311 goto header_b;
2313 merge_bb:
2314 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2316 exit_bb:
2317 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2319 For each name used outside EPILOG (i.e. for each name that has an lcssa
2320 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2321 args corresponding to GUARD_EDGE and MERGE_EDGE. The arg for MERGE_EDGE is
2322 the arg of the original PHI in exit_bb, while the arg for GUARD_EDGE is defined
2323 by LOOP and is found in the exit bb of LOOP. The arg of the original PHI
2324 in exit_bb will also be updated. */
2326 static void
2327 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2328 edge guard_edge, edge merge_edge)
2330 gphi_iterator gsi;
2331 basic_block merge_bb = guard_edge->dest;
2333 gcc_assert (single_succ_p (merge_bb));
2334 edge e = single_succ_edge (merge_bb);
2335 basic_block exit_bb = e->dest;
2336 gcc_assert (single_pred_p (exit_bb));
2337 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2339 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2341 gphi *update_phi = gsi.phi ();
2342 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2344 tree merge_arg = NULL_TREE;
2346 /* If the old argument is a SSA_NAME use its current_def. */
2347 if (TREE_CODE (old_arg) == SSA_NAME)
2348 merge_arg = get_current_def (old_arg);
2349 /* If it's a constant or doesn't have a current_def, just use the old
2350 argument. */
2351 if (!merge_arg)
2352 merge_arg = old_arg;
2354 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2355 /* If the var is live after loop but not a reduction, we simply
2356 use the old arg. */
2357 if (!guard_arg)
2358 guard_arg = old_arg;
2360 /* Create new phi node in MERGE_BB: */
2361 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2362 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2364 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2365 the two PHI args in merge_phi for these edges. */
2366 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2367 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2369 /* Update the original phi in exit_bb. */
2370 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2374 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2375 the args of its loop closed ssa PHIs need to be updated. */
2377 static void
2378 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2380 gphi_iterator gsi;
2381 basic_block exit_bb = single_exit (epilog)->dest;
2383 gcc_assert (single_pred_p (exit_bb));
2384 edge e = EDGE_PRED (exit_bb, 0);
2385 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2386 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2389 /* Function vect_do_peeling.
2391 Input:
2392 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2394 preheader:
2395 LOOP:
2396 header_bb:
2397 loop_body
2398 if (exit_loop_cond) goto exit_bb
2399 else goto header_bb
2400 exit_bb:
2402 - NITERS: The number of iterations of the loop.
2403 - NITERSM1: The number of iterations of the loop's latch.
2404 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2405 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2406 CHECK_PROFITABILITY is true.
2407 Output:
2408 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2409 iterate after vectorization; see vect_set_loop_condition for details.
2410 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2411 should be set to the number of scalar iterations handled by the
2412 vector loop. The SSA name is only used on exit from the loop.
2414 This function peels prolog and epilog from the loop and adds guards that skip
2415 PROLOG and EPILOG under various conditions. As a result, the changed CFG
2416 would look like:
2418 guard_bb_1:
2419 if (prefer_scalar_loop) goto merge_bb_1
2420 else goto guard_bb_2
2422 guard_bb_2:
2423 if (skip_prolog) goto merge_bb_2
2424 else goto prolog_preheader
2426 prolog_preheader:
2427 PROLOG:
2428 prolog_header_bb:
2429 prolog_body
2430 if (exit_prolog_cond) goto prolog_exit_bb
2431 else goto prolog_header_bb
2432 prolog_exit_bb:
2434 merge_bb_2:
2436 vector_preheader:
2437 VECTOR LOOP:
2438 vector_header_bb:
2439 vector_body
2440 if (exit_vector_cond) goto vector_exit_bb
2441 else goto vector_header_bb
2442 vector_exit_bb:
2444 guard_bb_3:
2445 if (skip_epilog) goto merge_bb_3
2446 else goto epilog_preheader
2448 merge_bb_1:
2450 epilog_preheader:
2451 EPILOG:
2452 epilog_header_bb:
2453 epilog_body
2454 if (exit_epilog_cond) goto merge_bb_3
2455 else goto epilog_header_bb
2457 merge_bb_3:
2459 Note this function peels the prolog and epilog, and adds the guards,
2460 only when necessary.
2461 This function returns the epilogue loop if a decision was made to vectorize
2462 it, otherwise NULL.
2464 The analysis resulting in this epilogue loop's loop_vec_info was performed
2465 in the same vect_analyze_loop call as the main loop's. At that time
2466 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2467 vectorization factors than the main loop. This list is stored in the main
2468 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2469 vectorize the epilogue loop for a lower vectorization factor, the
2470 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2471 updated and linked to the epilogue loop. This is later used to vectorize
2472 the epilogue. The reason the loop_vec_info needs updating is that it was
2473 constructed based on the original main loop, and the epilogue loop is a
2474 copy of this loop, so all links pointing to statements in the original loop
2475 need updating. Furthermore, these loop_vec_infos share the
2476 data_reference's records, which will also need to be updated.
2478 TODO: Guard for prefer_scalar_loop should be emitted along with
2479 versioning conditions if loop versioning is needed. */
2482 class loop *
2483 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2484 tree *niters_vector, tree *step_vector,
2485 tree *niters_vector_mult_vf_var, int th,
2486 bool check_profitability, bool niters_no_overflow,
2487 tree *advance)
2489 edge e, guard_e;
2490 tree type = TREE_TYPE (niters), guard_cond;
2491 basic_block guard_bb, guard_to;
2492 profile_probability prob_prolog, prob_vector, prob_epilog;
2493 int estimated_vf;
2494 int prolog_peeling = 0;
2495 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2496 /* We currently do not support prolog peeling if the target alignment is not
2497 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2498 target alignment being constant. */
2499 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2500 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2501 return NULL;
2503 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2504 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2506 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2507 poly_uint64 bound_epilog = 0;
2508 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2509 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2510 bound_epilog += vf - 1;
2511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2512 bound_epilog += 1;
2513 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2514 poly_uint64 bound_scalar = bound_epilog;
2516 if (!prolog_peeling && !epilog_peeling)
2517 return NULL;
2519 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2520 estimated_vf = vect_vf_for_cost (loop_vinfo);
2521 if (estimated_vf == 2)
2522 estimated_vf = 3;
2523 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2524 .apply_scale (estimated_vf - 1, estimated_vf);
2526 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2527 class loop *first_loop = loop;
2528 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2530 /* We might have a queued need to update virtual SSA form. As we
2531 delete the update SSA machinery below after doing a regular
2532 incremental SSA update during loop copying, make sure we don't
2533 lose that fact.
2534 ??? Needing to update virtual SSA form by renaming is unfortunate
2535 but not all of the vectorizer code inserting new loads / stores
2536 properly assigns virtual operands to those statements. */
2537 update_ssa (TODO_update_ssa_only_virtuals);
2539 create_lcssa_for_virtual_phi (loop);
2541 /* If we're vectorizing an epilogue loop, the update_ssa above will
2542 have ensured that the virtual operand is in SSA form throughout the
2543 vectorized main loop. Normally it is possible to trace the updated
2544 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2545 back to scalar-stmt vuses, meaning that the effect of the SSA update
2546 remains local to the main loop. However, there are rare cases in
2547 which the vectorized loop has vdefs even when the original scalar
2548 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2549 introduces clobbers of the temporary vector array, which in turn
2550 needs new vdefs. If the scalar loop doesn't write to memory, these
2551 new vdefs will be the only ones in the vector loop.
2553 In that case, update_ssa will have added a new virtual phi to the
2554 main loop, which previously didn't need one. Ensure that we (locally)
2555 maintain LCSSA form for the virtual operand, just as we would have
2556 done if the virtual phi had existed from the outset. This makes it
2557 easier to duplicate the scalar epilogue loop below. */
2558 tree vop_to_rename = NULL_TREE;
2559 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2561 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2562 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2565 if (MAY_HAVE_DEBUG_BIND_STMTS)
2567 gcc_assert (!adjust_vec.exists ());
2568 adjust_vec.create (32);
2570 initialize_original_copy_tables ();
2572 /* Record the anchor bb at which the guard should be placed if the scalar
2573 loop might be preferred. */
2574 basic_block anchor = loop_preheader_edge (loop)->src;
2576 /* Generate the number of iterations for the prolog loop. We do this here
2577 so that we can also get the upper bound on the number of iterations. */
2578 tree niters_prolog;
2579 int bound_prolog = 0;
2580 if (prolog_peeling)
2581 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2582 &bound_prolog);
2583 else
2584 niters_prolog = build_int_cst (type, 0);
2586 loop_vec_info epilogue_vinfo = NULL;
2587 if (vect_epilogues)
2589 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2590 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2593 tree niters_vector_mult_vf = NULL_TREE;
2594 /* Save NITERS before the loop, as it may be changed by the prologue. */
2595 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2596 edge update_e = NULL, skip_e = NULL;
2597 unsigned int lowest_vf = constant_lower_bound (vf);
2598 /* If we know the number of scalar iterations for the main loop we should
2599 check whether after the main loop there are enough iterations left over
2600 for the epilogue. */
2601 if (vect_epilogues
2602 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2603 && prolog_peeling >= 0
2604 && known_eq (vf, lowest_vf)
2605 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (epilogue_vinfo))
2607 unsigned HOST_WIDE_INT eiters
2608 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2609 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2611 eiters -= prolog_peeling;
2612 eiters
2613 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2615 unsigned int ratio;
2616 unsigned int epilogue_gaps
2617 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2618 while (!(constant_multiple_p
2619 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2620 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2621 && eiters >= lowest_vf / ratio + epilogue_gaps))
2623 delete epilogue_vinfo;
2624 epilogue_vinfo = NULL;
2625 if (loop_vinfo->epilogue_vinfos.length () == 0)
2627 vect_epilogues = false;
2628 break;
2630 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2631 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2632 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2635 /* Prolog loop may be skipped. */
2636 bool skip_prolog = (prolog_peeling != 0);
2637 /* Skip this loop to epilog when there are not enough iterations to enter this
2638 vectorized loop. If true we should perform runtime checks on the NITERS
2639 to check whether we should skip the current vectorized loop. If we know
2640 the number of scalar iterations, we only add such a runtime check when that
2641 number may be smaller than the number of iterations required to enter this
2642 loop; for this we use the upper bounds on the prolog and epilog peeling.
2643 When we don't know the number of iterations and don't require versioning it
2644 is because we have asserted that there are enough scalar iterations to enter
2645 the main loop, so this skip is not necessary. When we are versioning then
2646 we only add such a skip if we have chosen to vectorize the epilogue. */
2649 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2650 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2651 bound_prolog + bound_epilog)
2652 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2653 || vect_epilogues));
2654 /* Epilog loop must be executed if the number of iterations for epilog
2655 loop is known at compile time, otherwise we need to add a check at
2656 the end of vector loop and skip to the end of epilog loop. */
2657 bool skip_epilog = (prolog_peeling < 0
2658 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2659 || !vf.is_constant ());
2660 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2661 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2662 skip_epilog = false;
2664 if (skip_vector)
2666 split_edge (loop_preheader_edge (loop));
2668 /* Due to the order in which we peel prolog and epilog, we first
2669 propagate probability to the whole loop. The purpose is to
2670 avoid adjusting probabilities of both prolog and vector loops
2671 separately. Note in this case, the probability of epilog loop
2672 needs to be scaled back later. */
2673 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2674 if (prob_vector.initialized_p ())
2676 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2677 scale_loop_profile (loop, prob_vector, 0);
2681 dump_user_location_t loop_loc = find_loop_location (loop);
2682 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2683 if (vect_epilogues)
2684 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2685 use the original scalar loop as remaining epilogue if necessary. */
2686 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2687 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2689 if (prolog_peeling)
2691 e = loop_preheader_edge (loop);
2692 if (!slpeel_can_duplicate_loop_p (loop, e))
2694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2695 "loop can't be duplicated to preheader edge.\n");
2696 gcc_unreachable ();
2698 /* Peel prolog and put it on preheader edge of loop. */
2699 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2700 if (!prolog)
2702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2703 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2704 gcc_unreachable ();
2706 prolog->force_vectorize = false;
2707 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2708 first_loop = prolog;
2709 reset_original_copy_tables ();
2711 /* Update the number of iterations for prolog loop. */
2712 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2713 vect_set_loop_condition (prolog, NULL, niters_prolog,
2714 step_prolog, NULL_TREE, false);
2716 /* Skip the prolog loop. */
2717 if (skip_prolog)
2719 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2720 niters_prolog, build_int_cst (type, 0));
2721 guard_bb = loop_preheader_edge (prolog)->src;
2722 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2723 guard_to = split_edge (loop_preheader_edge (loop));
2724 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2725 guard_to, guard_bb,
2726 prob_prolog.invert (),
2727 irred_flag);
2728 e = EDGE_PRED (guard_to, 0);
2729 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2730 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2732 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2733 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2736 /* Update init address of DRs. */
2737 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2738 /* Update niters for vector loop. */
2739 LOOP_VINFO_NITERS (loop_vinfo)
2740 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2741 LOOP_VINFO_NITERSM1 (loop_vinfo)
2742 = fold_build2 (MINUS_EXPR, type,
2743 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2744 bool new_var_p = false;
2745 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2746 /* It's guaranteed that vector loop bound before vectorization is at
2747 least VF, so set range information for newly generated var. */
2748 if (new_var_p)
2749 set_range_info (niters, VR_RANGE,
2750 wi::to_wide (build_int_cst (type, vf)),
2751 wi::to_wide (TYPE_MAX_VALUE (type)));
2753 /* Prolog iterates at most bound_prolog times, latch iterates at
2754 most bound_prolog - 1 times. */
2755 record_niter_bound (prolog, bound_prolog - 1, false, true);
2756 delete_update_ssa ();
2757 adjust_vec_debug_stmts ();
2758 scev_reset ();
2761 if (epilog_peeling)
2763 e = single_exit (loop);
2764 if (!slpeel_can_duplicate_loop_p (loop, e))
2766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2767 "loop can't be duplicated to exit edge.\n");
2768 gcc_unreachable ();
2770 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2771 said epilog then we should use a copy of the main loop as a starting
2772 point. This loop may have already had some preliminary transformations
2773 to allow for more optimal vectorization, for example if-conversion.
2774 If we are not vectorizing the epilog then we should use the scalar loop
2775 as the transformations mentioned above make little or no sense when not
2776 vectorizing. */
2777 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2778 if (vop_to_rename)
2780 /* Vectorizing the main loop can sometimes introduce a vdef to
2781 a loop that previously didn't have one; see the comment above
2782 the definition of VOP_TO_RENAME for details. The definition
2783 D that holds on E will then be different from the definition
2784 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2785 rename VOP_TO_RENAME to D when copying the loop.
2787 The virtual operand is in LCSSA form for the main loop,
2788 and no stmt between the main loop and E needs a vdef,
2789 so we know that D is provided by a phi rather than by a
2790 vdef on a normal gimple stmt. */
2791 basic_block vdef_bb = e->src;
2792 gphi *vphi;
2793 while (!(vphi = get_virtual_phi (vdef_bb)))
2794 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2795 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2796 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2798 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2799 if (!epilog)
2801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2802 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2803 gcc_unreachable ();
2805 epilog->force_vectorize = false;
2806 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2808 /* The scalar version of the loop may be preferred. In this case, add a guard
2809 and skip to the epilog. Note this only happens when the number of
2810 iterations of the loop is unknown at compile time, otherwise this
2811 won't be vectorized. */
2812 if (skip_vector)
2814 /* An additional epilogue iteration is peeled if a gap exists. */
2815 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2816 bound_prolog, bound_epilog,
2817 th, &bound_scalar,
2818 check_profitability);
2819 /* Build guard against NITERSM1 since NITERS may overflow. */
2820 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2821 guard_bb = anchor;
2822 guard_to = split_edge (loop_preheader_edge (epilog));
2823 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2824 guard_to, guard_bb,
2825 prob_vector.invert (),
2826 irred_flag);
2827 skip_e = guard_e;
2828 e = EDGE_PRED (guard_to, 0);
2829 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2830 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2832 /* Simply propagate profile info from guard_bb to guard_to which is
2833 a merge point of control flow. */
2834 guard_to->count = guard_bb->count;
2836 /* Scale probability of epilog loop back.
2837 FIXME: We should avoid scaling down and back up. Profile may
2838 get lost if we scale down to 0. */
2839 basic_block *bbs = get_loop_body (epilog);
2840 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2841 bbs[i]->count = bbs[i]->count.apply_scale
2842 (bbs[i]->count,
2843 bbs[i]->count.apply_probability
2844 (prob_vector));
2845 free (bbs);
2848 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2849 /* If the loop is peeled a non-zero constant number of times, niters now refers
2850 to orig_niters - prolog_peeling; it won't overflow even if orig_niters
2851 overflows. */
2852 niters_no_overflow |= (prolog_peeling > 0);
2853 vect_gen_vector_loop_niters (loop_vinfo, niters,
2854 niters_vector, step_vector,
2855 niters_no_overflow);
2856 if (!integer_onep (*step_vector))
2858 /* On exit from the loop we will have an easy way of calculating
2859 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2860 until then. */
2861 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2862 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2863 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2865 else
2866 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2867 &niters_vector_mult_vf);
2868 /* Update IVs of original loop as if they were advanced by
2869 niters_vector_mult_vf steps. */
2870 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2871 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2872 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2873 update_e);
2875 if (skip_epilog)
2877 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2878 niters, niters_vector_mult_vf);
2879 guard_bb = single_exit (loop)->dest;
2880 guard_to = split_edge (single_exit (epilog));
2881 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2882 skip_vector ? anchor : guard_bb,
2883 prob_epilog.invert (),
2884 irred_flag);
2885 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2886 single_exit (epilog));
2887 /* Only need to handle basic block before epilog loop if it's not
2888 the guard_bb, which is the case when skip_vector is true. */
2889 if (guard_bb != bb_before_epilog)
2891 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2893 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2895 scale_loop_profile (epilog, prob_epilog, 0);
2897 else
2898 slpeel_update_phi_nodes_for_lcssa (epilog);
2900 unsigned HOST_WIDE_INT bound;
2901 if (bound_scalar.is_constant (&bound))
2903 gcc_assert (bound != 0);
2904 /* -1 to convert loop iterations to latch iterations. */
2905 record_niter_bound (epilog, bound - 1, false, true);
2908 delete_update_ssa ();
2909 adjust_vec_debug_stmts ();
2910 scev_reset ();
2913 if (vect_epilogues)
2915 epilog->aux = epilogue_vinfo;
2916 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2918 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2920 /* We now must calculate the number of iterations (NITERS) performed by the
2921 previous loop and the number (EPILOGUE_NITERS) to be performed by the epilogue. */
2922 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2923 niters_prolog, niters_vector_mult_vf);
2925 /* If skip_vector we may skip the previous loop; we insert a phi-node to
2926 determine whether we are coming from the previous vectorized loop
2927 using the update_e edge or from the skip_vector basic block using the
2928 skip_e edge. */
2929 if (skip_vector)
2931 gcc_assert (update_e != NULL && skip_e != NULL);
2932 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2933 update_e->dest);
2934 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2935 gimple *stmt = gimple_build_assign (new_ssa, niters);
2936 gimple_stmt_iterator gsi;
2937 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2938 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2940 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2941 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2943 else
2945 gsi = gsi_last_bb (update_e->src);
2946 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2949 niters = new_ssa;
2950 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2951 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2952 UNKNOWN_LOCATION);
2953 niters = PHI_RESULT (new_phi);
2956 /* Subtract the number of iterations performed by the vectorized loop
2957 from the number of total iterations. */
2958 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2959 before_loop_niters,
2960 niters);
2962 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2963 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2964 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2965 epilogue_niters,
2966 build_one_cst (TREE_TYPE (epilogue_niters)));
2968 /* Set ADVANCE to the number of iterations performed by the previous
2969 loop and its prologue. */
2970 *advance = niters;
2972 /* Redo the peeling for niter analysis as the NITERs and alignment
2973 may have been updated to take the main loop into account. */
2974 determine_peel_for_niter (epilogue_vinfo);
2977 adjust_vec.release ();
2978 free_original_copy_tables ();
2980 return vect_epilogues ? epilog : NULL;
2983 /* Function vect_create_cond_for_niters_checks.
2985 Create a conditional expression that represents the run-time checks for
2986 the loop's niters. The loop is guaranteed to terminate if the run-time
2987 checks hold.
2989 Input:
2990 COND_EXPR - input conditional expression. New conditions will be chained
2991 with logical AND operation. If it is NULL, the niters check
2992 becomes the whole condition.
2993 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niters
2994 assumptions to be checked.
2996 Output:
2997 COND_EXPR - conditional expression.
2999 The returned COND_EXPR is the conditional expression to be used in the
3000 if statement that controls which version of the loop gets executed at
3001 runtime. */
3003 static void
3004 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3006 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3008 if (*cond_expr)
3009 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3010 *cond_expr, part_cond_expr);
3011 else
3012 *cond_expr = part_cond_expr;
3015 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3016 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3018 static void
3019 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3021 if (*cond_expr)
3022 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3023 *cond_expr, part_cond_expr);
3024 else
3025 *cond_expr = part_cond_expr;
3028 /* Function vect_create_cond_for_align_checks.
3030 Create a conditional expression that represents the alignment checks for
3031 all of the data references (array element references) whose alignment must be
3032 checked at runtime.
3034 Input:
3035 COND_EXPR - input conditional expression. New conditions will be chained
3036 with logical AND operation.
3037 LOOP_VINFO - two fields of the loop information are used.
3038 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3039 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3041 Output:
3042 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3043 expression.
3044 The returned value is the conditional expression to be used in the if
3045 statement that controls which version of the loop gets executed at runtime.
3047 The algorithm makes two assumptions:
3048 1) The number of bytes "n" in a vector is a power of 2.
3049 2) An address "a" is aligned if a%n is zero and this
3050 test can be done as a&(n-1) == 0. For example, for 16
3051 byte vectors the test is a&0xf == 0. */
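/* For instance (hypothetical values): with two such data references and a
   16-byte alignment requirement, mask = 0xf and the code below builds
   cond = (((int) addr_1 | (int) addr_2) & 0xf) == 0, which is true only
   when both addresses are 16-byte aligned.  */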
3053 static void
3054 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3055 tree *cond_expr,
3056 gimple_seq *cond_expr_stmt_list)
3058 vec<stmt_vec_info> may_misalign_stmts
3059 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3060 stmt_vec_info stmt_info;
3061 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3062 tree mask_cst;
3063 unsigned int i;
3064 tree int_ptrsize_type;
3065 char tmp_name[20];
3066 tree or_tmp_name = NULL_TREE;
3067 tree and_tmp_name;
3068 gimple *and_stmt;
3069 tree ptrsize_zero;
3070 tree part_cond_expr;
3072 /* Check that mask is one less than a power of 2, i.e., mask is
3073 all zeros followed by all ones. */
3074 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3076 int_ptrsize_type = signed_type_for (ptr_type_node);
3078 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3079 of the first vector of the i'th data reference. */
3081 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3083 gimple_seq new_stmt_list = NULL;
3084 tree addr_base;
3085 tree addr_tmp_name;
3086 tree new_or_tmp_name;
3087 gimple *addr_stmt, *or_stmt;
3088 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3089 bool negative = tree_int_cst_compare
3090 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3091 tree offset = negative
3092 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3094 /* create: addr_tmp = (int)(address_of_first_vector) */
3095 addr_base =
3096 vect_create_addr_base_for_vector_ref (loop_vinfo,
3097 stmt_info, &new_stmt_list,
3098 offset);
3099 if (new_stmt_list != NULL)
3100 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3102 sprintf (tmp_name, "addr2int%d", i);
3103 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3104 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3105 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3107 /* The addresses are ORed together. */
3109 if (or_tmp_name != NULL_TREE)
3111 /* create: or_tmp = or_tmp | addr_tmp */
3112 sprintf (tmp_name, "orptrs%d", i);
3113 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3114 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3115 or_tmp_name, addr_tmp_name);
3116 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3117 or_tmp_name = new_or_tmp_name;
3119 else
3120 or_tmp_name = addr_tmp_name;
3122 } /* end for i */
3124 mask_cst = build_int_cst (int_ptrsize_type, mask);
3126 /* create: and_tmp = or_tmp & mask */
3127 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3129 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3130 or_tmp_name, mask_cst);
3131 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3133 /* Make and_tmp the left operand of the conditional test against zero.
3134 If and_tmp has a nonzero bit then some address is unaligned. */
3135 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3136 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3137 and_tmp_name, ptrsize_zero);
3138 chain_cond_expr (cond_expr, part_cond_expr);
3141 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3142 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3143 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3144 and this new condition are true. Treat a null *COND_EXPR as "true". */
3146 static void
3147 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3149 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3150 unsigned int i;
3151 vec_object_pair *pair;
3152 FOR_EACH_VEC_ELT (pairs, i, pair)
3154 tree addr1 = build_fold_addr_expr (pair->first);
3155 tree addr2 = build_fold_addr_expr (pair->second);
3156 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3157 addr1, addr2);
3158 chain_cond_expr (cond_expr, part_cond_expr);
3162 /* Create an expression that is true when all lower-bound conditions for
3163 the vectorized loop are met. Chain this condition with *COND_EXPR. */
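/* A sketch of the signed case below, with illustrative numbers: for a signed
   EXPR whose required minimum magnitude is 4, the emitted check is
   (unsigned) EXPR + 3 >= 7, which holds exactly when EXPR >= 4 or EXPR <= -4,
   i.e. when |EXPR| >= 4, using a single unsigned comparison.  */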
3165 static void
3166 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3168 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3169 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3171 tree expr = lower_bounds[i].expr;
3172 tree type = unsigned_type_for (TREE_TYPE (expr));
3173 expr = fold_convert (type, expr);
3174 poly_uint64 bound = lower_bounds[i].min_value;
3175 if (!lower_bounds[i].unsigned_p)
3177 expr = fold_build2 (PLUS_EXPR, type, expr,
3178 build_int_cstu (type, bound - 1));
3179 bound += bound - 1;
3181 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3182 build_int_cstu (type, bound));
3183 chain_cond_expr (cond_expr, part_cond_expr);
3187 /* Function vect_create_cond_for_alias_checks.
3189 Create a conditional expression that represents the run-time checks for
3190 overlapping of address ranges represented by a list of data references
3191 relations passed as input.
3193 Input:
3194 COND_EXPR - input conditional expression. New conditions will be chained
3195 with logical AND operation. If it is NULL, then the function
3196 is used to return the number of alias checks.
3197 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3198 to be checked.
3200 Output:
3201 COND_EXPR - conditional expression.
3203 The returned COND_EXPR is the conditional expression to be used in the if
3204 statement that controls which version of the loop gets executed at runtime.
3207 void
3208 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3210 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3211 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3213 if (comp_alias_ddrs.is_empty ())
3214 return;
3216 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3217 &comp_alias_ddrs, cond_expr);
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "created %u versioning for alias checks.\n",
3221 comp_alias_ddrs.length ());
3225 /* Function vect_loop_versioning.
3227 If the loop has data references that may or may not be aligned and/or
3228 has data reference relations whose independence was not proven, then
3229 two versions of the loop need to be generated, one which is vectorized
3230 and one which isn't. A test is then generated to control which of the
3231 loops is executed. The test checks the alignment of all of the
3232 data references that may or may not be aligned. An additional
3233 sequence of runtime tests is generated for each pair of DDRs whose
3234 independence was not proven. The vectorized version of the loop is
3235 executed only if both the alias and alignment tests pass.
3237 The test generated to check which version of the loop is executed
3238 is modified to also check for profitability as indicated by the
3239 cost model threshold TH.
3241 The versioning precondition(s) are placed in *COND_EXPR and
3242 *COND_EXPR_STMT_LIST. */
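/* Informally, the versioning condition assembled below has the shape
   (illustrative only; which terms appear depends on LOOP_VINFO):
     cond = niters >= threshold
            && niters assumptions hold
            && alignment checks pass
            && unequal-address, lower-bound and alias checks pass
            && simd-if condition is nonzero
   and the vectorized loop is entered only when cond is true.  */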
3244 class loop *
3245 vect_loop_versioning (loop_vec_info loop_vinfo,
3246 gimple *loop_vectorized_call)
3248 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3249 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3250 basic_block condition_bb;
3251 gphi_iterator gsi;
3252 gimple_stmt_iterator cond_exp_gsi;
3253 basic_block merge_bb;
3254 basic_block new_exit_bb;
3255 edge new_exit_e, e;
3256 gphi *orig_phi, *new_phi;
3257 tree cond_expr = NULL_TREE;
3258 gimple_seq cond_expr_stmt_list = NULL;
3259 tree arg;
3260 profile_probability prob = profile_probability::likely ();
3261 gimple_seq gimplify_stmt_list = NULL;
3262 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3263 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3264 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3265 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3266 poly_uint64 versioning_threshold
3267 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3268 tree version_simd_if_cond
3269 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3270 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                             build_int_cst (TREE_TYPE (scalar_loop_iters),
                                            th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                               build_int_cst (TREE_TYPE (scalar_loop_iters),
                                              versioning_threshold - 1));
      if (cond_expr)
        cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
                                 expr, cond_expr);
      else
        cond_expr = expr;
    }
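
  /* Chain in the runtime checks for the niter assumptions that could
     not be proven during analysis.  */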
  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                        &cond_expr_stmt_list,
                                        is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }
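
  /* If the loop carries a simd if-condition (e.g. from an OpenMP "if"
     clause on a simd construct), also require it to be nonzero at
     runtime.  */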
  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      if (flag_checking)
        if (basic_block bb
            = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
          gcc_assert (bb != loop->header
                      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
                      && (scalar_loop == NULL
                          || (bb != scalar_loop->header
                              && dominated_by_p (CDI_DOMINATORS,
                                                 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
                            version_simd_if_cond, zero);
      if (cond_expr)
        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                 c, cond_expr);
      else
        cond_expr = c;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "created versioning for simd if condition check.\n");
    }
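
  /* Gimplify the combined condition; any statements needed to compute
     it are collected into COND_EXPR_STMT_LIST.  */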
  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                      &gimplify_stmt_list,
                                      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
        if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
            && flow_bb_inside_loop_p (outermost, def_bb))
          outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "trying to apply versioning to outer loop %d\n",
                         outermost->num);
      if (outermost->num == 0)
        outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
             && single_exit (loop_outer (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || vect_loop_vectorized_call (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || !loop_outer (loop_to_version)->inner->next->next))
        loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (last_stmt (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* If-conversion uses profile_probability::always () for both paths;
         reset the probabilities of the two edges appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale the loop's counts immediately but have to postpone
         scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "reusing %sloop version created by if conversion\n",
                         loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
          && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "applying loop versioning to outer loop %d\n",
                         loop_to_version->num);
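
      /* No scalar copy to reuse: duplicate LOOP_TO_VERSION with
         loop_version, guarding the two copies with COND_EXPR.  */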
      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
                            prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      nloop = get_loop_copy (loop);

      /* Kill off the IFN_LOOP_VECTORIZED calls in the copy, nobody will
         reap those otherwise; they also refer to the original
         loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
        {
          call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
          fold_loop_internal_call (call, boolean_false_node);
          l = loop_outer (l);
        }
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_last_bb (condition_bb);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* Loop versioning violates an assumption we try to maintain during
         vectorization - that the loop exit block has a single predecessor.
         After versioning, the exit block of both loop versions is the same
         basic block (i.e. it has two predecessors).  To simplify the
         following transformations in the vectorizer, we fix this situation
         here by adding a new (empty) block on the exit-edge of the loop,
         with the proper loop-exit phis to maintain loop-closed-form.
         If loop versioning wasn't done from loop, but from scalar_loop
         instead, merge_bb will already have just a single predecessor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
        {
          gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
          new_exit_bb = split_edge (single_exit (loop_to_version));
          new_exit_e = single_exit (loop_to_version);
          e = EDGE_SUCC (new_exit_bb, 0);

          for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              tree new_res;
              orig_phi = gsi.phi ();
              new_res = copy_ssa_name (PHI_RESULT (orig_phi));
              new_phi = create_phi_node (new_res, new_exit_bb);
              arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
              add_phi_arg (new_phi, arg, new_exit_e,
                           gimple_phi_arg_location_from_edge (orig_phi, e));
              adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
            }
        }

      update_ssa (TODO_update_ssa);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, so we need to clear the
         existing niter information which was copied from the original
         loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for the niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }
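
  /* Emit user-facing opt-info notes explaining why the loop was
     versioned.  */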
  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization because of "
                         "possible aliasing\n");
      if (version_align)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization to enhance "
                         "alignment\n");
    }

  return nloop;
}