gcc/tree-vect-loop-manip.c

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53
  54 /*************************************************************************
  55   Simple Loop Peeling Utilities
  56
  57   Utilities to support loop peeling for vectorization purposes.
  58  *************************************************************************/
  59
  60
  61 /* Renames the use *OP_P.  */
  62
  63 static void
  64 rename_use_op (use_operand_p op_p)
  65 {
  66   tree new_name;
  67
  68   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  69     return;
  70
  71   new_name = get_current_def (USE_FROM_PTR (op_p));
  72
  73   /* Something defined outside of the loop.  */
  74   if (!new_name)
  75     return;
  76
  77   /* An ordinary ssa name defined in the loop.  */
  78
  79   SET_USE (op_p, new_name);
  80 }
  81
  82
  83 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  84    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  85    true.  */
  86
  87 static void
  88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  89 {
  90   gimple *stmt;
  91   use_operand_p use_p;
  92   ssa_op_iter iter;
  93   edge e;
  94   edge_iterator ei;
  95   class loop *loop = bb->loop_father;
  96   class loop *outer_loop = NULL;
  97
  98   if (rename_from_outer_loop)
  99     {
 100       gcc_assert (loop);
 101       outer_loop = loop_outer (loop);
 102     }
 103
 104   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 105        gsi_next (&gsi))
 106     {
 107       stmt = gsi_stmt (gsi);
 108       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 109         rename_use_op (use_p);
 110     }
 111
 112   FOR_EACH_EDGE (e, ei, bb->preds)
 113     {
 114       if (!flow_bb_inside_loop_p (loop, e->src))
 115         {
 116           if (!rename_from_outer_loop)
 117             continue;
 118           if (e->src != outer_loop->header)
 119             {
 120               if (outer_loop->inner->next)
 121                 {
 122                   /* If outer_loop has 2 inner loops, allow there to
 123                      be an extra basic block which decides which of the
 124                      two loops to use using LOOP_VECTORIZED.  */
 125                   if (!single_pred_p (e->src)
 126                       || single_pred (e->src) != outer_loop->header)
 127                     continue;
 128                 }
 129             }
 130         }
 131       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 132            gsi_next (&gsi))
 133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 134     }
 135 }
 136
 137
 138 struct adjust_info
 139 {
 140   tree from, to;
 141   basic_block bb;
 142 };
 143
 144 /* A stack of values to be adjusted in debug stmts.  We have to
 145    process them LIFO, so that the closest substitution applies.  If we
 146    processed them FIFO, without the stack, we might substitute uses
 147    with a PHI DEF that would soon become non-dominant, and when we got
 148    to the suitable one, it wouldn't have anything to substitute any
 149    more.  */
 150 static vec<adjust_info, va_heap> adjust_vec;
 151
 152 /* Adjust any debug stmts that referenced AI->from values to use the
 153    loop-closed AI->to, if the references are dominated by AI->bb and
 154    not by the definition of AI->from.  */
 155
 156 static void
 157 adjust_debug_stmts_now (adjust_info *ai)
 158 {
 159   basic_block bbphi = ai->bb;
 160   tree orig_def = ai->from;
 161   tree new_def = ai->to;
 162   imm_use_iterator imm_iter;
 163   gimple *stmt;
 164   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 165
 166   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 167
 168   /* Adjust any debug stmts that held onto non-loop-closed
 169      references.  */
 170   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 171     {
 172       use_operand_p use_p;
 173       basic_block bbuse;
 174
 175       if (!is_gimple_debug (stmt))
 176         continue;
 177
 178       gcc_assert (gimple_debug_bind_p (stmt));
 179
 180       bbuse = gimple_bb (stmt);
 181
 182       if ((bbuse == bbphi
 183            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 184           && !(bbuse == bbdef
 185                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 186         {
 187           if (new_def)
 188             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 189               SET_USE (use_p, new_def);
 190           else
 191             {
 192               gimple_debug_bind_reset_value (stmt);
 193               update_stmt (stmt);
 194             }
 195         }
 196     }
 197 }
 198
 199 /* Adjust debug stmts as scheduled before.  */
 200
 201 static void
 202 adjust_vec_debug_stmts (void)
 203 {
 204   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 205     return;
 206
 207   gcc_assert (adjust_vec.exists ());
 208
 209   while (!adjust_vec.is_empty ())
 210     {
 211       adjust_debug_stmts_now (&adjust_vec.last ());
 212       adjust_vec.pop ();
 213     }
 214 }
 215
 216 /* Adjust any debug stmts that referenced FROM values to use the
 217    loop-closed TO, if the references are dominated by BB and not by
 218    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 219    will be postponed until adjust_vec_debug_stmts is called.  */
 220
 221 static void
 222 adjust_debug_stmts (tree from, tree to, basic_block bb)
 223 {
 224   adjust_info ai;
 225
 226   if (MAY_HAVE_DEBUG_BIND_STMTS
 227       && TREE_CODE (from) == SSA_NAME
 228       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 229       && ! virtual_operand_p (from))
 230     {
 231       ai.from = from;
 232       ai.to = to;
 233       ai.bb = bb;
 234
 235       if (adjust_vec.exists ())
 236         adjust_vec.safe_push (ai);
 237       else
 238         adjust_debug_stmts_now (&ai);
 239     }
 240 }
 241
 242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 243    to adjust any debug stmts that referenced the old phi arg,
 244    presumably non-loop-closed references left over from other
 245    transformations.  */
 246
 247 static void
 248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 249 {
 250   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 251
 252   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 253
 254   if (MAY_HAVE_DEBUG_BIND_STMTS)
 255     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 256                         gimple_bb (update_phi));
 257 }
 258
 259 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 260    that the control should have during the first iteration and NEXT_CTRL is the
 261    value that it should have on subsequent iterations.  */
 262
 263 static void
 264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 265                        tree next_ctrl)
 266 {
 267   gphi *phi = create_phi_node (ctrl, loop->header);
 268   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 269   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 270 }
 271
 272 /* Add SEQ to the end of LOOP's preheader block.  */
 273
 274 static void
 275 add_preheader_seq (class loop *loop, gimple_seq seq)
 276 {
 277   if (seq)
 278     {
 279       edge pe = loop_preheader_edge (loop);
 280       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 281       gcc_assert (!new_bb);
 282     }
 283 }
 284
 285 /* Add SEQ to the beginning of LOOP's header block.  */
 286
 287 static void
 288 add_header_seq (class loop *loop, gimple_seq seq)
 289 {
 290   if (seq)
 291     {
 292       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 293       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 294     }
 295 }
 296
 297 /* Return true if the target can interleave elements of two vectors.
 298    OFFSET is 0 if the first half of the vectors should be interleaved
 299    or 1 if the second half should.  When returning true, store the
 300    associated permutation in INDICES.  */
 301
 302 static bool
 303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 304                         unsigned int offset)
 305 {
 306   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 307   poly_uint64 base = exact_div (nelts, 2) * offset;
 308   vec_perm_builder sel (nelts, 2, 3);
 309   for (unsigned int i = 0; i < 3; ++i)
 310     {
 311       sel.quick_push (base + i);
 312       sel.quick_push (base + i + nelts);
 313     }
 314   indices->new_vector (sel, 2, nelts);
 315   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 316 }
 317
 318 /* Try to use permutes to define the masks in DEST_RGM using the masks
 319    in SRC_RGM, given that the former has twice as many masks as the
 320    latter.  Return true on success, adding any new statements to SEQ.  */
 321
 322 static bool
 323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 324                                rgroup_controls *src_rgm)
 325 {
 326   tree src_masktype = src_rgm->type;
 327   tree dest_masktype = dest_rgm->type;
 328   machine_mode src_mode = TYPE_MODE (src_masktype);
 329   insn_code icode1, icode2;
 330   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 331       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 332                                   src_mode)) != CODE_FOR_nothing
 333       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 334                                   src_mode)) != CODE_FOR_nothing)
 335     {
 336       /* Unpacking the source masks gives at least as many mask bits as
 337          we need.  We can then VIEW_CONVERT any excess bits away.  */
 338       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 339       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 340       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 341       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 342         {
 343           tree src = src_rgm->controls[i / 2];
 344           tree dest = dest_rgm->controls[i];
 345           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 346                             ? VEC_UNPACK_HI_EXPR
 347                             : VEC_UNPACK_LO_EXPR);
 348           gassign *stmt;
 349           if (dest_masktype == unpack_masktype)
 350             stmt = gimple_build_assign (dest, code, src);
 351           else
 352             {
 353               tree temp = make_ssa_name (unpack_masktype);
 354               stmt = gimple_build_assign (temp, code, src);
 355               gimple_seq_add_stmt (seq, stmt);
 356               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 357                                           build1 (VIEW_CONVERT_EXPR,
 358                                                   dest_masktype, temp));
 359             }
 360           gimple_seq_add_stmt (seq, stmt);
 361         }
 362       return true;
 363     }
 364   vec_perm_indices indices[2];
 365   if (dest_masktype == src_masktype
 366       && interleave_supported_p (&indices[0], src_masktype, 0)
 367       && interleave_supported_p (&indices[1], src_masktype, 1))
 368     {
 369       /* The destination requires twice as many mask bits as the source, so
 370          we can use interleaving permutes to double up the number of bits.  */
 371       tree masks[2];
 372       for (unsigned int i = 0; i < 2; ++i)
 373         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 374       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 375         {
 376           tree src = src_rgm->controls[i / 2];
 377           tree dest = dest_rgm->controls[i];
 378           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 379                                               src, src, masks[i & 1]);
 380           gimple_seq_add_stmt (seq, stmt);
 381         }
 382       return true;
 383     }
 384   return false;
 385 }
 386
 387 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 388    for all the rgroup controls in RGC and return a control that is nonzero
 389    when the loop needs to iterate.  Add any new preheader statements to
 390    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
 391
 392    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 393    times and has been vectorized according to LOOP_VINFO.
 394
 395    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 396    starts with NITERS_SKIP dummy iterations of the scalar loop before
 397    the real work starts.  The mask elements for these dummy iterations
 398    must be 0, to ensure that the extra iterations do not have an effect.
 399
 400    It is known that:
 401
 402      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 403
 404    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 405    variable that starts at 0 and has step:
 406
 407      VF * RGC->max_nscalars_per_iter * RGC->factor
 408
 409    might overflow before hitting a value above:
 410
 411      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 412
 413    This means that we cannot guarantee that such an induction variable
 414    would ever hit a value that produces a set of all-false masks or zero
 415    lengths for RGC.
 416
 417    Note: the cost of the code generated by this function is modeled
 418    by vect_estimate_min_profitable_iters, so changes here may need
 419    corresponding changes there.  */
 420
 421 static tree
 422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 423                                  gimple_seq *preheader_seq,
 424                                  gimple_stmt_iterator loop_cond_gsi,
 425                                  rgroup_controls *rgc, tree niters,
 426                                  tree niters_skip, bool might_wrap_p)
 427 {
 428   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 429   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 430   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 431
 432   tree ctrl_type = rgc->type;
 433   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 434   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 435   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 436   tree length_limit = NULL_TREE;
 437   /* For length, we need length_limit to ensure length in range.  */
 438   if (!use_masks_p)
 439     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 440
 441   /* Calculate the maximum number of item values that the rgroup
 442      handles in total, the number that it handles for each iteration
 443      of the vector loop, and the number that it should skip during the
 444      first iteration of the vector loop.  */
 445   tree nitems_total = niters;
 446   tree nitems_step = build_int_cst (iv_type, vf);
 447   tree nitems_skip = niters_skip;
 448   if (nitems_per_iter != 1)
 449     {
 450       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 451          these multiplications don't overflow.  */
 452       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 453       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 454       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 455                                    nitems_total, compare_factor);
 456       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 457                                   nitems_step, iv_factor);
 458       if (nitems_skip)
 459         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 460                                     nitems_skip, compare_factor);
 461     }
 462
 463   /* Create an induction variable that counts the number of items
 464      processed.  */
 465   tree index_before_incr, index_after_incr;
 466   gimple_stmt_iterator incr_gsi;
 467   bool insert_after;
 468   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 469   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
 470              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 471
 472   tree zero_index = build_int_cst (compare_type, 0);
 473   tree test_index, test_limit, first_limit;
 474   gimple_stmt_iterator *test_gsi;
 475   if (might_wrap_p)
 476     {
 477       /* In principle the loop should stop iterating once the incremented
 478          IV reaches a value greater than or equal to:
 479
 480            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 481
 482          However, there's no guarantee that this addition doesn't overflow
 483          the comparison type, or that the IV hits a value above it before
 484          wrapping around.  We therefore adjust the limit down by one
 485          IV step:
 486
 487            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 488            -[infinite-prec] NITEMS_STEP
 489
 490          and compare the IV against this limit _before_ incrementing it.
 491          Since the comparison type is unsigned, we actually want the
 492          subtraction to saturate at zero:
 493
 494            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 495            -[sat] NITEMS_STEP
 496
 497          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 498
 499            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 500
 501          where the rightmost subtraction can be done directly in
 502          COMPARE_TYPE.  */
 503       test_index = index_before_incr;
 504       tree adjust = gimple_convert (preheader_seq, compare_type,
 505                                     nitems_step);
 506       if (nitems_skip)
 507         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 508                                adjust, nitems_skip);
 509       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 510                                  nitems_total, adjust);
 511       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 512                                  test_limit, adjust);
 513       test_gsi = &incr_gsi;
 514
 515       /* Get a safe limit for the first iteration.  */
 516       if (nitems_skip)
 517         {
 518           /* The first vector iteration can handle at most NITEMS_STEP
 519              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 520              NITEMS_SKIP to that cannot overflow.  */
 521           tree const_limit = build_int_cst (compare_type,
 522                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 523                                             * nitems_per_iter);
 524           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 525                                       nitems_total, const_limit);
 526           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 527                                       first_limit, nitems_skip);
 528         }
 529       else
 530         /* For the first iteration it doesn't matter whether the IV hits
 531            a value above NITEMS_TOTAL.  That only matters for the latch
 532            condition.  */
 533         first_limit = nitems_total;
 534     }
 535   else
 536     {
 537       /* Test the incremented IV, which will always hit a value above
 538          the bound before wrapping.  */
 539       test_index = index_after_incr;
 540       test_limit = nitems_total;
 541       if (nitems_skip)
 542         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 543                                    test_limit, nitems_skip);
 544       test_gsi = &loop_cond_gsi;
 545
 546       first_limit = test_limit;
 547     }
 548
 549   /* Convert the IV value to the comparison type (either a no-op or
 550      a demotion).  */
 551   gimple_seq test_seq = NULL;
 552   test_index = gimple_convert (&test_seq, compare_type, test_index);
 553   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 554
 555   /* Provide a definition of each control in the group.  */
 556   tree next_ctrl = NULL_TREE;
 557   tree ctrl;
 558   unsigned int i;
 559   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 560     {
 561       /* Previous controls will cover BIAS items.  This control covers the
 562          next batch.  */
 563       poly_uint64 bias = nitems_per_ctrl * i;
 564       tree bias_tree = build_int_cst (compare_type, bias);
 565
 566       /* See whether the first iteration of the vector loop is known
 567          to have a full control.  */
 568       poly_uint64 const_limit;
 569       bool first_iteration_full
 570         = (poly_int_tree_p (first_limit, &const_limit)
 571            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 572
 573       /* Rather than have a new IV that starts at BIAS and goes up to
 574          TEST_LIMIT, prefer to use the same 0-based IV for each control
 575          and adjust the bound down by BIAS.  */
 576       tree this_test_limit = test_limit;
 577       if (i != 0)
 578         {
 579           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 580                                           compare_type, this_test_limit,
 581                                           bias_tree);
 582           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 583                                           compare_type, this_test_limit,
 584                                           bias_tree);
 585         }
 586
 587       /* Create the initial control.  First include all items that
 588          are within the loop limit.  */
 589       tree init_ctrl = NULL_TREE;
 590       if (!first_iteration_full)
 591         {
 592           tree start, end;
 593           if (first_limit == test_limit)
 594             {
 595               /* Use a natural test between zero (the initial IV value)
 596                  and the loop limit.  The "else" block would be valid too,
 597                  but this choice can avoid the need to load BIAS_TREE into
 598                  a register.  */
 599               start = zero_index;
 600               end = this_test_limit;
 601             }
 602           else
 603             {
 604               /* FIRST_LIMIT is the maximum number of items handled by the
 605                  first iteration of the vector loop.  Test the portion
 606                  associated with this control.  */
 607               start = bias_tree;
 608               end = first_limit;
 609             }
 610
 611           if (use_masks_p)
 612             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 613                                         start, end, "max_mask");
 614           else
 615             {
 616               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 617               gimple_seq seq = vect_gen_len (init_ctrl, start,
 618                                              end, length_limit);
 619               gimple_seq_add_seq (preheader_seq, seq);
 620             }
 621         }
 622
 623       /* Now AND out the bits that are within the number of skipped
 624          items.  */
 625       poly_uint64 const_skip;
 626       if (nitems_skip
 627           && !(poly_int_tree_p (nitems_skip, &const_skip)
 628                && known_le (const_skip, bias)))
 629         {
 630           gcc_assert (use_masks_p);
 631           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 632                                                     bias_tree, nitems_skip);
 633           if (init_ctrl)
 634             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 635                                       init_ctrl, unskipped_mask);
 636           else
 637             init_ctrl = unskipped_mask;
 638         }
 639
 640       if (!init_ctrl)
 641         {
 642           /* First iteration is full.  */
 643           if (use_masks_p)
 644             init_ctrl = build_minus_one_cst (ctrl_type);
 645           else
 646             init_ctrl = length_limit;
 647         }
 648
 649       /* Get the control value for the next iteration of the loop.  */
 650       if (use_masks_p)
 651         {
 652           gimple_seq stmts = NULL;
 653           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 654                                       this_test_limit, "next_mask");
 655           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 656         }
 657       else
 658         {
 659           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 660           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 661                                          length_limit);
 662           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 663         }
 664
 665       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 666     }
 667   return next_ctrl;
 668 }
 669
 670 /* Set up the iteration condition and rgroup controls for LOOP, given
 671    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 672    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 673    the number of iterations of the original scalar loop that should be
 674    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 675    for vect_set_loop_condition.
 676
 677    Insert the branch-back condition before LOOP_COND_GSI and return the
 678    final gcond.  */
 679
 680 static gcond *
 681 vect_set_loop_condition_partial_vectors (class loop *loop,
 682                                          loop_vec_info loop_vinfo, tree niters,
 683                                          tree final_iv, bool niters_maybe_zero,
 684                                          gimple_stmt_iterator loop_cond_gsi)
 685 {
 686   gimple_seq preheader_seq = NULL;
 687   gimple_seq header_seq = NULL;
 688
 689   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 690   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 691   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 692   tree orig_niters = niters;
 693
 694   /* Type of the initial value of NITERS.  */
 695   tree ni_actual_type = TREE_TYPE (niters);
 696   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 697   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 698
 699   /* Convert NITERS to the same size as the compare.  */
 700   if (compare_precision > ni_actual_precision
 701       && niters_maybe_zero)
 702     {
 703       /* We know that there is always at least one iteration, so if the
 704          count is zero then it must have wrapped.  Cope with this by
 705          subtracting 1 before the conversion and adding 1 to the result.  */
 706       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 707       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 708                              niters, build_minus_one_cst (ni_actual_type));
 709       niters = gimple_convert (&preheader_seq, compare_type, niters);
 710       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 711                              niters, build_one_cst (compare_type));
 712     }
 713   else
 714     niters = gimple_convert (&preheader_seq, compare_type, niters);
 715
 716   /* Iterate over all the rgroups and fill in their controls.  We could use
 717      the first control from any rgroup for the loop condition; here we
 718      arbitrarily pick the last.  */
 719   tree test_ctrl = NULL_TREE;
 720   rgroup_controls *rgc;
 721   unsigned int i;
 722   auto_vec<rgroup_controls> *controls = use_masks_p
 723                                           ? &LOOP_VINFO_MASKS (loop_vinfo)
 724                                           : &LOOP_VINFO_LENS (loop_vinfo);
 725   FOR_EACH_VEC_ELT (*controls, i, rgc)
 726     if (!rgc->controls.is_empty ())
 727       {
 728         /* First try using permutes.  This adds a single vector
 729            instruction to the loop for each mask, but needs no extra
 730            loop invariants or IVs.  */
 731         unsigned int nmasks = i + 1;
 732         if (use_masks_p && (nmasks & 1) == 0)
 733           {
 734             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 735             if (!half_rgc->controls.is_empty ()
 736                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 737               continue;
 738           }
 739
 740         /* See whether zero-based IV would ever generate all-false masks
 741            or zero length before wrapping around.  */
 742         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 743
 744         /* Set up all controls for this group.  */
 745         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 746                                                      &preheader_seq,
 747                                                      loop_cond_gsi, rgc,
 748                                                      niters, niters_skip,
 749                                                      might_wrap_p);
 750       }
 751
 752   /* Emit all accumulated statements.  */
 753   add_preheader_seq (loop, preheader_seq);
 754   add_header_seq (loop, header_seq);
 755
 756   /* Get a boolean result that tells us whether to iterate.  */
 757   edge exit_edge = single_exit (loop);
 758   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 759   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 760   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
 761                                         NULL_TREE, NULL_TREE);
 762   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 763
 764   /* The loop iterates (NITERS - 1) / VF + 1 times.
 765      Subtract one from this to get the latch count.  */
 766   tree step = build_int_cst (compare_type,
 767                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 768   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 769                                        build_minus_one_cst (compare_type));
 770   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 771                                      niters_minus_one, step);
 772
 773   if (final_iv)
 774     {
 775       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 776       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 777     }
 778
 779   return cond_stmt;
 780 }
 781
 782 /* Like vect_set_loop_condition, but handle the case in which the vector
 783    loop handles exactly VF scalars per iteration.  */
 784
 785 static gcond *
 786 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
 787                                 tree final_iv, bool niters_maybe_zero,
 788                                 gimple_stmt_iterator loop_cond_gsi)
 789 {
 790   tree indx_before_incr, indx_after_incr;
 791   gcond *cond_stmt;
 792   gcond *orig_cond;
 793   edge pe = loop_preheader_edge (loop);
 794   edge exit_edge = single_exit (loop);
 795   gimple_stmt_iterator incr_gsi;
 796   bool insert_after;
 797   enum tree_code code;
 798   tree niters_type = TREE_TYPE (niters);
 799
 800   orig_cond = get_loop_exit_condition (loop);
 801   gcc_assert (orig_cond);
 802   loop_cond_gsi = gsi_for_stmt (orig_cond);
 803
 804   tree init, limit;
 805   if (!niters_maybe_zero && integer_onep (step))
 806     {
 807       /* In this case we can use a simple 0-based IV:
 808
 809          A:
 810            x = 0;
 811            do
 812              {
 813                ...
 814                x += 1;
 815              }
 816            while (x < NITERS);  */
 817       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 818       init = build_zero_cst (niters_type);
 819       limit = niters;
 820     }
 821   else
 822     {
 823       /* The following works for all values of NITERS except 0:
 824
 825          B:
 826            x = 0;
 827            do
 828              {
 829                ...
 830                x += STEP;
 831              }
 832            while (x <= NITERS - STEP);
 833
 834          so that the loop continues to iterate if x + STEP - 1 < NITERS
 835          but stops if x + STEP - 1 >= NITERS.
 836
 837          However, if NITERS is zero, x never hits a value above NITERS - STEP
 838          before wrapping around.  There are two obvious ways of dealing with
 839          this:
 840
 841          - start at STEP - 1 and compare x before incrementing it
 842          - start at -1 and compare x after incrementing it
 843
 844          The latter is simpler and is what we use.  The loop in this case
 845          looks like:
 846
 847          C:
 848            x = -1;
 849            do
 850              {
 851                ...
 852                x += STEP;
 853              }
 854            while (x < NITERS - STEP);
 855
 856          In both cases the loop limit is NITERS - STEP.  */
 857       gimple_seq seq = NULL;
 858       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 859       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 860       if (seq)
 861         {
 862           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 863           gcc_assert (!new_bb);
 864         }
 865       if (niters_maybe_zero)
 866         {
 867           /* Case C.  */
 868           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 869           init = build_all_ones_cst (niters_type);
 870         }
 871       else
 872         {
 873           /* Case B.  */
 874           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 875           init = build_zero_cst (niters_type);
 876         }
 877     }
 878
 879   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 880   create_iv (init, step, NULL_TREE, loop,
 881              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 882   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 883                                               true, NULL_TREE, true,
 884                                               GSI_SAME_STMT);
 885   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 886                                      true, GSI_SAME_STMT);
 887
 888   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 889                                  NULL_TREE);
 890
 891   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 892
 893   /* Record the number of latch iterations.  */
 894   if (limit == niters)
 895     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 896        latch count.  */
 897     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 898                                        build_int_cst (niters_type, 1));
 899   else
 900     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 901        Subtract one from this to get the latch count.  */
 902     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 903                                        limit, step);
 904
 905   if (final_iv)
 906     {
 907       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 908                                              indx_after_incr, init);
 909       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 910     }
 911
 912   return cond_stmt;
 913 }
 914
 915 /* If we're using fully-masked loops, make LOOP iterate:
 916
 917       N == (NITERS - 1) / STEP + 1
 918
 919    times.  When NITERS is zero, this is equivalent to making the loop
 920    execute (1 << M) / STEP times, where M is the precision of NITERS.
 921    NITERS_MAYBE_ZERO is true if this last case might occur.
 922
 923    If we're not using fully-masked loops, make LOOP iterate:
 924
 925       N == (NITERS - STEP) / STEP + 1
 926
 927    times, where NITERS is known to be outside the range [1, STEP - 1].
 928    This is equivalent to making the loop execute NITERS / STEP times
 929    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 930    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 931
 932    If FINAL_IV is nonnull, it is an SSA name that should be set to
 933    N * STEP on exit from the loop.
 934
 935    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 936
 937 void
 938 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 939                          tree niters, tree step, tree final_iv,
 940                          bool niters_maybe_zero)
 941 {
 942   gcond *cond_stmt;
 943   gcond *orig_cond = get_loop_exit_condition (loop);
 944   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 945
 946   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
 947     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
 948                                                          niters, final_iv,
 949                                                          niters_maybe_zero,
 950                                                          loop_cond_gsi);
 951   else
 952     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
 953                                                 niters_maybe_zero,
 954                                                 loop_cond_gsi);
 955
 956   /* Remove old loop exit test.  */
 957   stmt_vec_info orig_cond_info;
 958   if (loop_vinfo
 959       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 960     loop_vinfo->remove_stmt (orig_cond_info);
 961   else
 962     gsi_remove (&loop_cond_gsi, true);
 963
 964   if (dump_enabled_p ())
 965     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 966                      cond_stmt);
 967 }
 968
 969 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 970    For all PHI arguments in FROM->dest and TO->dest from those
 971    edges ensure that TO->dest PHI arguments have current_def
 972    to that in from.  */
 973
 974 static void
 975 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 976 {
 977   gimple_stmt_iterator gsi_from, gsi_to;
 978
 979   for (gsi_from = gsi_start_phis (from->dest),
 980        gsi_to = gsi_start_phis (to->dest);
 981        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 982     {
 983       gimple *from_phi = gsi_stmt (gsi_from);
 984       gimple *to_phi = gsi_stmt (gsi_to);
 985       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
 986       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
 987       if (virtual_operand_p (from_arg))
 988         {
 989           gsi_next (&gsi_from);
 990           continue;
 991         }
 992       if (virtual_operand_p (to_arg))
 993         {
 994           gsi_next (&gsi_to);
 995           continue;
 996         }
 997       if (TREE_CODE (from_arg) != SSA_NAME)
 998         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
 999       else if (TREE_CODE (to_arg) == SSA_NAME
1000                && from_arg != to_arg)
1001         {
1002           if (get_current_def (to_arg) == NULL_TREE)
1003             {
1004               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1005                                               TREE_TYPE (get_current_def
1006                                                            (from_arg))));
1007               set_current_def (to_arg, get_current_def (from_arg));
1008             }
1009         }
1010       gsi_next (&gsi_from);
1011       gsi_next (&gsi_to);
1012     }
1013
1014   gphi *from_phi = get_virtual_phi (from->dest);
1015   gphi *to_phi = get_virtual_phi (to->dest);
1016   if (from_phi)
1017     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1018                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1019 }
1020
1021
1022 /* Given LOOP this function generates a new copy of it and puts it
1023    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1024    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1025    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1026    entry or exit of LOOP.  */
1027
1028 class loop *
1029 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1030                                         class loop *scalar_loop, edge e)
1031 {
1032   class loop *new_loop;
1033   basic_block *new_bbs, *bbs, *pbbs;
1034   bool at_exit;
1035   bool was_imm_dom;
1036   basic_block exit_dest;
1037   edge exit, new_exit;
1038   bool duplicate_outer_loop = false;
1039
1040   exit = single_exit (loop);
1041   at_exit = (e == exit);
1042   if (!at_exit && e != loop_preheader_edge (loop))
1043     return NULL;
1044
1045   if (scalar_loop == NULL)
1046     scalar_loop = loop;
1047
1048   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1049   pbbs = bbs + 1;
1050   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1051   /* Allow duplication of outer loops.  */
1052   if (scalar_loop->inner)
1053     duplicate_outer_loop = true;
1054   /* Check whether duplication is possible.  */
1055   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1056     {
1057       free (bbs);
1058       return NULL;
1059     }
1060
1061   /* Generate new loop structure.  */
1062   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1063   duplicate_subloops (scalar_loop, new_loop);
1064
1065   exit_dest = exit->dest;
1066   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1067                                           exit_dest) == loop->header ?
1068                  true : false);
1069
1070   /* Also copy the pre-header, this avoids jumping through hoops to
1071      duplicate the loop entry PHI arguments.  Create an empty
1072      pre-header unconditionally for this.  */
1073   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1074   edge entry_e = single_pred_edge (preheader);
1075   bbs[0] = preheader;
1076   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1077
1078   exit = single_exit (scalar_loop);
1079   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1080             &exit, 1, &new_exit, NULL,
1081             at_exit ? loop->latch : e->src, true);
1082   exit = single_exit (loop);
1083   basic_block new_preheader = new_bbs[0];
1084
1085   /* Before installing PHI arguments make sure that the edges
1086      into them match that of the scalar loop we analyzed.  This
1087      makes sure the SLP tree matches up between the main vectorized
1088      loop and the epilogue vectorized copies.  */
1089   if (single_succ_edge (preheader)->dest_idx
1090       != single_succ_edge (new_bbs[0])->dest_idx)
1091     {
1092       basic_block swap_bb = new_bbs[1];
1093       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1094       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1095       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1096       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1097     }
1098   if (duplicate_outer_loop)
1099     {
1100       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1101       if (loop_preheader_edge (scalar_loop)->dest_idx
1102           != loop_preheader_edge (new_inner_loop)->dest_idx)
1103         {
1104           basic_block swap_bb = new_inner_loop->header;
1105           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1106           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1107           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1108           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1109         }
1110     }
1111
1112   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1113
1114   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1115   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1116     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1117
1118   if (scalar_loop != loop)
1119     {
1120       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1121          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1122          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1123          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1124          header) to have current_def set, so copy them over.  */
1125       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1126                                                 exit);
1127       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1128                                                            0),
1129                                                 EDGE_SUCC (loop->latch, 0));
1130     }
1131
1132   if (at_exit) /* Add the loop copy at exit.  */
1133     {
1134       if (scalar_loop != loop)
1135         {
1136           gphi_iterator gsi;
1137           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1138
1139           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1140                gsi_next (&gsi))
1141             {
1142               gphi *phi = gsi.phi ();
1143               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1144               location_t orig_locus
1145                 = gimple_phi_arg_location_from_edge (phi, e);
1146
1147               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1148             }
1149         }
1150       redirect_edge_and_branch_force (e, new_preheader);
1151       flush_pending_stmts (e);
1152       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1153       if (was_imm_dom || duplicate_outer_loop)
1154         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1155
1156       /* And remove the non-necessary forwarder again.  Keep the other
1157          one so we have a proper pre-header for the loop at the exit edge.  */
1158       redirect_edge_pred (single_succ_edge (preheader),
1159                           single_pred (preheader));
1160       delete_basic_block (preheader);
1161       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1162                                loop_preheader_edge (scalar_loop)->src);
1163     }
1164   else /* Add the copy at entry.  */
1165     {
1166       if (scalar_loop != loop)
1167         {
1168           /* Remove the non-necessary forwarder of scalar_loop again.  */
1169           redirect_edge_pred (single_succ_edge (preheader),
1170                               single_pred (preheader));
1171           delete_basic_block (preheader);
1172           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1173                                    loop_preheader_edge (scalar_loop)->src);
1174           preheader = split_edge (loop_preheader_edge (loop));
1175           entry_e = single_pred_edge (preheader);
1176         }
1177
1178       redirect_edge_and_branch_force (entry_e, new_preheader);
1179       flush_pending_stmts (entry_e);
1180       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1181
1182       redirect_edge_and_branch_force (new_exit, preheader);
1183       flush_pending_stmts (new_exit);
1184       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1185
1186       /* And remove the non-necessary forwarder again.  Keep the other
1187          one so we have a proper pre-header for the loop at the exit edge.  */
1188       redirect_edge_pred (single_succ_edge (new_preheader),
1189                           single_pred (new_preheader));
1190       delete_basic_block (new_preheader);
1191       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1192                                loop_preheader_edge (new_loop)->src);
1193     }
1194
1195   if (scalar_loop != loop)
1196     {
1197       /* Update new_loop->header PHIs, so that on the preheader
1198          edge they are the ones from loop rather than scalar_loop.  */
1199       gphi_iterator gsi_orig, gsi_new;
1200       edge orig_e = loop_preheader_edge (loop);
1201       edge new_e = loop_preheader_edge (new_loop);
1202
1203       for (gsi_orig = gsi_start_phis (loop->header),
1204            gsi_new = gsi_start_phis (new_loop->header);
1205            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1206            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1207         {
1208           gphi *orig_phi = gsi_orig.phi ();
1209           gphi *new_phi = gsi_new.phi ();
1210           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1211           location_t orig_locus
1212             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1213
1214           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1215         }
1216     }
1217
1218   free (new_bbs);
1219   free (bbs);
1220
1221   checking_verify_dominators (CDI_DOMINATORS);
1222
1223   return new_loop;
1224 }
1225
1226
1227 /* Given the condition expression COND, put it as the last statement of
1228    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1229    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1230    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1231    new edge as irreducible if IRREDUCIBLE_P is true.  */
1232
1233 static edge
1234 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1235                        basic_block guard_to, basic_block dom_bb,
1236                        profile_probability probability, bool irreducible_p)
1237 {
1238   gimple_stmt_iterator gsi;
1239   edge new_e, enter_e;
1240   gcond *cond_stmt;
1241   gimple_seq gimplify_stmt_list = NULL;
1242
1243   enter_e = EDGE_SUCC (guard_bb, 0);
1244   enter_e->flags &= ~EDGE_FALLTHRU;
1245   enter_e->flags |= EDGE_FALSE_VALUE;
1246   gsi = gsi_last_bb (guard_bb);
1247
1248   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1249                                  NULL_TREE);
1250   if (gimplify_stmt_list)
1251     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1252
1253   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1254   gsi = gsi_last_bb (guard_bb);
1255   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1256
1257   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1258   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1259
1260   new_e->probability = probability;
1261   if (irreducible_p)
1262     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1263
1264   enter_e->probability = probability.invert ();
1265   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1266
1267   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1268   if (enter_e->dest->loop_father->header == enter_e->dest)
1269     split_edge (enter_e);
1270
1271   return new_e;
1272 }
1273
1274
1275 /* This function verifies that the following restrictions apply to LOOP:
1276    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1277        for innermost loop and 5 basic blocks for outer-loop.
1278    (2) it is single entry, single exit
1279    (3) its exit condition is the last stmt in the header
1280    (4) E is the entry/exit edge of LOOP.
1281  */
1282
1283 bool
1284 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1285 {
1286   edge exit_e = single_exit (loop);
1287   edge entry_e = loop_preheader_edge (loop);
1288   gcond *orig_cond = get_loop_exit_condition (loop);
1289   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1290   unsigned int num_bb = loop->inner? 5 : 2;
1291
1292   /* All loops have an outer scope; the only case loop->outer is NULL is for
1293      the function itself.  */
1294   if (!loop_outer (loop)
1295       || loop->num_nodes != num_bb
1296       || !empty_block_p (loop->latch)
1297       || !single_exit (loop)
1298       /* Verify that new loop exit condition can be trivially modified.  */
1299       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1300       || (e != exit_e && e != entry_e))
1301     return false;
1302
1303   return true;
1304 }
1305
1306 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1307    in the exit bb and rename all the uses after the loop.  This simplifies
1308    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1309    (but normally loop closed SSA form doesn't require virtual PHIs to be
1310    in the same form).  Doing this early simplifies the checking what
1311    uses should be renamed.
1312
1313    If we create a new phi after the loop, return the definition that
1314    applies on entry to the loop, otherwise return null.  */
1315
1316 static tree
1317 create_lcssa_for_virtual_phi (class loop *loop)
1318 {
1319   gphi_iterator gsi;
1320   edge exit_e = single_exit (loop);
1321
1322   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1323     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1324       {
1325         gphi *phi = gsi.phi ();
1326         for (gsi = gsi_start_phis (exit_e->dest);
1327              !gsi_end_p (gsi); gsi_next (&gsi))
1328           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1329             break;
1330         if (gsi_end_p (gsi))
1331           {
1332             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1333             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1334             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1335             imm_use_iterator imm_iter;
1336             gimple *stmt;
1337             use_operand_p use_p;
1338
1339             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1340               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1341             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1342             gimple_phi_set_result (new_phi, new_vop);
1343             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1344               if (stmt != new_phi
1345                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1346                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1347                   SET_USE (use_p, new_vop);
1348
1349             return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1350           }
1351         break;
1352       }
1353   return NULL_TREE;
1354 }
1355
1356 /* Function vect_get_loop_location.
1357
1358    Extract the location of the loop in the source code.
1359    If the loop is not well formed for vectorization, an estimated
1360    location is calculated.
1361    Return the loop location if succeed and NULL if not.  */
1362
1363 dump_user_location_t
1364 find_loop_location (class loop *loop)
1365 {
1366   gimple *stmt = NULL;
1367   basic_block bb;
1368   gimple_stmt_iterator si;
1369
1370   if (!loop)
1371     return dump_user_location_t ();
1372
1373   stmt = get_loop_exit_condition (loop);
1374
1375   if (stmt
1376       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1377     return stmt;
1378
1379   /* If we got here the loop is probably not "well formed",
1380      try to estimate the loop location */
1381
1382   if (!loop->header)
1383     return dump_user_location_t ();
1384
1385   bb = loop->header;
1386
1387   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1388     {
1389       stmt = gsi_stmt (si);
1390       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1391         return stmt;
1392     }
1393
1394   return dump_user_location_t ();
1395 }
1396
1397 /* Return true if the phi described by STMT_INFO defines an IV of the
1398    loop to be vectorized.  */
1399
1400 static bool
1401 iv_phi_p (stmt_vec_info stmt_info)
1402 {
1403   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1404   if (virtual_operand_p (PHI_RESULT (phi)))
1405     return false;
1406
1407   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1408       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1409     return false;
1410
1411   return true;
1412 }
1413
1414 /* Function vect_can_advance_ivs_p
1415
1416    In case the number of iterations that LOOP iterates is unknown at compile
1417    time, an epilog loop will be generated, and the loop induction variables
1418    (IVs) will be "advanced" to the value they are supposed to take just before
1419    the epilog loop.  Here we check that the access function of the loop IVs
1420    and the expression that represents the loop bound are simple enough.
1421    These restrictions will be relaxed in the future.  */
1422
1423 bool
1424 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1425 {
1426   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1427   basic_block bb = loop->header;
1428   gphi_iterator gsi;
1429
1430   /* Analyze phi functions of the loop header.  */
1431
1432   if (dump_enabled_p ())
1433     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1434   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1435     {
1436       tree evolution_part;
1437
1438       gphi *phi = gsi.phi ();
1439       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1440       if (dump_enabled_p ())
1441         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1442                          phi_info->stmt);
1443
1444       /* Skip virtual phi's. The data dependences that are associated with
1445          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1446
1447          Skip reduction phis.  */
1448       if (!iv_phi_p (phi_info))
1449         {
1450           if (dump_enabled_p ())
1451             dump_printf_loc (MSG_NOTE, vect_location,
1452                              "reduc or virtual phi. skip.\n");
1453           continue;
1454         }
1455
1456       /* Analyze the evolution function.  */
1457
1458       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1459       if (evolution_part == NULL_TREE)
1460         {
1461           if (dump_enabled_p ())
1462             dump_printf (MSG_MISSED_OPTIMIZATION,
1463                          "No access function or evolution.\n");
1464           return false;
1465         }
1466
1467       /* FORNOW: We do not transform initial conditions of IVs
1468          which evolution functions are not invariants in the loop.  */
1469
1470       if (!expr_invariant_in_loop_p (loop, evolution_part))
1471         {
1472           if (dump_enabled_p ())
1473             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474                              "evolution not invariant in loop.\n");
1475           return false;
1476         }
1477
1478       /* FORNOW: We do not transform initial conditions of IVs
1479          which evolution functions are a polynomial of degree >= 2.  */
1480
1481       if (tree_is_chrec (evolution_part))
1482         {
1483           if (dump_enabled_p ())
1484             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485                              "evolution is chrec.\n");
1486           return false;
1487         }
1488     }
1489
1490   return true;
1491 }
1492
1493
1494 /*   Function vect_update_ivs_after_vectorizer.
1495
1496      "Advance" the induction variables of LOOP to the value they should take
1497      after the execution of LOOP.  This is currently necessary because the
1498      vectorizer does not handle induction variables that are used after the
1499      loop.  Such a situation occurs when the last iterations of LOOP are
1500      peeled, because:
1501      1. We introduced new uses after LOOP for IVs that were not originally used
1502         after LOOP: the IVs of LOOP are now used by an epilog loop.
1503      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1504         times, whereas the loop IVs should be bumped N times.
1505
1506      Input:
1507      - LOOP - a loop that is going to be vectorized. The last few iterations
1508               of LOOP were peeled.
1509      - NITERS - the number of iterations that LOOP executes (before it is
1510                 vectorized). i.e, the number of times the ivs should be bumped.
1511      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1512                   coming out from LOOP on which there are uses of the LOOP ivs
1513                   (this is the path from LOOP->exit to epilog_loop->preheader).
1514
1515                   The new definitions of the ivs are placed in LOOP->exit.
1516                   The phi args associated with the edge UPDATE_E in the bb
1517                   UPDATE_E->dest are updated accordingly.
1518
1519      Assumption 1: Like the rest of the vectorizer, this function assumes
1520      a single loop exit that has a single predecessor.
1521
1522      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1523      organized in the same order.
1524
1525      Assumption 3: The access function of the ivs is simple enough (see
1526      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1527
1528      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1529      coming out of LOOP on which the ivs of LOOP are used (this is the path
1530      that leads to the epilog loop; other paths skip the epilog loop).  This
1531      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1532      needs to have its phis updated.
1533  */
1534
1535 static void
1536 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1537                                   tree niters, edge update_e)
1538 {
1539   gphi_iterator gsi, gsi1;
1540   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1541   basic_block update_bb = update_e->dest;
1542   basic_block exit_bb = single_exit (loop)->dest;
1543
1544   /* Make sure there exists a single-predecessor exit bb:  */
1545   gcc_assert (single_pred_p (exit_bb));
1546   gcc_assert (single_succ_edge (exit_bb) == update_e);
1547
1548   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1549        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1550        gsi_next (&gsi), gsi_next (&gsi1))
1551     {
1552       tree init_expr;
1553       tree step_expr, off;
1554       tree type;
1555       tree var, ni, ni_name;
1556       gimple_stmt_iterator last_gsi;
1557
1558       gphi *phi = gsi.phi ();
1559       gphi *phi1 = gsi1.phi ();
1560       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1561       if (dump_enabled_p ())
1562         dump_printf_loc (MSG_NOTE, vect_location,
1563                          "vect_update_ivs_after_vectorizer: phi: %G", phi);
1564
1565       /* Skip reduction and virtual phis.  */
1566       if (!iv_phi_p (phi_info))
1567         {
1568           if (dump_enabled_p ())
1569             dump_printf_loc (MSG_NOTE, vect_location,
1570                              "reduc or virtual phi. skip.\n");
1571           continue;
1572         }
1573
1574       type = TREE_TYPE (gimple_phi_result (phi));
1575       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1576       step_expr = unshare_expr (step_expr);
1577
1578       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1579          of degree >= 2 or exponential.  */
1580       gcc_assert (!tree_is_chrec (step_expr));
1581
1582       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1583
1584       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1585                          fold_convert (TREE_TYPE (step_expr), niters),
1586                          step_expr);
1587       if (POINTER_TYPE_P (type))
1588         ni = fold_build_pointer_plus (init_expr, off);
1589       else
1590         ni = fold_build2 (PLUS_EXPR, type,
1591                           init_expr, fold_convert (type, off));
1592
1593       var = create_tmp_var (type, "tmp");
1594
1595       last_gsi = gsi_last_bb (exit_bb);
1596       gimple_seq new_stmts = NULL;
1597       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1598       /* Exit_bb shouldn't be empty.  */
1599       if (!gsi_end_p (last_gsi))
1600         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1601       else
1602         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1603
1604       /* Fix phi expressions in the successor bb.  */
1605       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1606     }
1607 }
1608
1609 /* Return a gimple value containing the misalignment (measured in vector
1610    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1611    it is away from a perfectly aligned address.  Add any new statements
1612    to SEQ.  */
1613
1614 static tree
1615 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1616 {
1617   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1618   stmt_vec_info stmt_info = dr_info->stmt;
1619   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1620
1621   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1622   unsigned HOST_WIDE_INT target_align_c;
1623   tree target_align_minus_1;
1624
1625   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1626                                         size_zero_node) < 0;
1627   tree offset = (negative
1628                  ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1629                  : size_zero_node);
1630   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1631                                                           stmt_info, seq,
1632                                                           offset);
1633   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1634   if (target_align.is_constant (&target_align_c))
1635     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1636   else
1637     {
1638       tree vla = build_int_cst (type, target_align);
1639       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1640                                     fold_build2 (MINUS_EXPR, type,
1641                                                  build_int_cst (type, 0), vla));
1642       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1643                                           build_int_cst (type, 1));
1644     }
1645
1646   HOST_WIDE_INT elem_size
1647     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1648   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1649
1650   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1651   tree int_start_addr = fold_convert (type, start_addr);
1652   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1653                                         target_align_minus_1);
1654
1655   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1656   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1657                                         elem_size_log);
1658
1659   return misalign_in_elems;
1660 }
1661
1662 /* Function vect_gen_prolog_loop_niters
1663
1664    Generate the number of iterations which should be peeled as prolog for the
1665    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1666    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1667    As a result, after the execution of this loop, the data reference DR will
1668    refer to an aligned location.  The following computation is generated:
1669
1670    If the misalignment of DR is known at compile time:
1671      addr_mis = int mis = DR_MISALIGNMENT (dr);
1672    Else, compute address misalignment in bytes:
1673      addr_mis = addr & (target_align - 1)
1674
1675    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1676
1677    (elem_size = element type size; an element is the scalar element whose type
1678    is the inner type of the vectype)
1679
1680    The computations will be emitted at the end of BB.  We also compute and
1681    store upper bound (included) of the result in BOUND.
1682
1683    When the step of the data-ref in the loop is not 1 (as in interleaved data
1684    and SLP), the number of iterations of the prolog must be divided by the step
1685    (which is equal to the size of interleaved group).
1686
1687    The above formulas assume that VF == number of elements in the vector. This
1688    may not hold when there are multiple-types in the loop.
1689    In this case, for some data-references in the loop the VF does not represent
1690    the number of elements that fit in the vector.  Therefore, instead of VF we
1691    use TYPE_VECTOR_SUBPARTS.  */
1692
1693 static tree
1694 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1695                              basic_block bb, int *bound)
1696 {
1697   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1698   tree var;
1699   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1700   gimple_seq stmts = NULL, new_stmts = NULL;
1701   tree iters, iters_name;
1702   stmt_vec_info stmt_info = dr_info->stmt;
1703   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1704   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1705
1706   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1707     {
1708       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1709
1710       if (dump_enabled_p ())
1711         dump_printf_loc (MSG_NOTE, vect_location,
1712                          "known peeling = %d.\n", npeel);
1713
1714       iters = build_int_cst (niters_type, npeel);
1715       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1716     }
1717   else
1718     {
1719       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1720       tree type = TREE_TYPE (misalign_in_elems);
1721       HOST_WIDE_INT elem_size
1722         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1723       /* We only do prolog peeling if the target alignment is known at compile
1724          time.  */
1725       poly_uint64 align_in_elems =
1726         exact_div (target_align, elem_size);
1727       tree align_in_elems_minus_1 =
1728         build_int_cst (type, align_in_elems - 1);
1729       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1730
1731       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1732                                  & (align_in_elems - 1)).  */
1733       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1734                                             size_zero_node) < 0;
1735       if (negative)
1736         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1737                              align_in_elems_tree);
1738       else
1739         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1740                              misalign_in_elems);
1741       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1742       iters = fold_convert (niters_type, iters);
1743       unsigned HOST_WIDE_INT align_in_elems_c;
1744       if (align_in_elems.is_constant (&align_in_elems_c))
1745         *bound = align_in_elems_c - 1;
1746       else
1747         *bound = -1;
1748     }
1749
1750   if (dump_enabled_p ())
1751     dump_printf_loc (MSG_NOTE, vect_location,
1752                      "niters for prolog loop: %T\n", iters);
1753
1754   var = create_tmp_var (niters_type, "prolog_loop_niters");
1755   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1756
1757   if (new_stmts)
1758     gimple_seq_add_seq (&stmts, new_stmts);
1759   if (stmts)
1760     {
1761       gcc_assert (single_succ_p (bb));
1762       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1763       if (gsi_end_p (gsi))
1764         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1765       else
1766         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1767     }
1768   return iters_name;
1769 }
1770
1771
1772 /* Function vect_update_init_of_dr
1773
1774    If CODE is PLUS, the vector loop starts NITERS iterations after the
1775    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1776    iterations before the scalar one (using masking to skip inactive
1777    elements).  This function updates the information recorded in DR to
1778    account for the difference.  Specifically, it updates the OFFSET
1779    field of DR_INFO.  */
1780
1781 static void
1782 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1783 {
1784   struct data_reference *dr = dr_info->dr;
1785   tree offset = dr_info->offset;
1786   if (!offset)
1787     offset = build_zero_cst (sizetype);
1788
1789   niters = fold_build2 (MULT_EXPR, sizetype,
1790                         fold_convert (sizetype, niters),
1791                         fold_convert (sizetype, DR_STEP (dr)));
1792   offset = fold_build2 (code, sizetype,
1793                         fold_convert (sizetype, offset), niters);
1794   dr_info->offset = offset;
1795 }
1796
1797
1798 /* Function vect_update_inits_of_drs
1799
1800    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1801    CODE and NITERS are as for vect_update_inits_of_dr.  */
1802
1803 void
1804 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1805                           tree_code code)
1806 {
1807   unsigned int i;
1808   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1809   struct data_reference *dr;
1810
1811   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1812
1813   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1814      here, but since we might use these niters to update the epilogues niters
1815      and data references we can't insert them here as this definition might not
1816      always dominate its uses.  */
1817   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1818     niters = fold_convert (sizetype, niters);
1819
1820   FOR_EACH_VEC_ELT (datarefs, i, dr)
1821     {
1822       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1823       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1824         vect_update_init_of_dr (dr_info, niters, code);
1825     }
1826 }
1827
1828 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1829    by masking.  This involves calculating the number of iterations to
1830    be peeled and then aligning all memory references appropriately.  */
1831
1832 void
1833 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1834 {
1835   tree misalign_in_elems;
1836   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1837
1838   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1839
1840   /* From the information recorded in LOOP_VINFO get the number of iterations
1841      that need to be skipped via masking.  */
1842   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1843     {
1844       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1845                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1846       misalign_in_elems = build_int_cst (type, misalign);
1847     }
1848   else
1849     {
1850       gimple_seq seq1 = NULL, seq2 = NULL;
1851       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1852       misalign_in_elems = fold_convert (type, misalign_in_elems);
1853       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1854                                                 &seq2, true, NULL_TREE);
1855       gimple_seq_add_seq (&seq1, seq2);
1856       if (seq1)
1857         {
1858           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1859           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1860           gcc_assert (!new_bb);
1861         }
1862     }
1863
1864   if (dump_enabled_p ())
1865     dump_printf_loc (MSG_NOTE, vect_location,
1866                      "misalignment for fully-masked loop: %T\n",
1867                      misalign_in_elems);
1868
1869   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1870
1871   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1872 }
1873
1874 /* This function builds ni_name = number of iterations.  Statements
1875    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1876    it to TRUE if new ssa_var is generated.  */
1877
1878 tree
1879 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1880 {
1881   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1882   if (TREE_CODE (ni) == INTEGER_CST)
1883     return ni;
1884   else
1885     {
1886       tree ni_name, var;
1887       gimple_seq stmts = NULL;
1888       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1889
1890       var = create_tmp_var (TREE_TYPE (ni), "niters");
1891       ni_name = force_gimple_operand (ni, &stmts, false, var);
1892       if (stmts)
1893         {
1894           gsi_insert_seq_on_edge_immediate (pe, stmts);
1895           if (new_var_p != NULL)
1896             *new_var_p = true;
1897         }
1898
1899       return ni_name;
1900     }
1901 }
1902
1903 /* Calculate the number of iterations above which vectorized loop will be
1904    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1905    of prolog loop.  If it's integer const, the integer number is also passed
1906    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1907    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1908    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1909    threshold below which the scalar (rather than vectorized) loop will be
1910    executed.  This function stores the upper bound (inclusive) of the result
1911    in BOUND_SCALAR.  */
1912
1913 static tree
1914 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1915                              int bound_prolog, poly_int64 bound_epilog, int th,
1916                              poly_uint64 *bound_scalar,
1917                              bool check_profitability)
1918 {
1919   tree type = TREE_TYPE (niters_prolog);
1920   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1921                              build_int_cst (type, bound_epilog));
1922
1923   *bound_scalar = bound_prolog + bound_epilog;
1924   if (check_profitability)
1925     {
1926       /* TH indicates the minimum niters of vectorized loop, while we
1927          compute the maximum niters of scalar loop.  */
1928       th--;
1929       /* Peeling for constant times.  */
1930       if (int_niters_prolog >= 0)
1931         {
1932           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1933           return build_int_cst (type, *bound_scalar);
1934         }
1935       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1936          and BOUND_EPILOG are inclusive upper bounds.  */
1937       if (known_ge (th, bound_prolog + bound_epilog))
1938         {
1939           *bound_scalar = th;
1940           return build_int_cst (type, th);
1941         }
1942       /* Need to do runtime comparison.  */
1943       else if (maybe_gt (th, bound_epilog))
1944         {
1945           *bound_scalar = upper_bound (*bound_scalar, th);
1946           return fold_build2 (MAX_EXPR, type,
1947                               build_int_cst (type, th), niters);
1948         }
1949     }
1950   return niters;
1951 }
1952
1953 /* NITERS is the number of times that the original scalar loop executes
1954    after peeling.  Work out the maximum number of iterations N that can
1955    be handled by the vectorized form of the loop and then either:
1956
1957    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1958
1959         niters_vector = N
1960
1961    b) set *STEP_VECTOR_PTR to one and generate:
1962
1963         niters_vector = N / vf
1964
1965    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1966    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1967    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1968
1969 void
1970 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1971                              tree *niters_vector_ptr, tree *step_vector_ptr,
1972                              bool niters_no_overflow)
1973 {
1974   tree ni_minus_gap, var;
1975   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1976   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1978   tree log_vf = NULL_TREE;
1979
1980   /* If epilogue loop is required because of data accesses with gaps, we
1981      subtract one iteration from the total number of iterations here for
1982      correct calculation of RATIO.  */
1983   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1984     {
1985       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1986                                   build_one_cst (type));
1987       if (!is_gimple_val (ni_minus_gap))
1988         {
1989           var = create_tmp_var (type, "ni_gap");
1990           gimple *stmts = NULL;
1991           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1992                                                true, var);
1993           gsi_insert_seq_on_edge_immediate (pe, stmts);
1994         }
1995     }
1996   else
1997     ni_minus_gap = niters;
1998
1999   unsigned HOST_WIDE_INT const_vf;
2000   if (vf.is_constant (&const_vf)
2001       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2002     {
2003       /* Create: niters >> log2(vf) */
2004       /* If it's known that niters == number of latch executions + 1 doesn't
2005          overflow, we can generate niters >> log2(vf); otherwise we generate
2006          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2007          will be at least one.  */
2008       log_vf = build_int_cst (type, exact_log2 (const_vf));
2009       if (niters_no_overflow)
2010         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2011       else
2012         niters_vector
2013           = fold_build2 (PLUS_EXPR, type,
2014                          fold_build2 (RSHIFT_EXPR, type,
2015                                       fold_build2 (MINUS_EXPR, type,
2016                                                    ni_minus_gap,
2017                                                    build_int_cst (type, vf)),
2018                                       log_vf),
2019                          build_int_cst (type, 1));
2020       step_vector = build_one_cst (type);
2021     }
2022   else
2023     {
2024       niters_vector = ni_minus_gap;
2025       step_vector = build_int_cst (type, vf);
2026     }
2027
2028   if (!is_gimple_val (niters_vector))
2029     {
2030       var = create_tmp_var (type, "bnd");
2031       gimple_seq stmts = NULL;
2032       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2033       gsi_insert_seq_on_edge_immediate (pe, stmts);
2034       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2035          we set range information to make niters analyzer's life easier.
2036          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2037          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2038       if (stmts != NULL && log_vf)
2039         {
2040           if (niters_no_overflow)
2041             set_range_info (niters_vector, VR_RANGE,
2042                             wi::one (TYPE_PRECISION (type)),
2043                             wi::rshift (wi::max_value (TYPE_PRECISION (type),
2044                                                        TYPE_SIGN (type)),
2045                                         exact_log2 (const_vf),
2046                                         TYPE_SIGN (type)));
2047           /* For VF == 1 the vector IV might also overflow so we cannot
2048              assert a minimum value of 1.  */
2049           else if (const_vf > 1)
2050             set_range_info (niters_vector, VR_RANGE,
2051                             wi::one (TYPE_PRECISION (type)),
2052                             wi::rshift (wi::max_value (TYPE_PRECISION (type),
2053                                                        TYPE_SIGN (type))
2054                                         - (const_vf - 1),
2055                                         exact_log2 (const_vf), TYPE_SIGN (type))
2056                             + 1);
2057         }
2058     }
2059   *niters_vector_ptr = niters_vector;
2060   *step_vector_ptr = step_vector;
2061
2062   return;
2063 }
2064
2065 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2066    loop specified by LOOP_VINFO after vectorization, compute the number
2067    of iterations before vectorization (niters_vector * vf) and store it
2068    to NITERS_VECTOR_MULT_VF_PTR.  */
2069
2070 static void
2071 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2072                                      tree niters_vector,
2073                                      tree *niters_vector_mult_vf_ptr)
2074 {
2075   /* We should be using a step_vector of VF if VF is variable.  */
2076   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2077   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2078   tree type = TREE_TYPE (niters_vector);
2079   tree log_vf = build_int_cst (type, exact_log2 (vf));
2080   basic_block exit_bb = single_exit (loop)->dest;
2081
2082   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2083   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2084                                             niters_vector, log_vf);
2085   if (!is_gimple_val (niters_vector_mult_vf))
2086     {
2087       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2088       gimple_seq stmts = NULL;
2089       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2090                                                     &stmts, true, var);
2091       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2092       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2093     }
2094   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2095 }
2096
2097 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2098    this function searches for the corresponding lcssa phi node in exit
2099    bb of LOOP.  If it is found, return the phi result; otherwise return
2100    NULL.  */
2101
2102 static tree
2103 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2104                 gphi *lcssa_phi)
2105 {
2106   gphi_iterator gsi;
2107   edge e = single_exit (loop);
2108
2109   gcc_assert (single_pred_p (e->dest));
2110   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2111     {
2112       gphi *phi = gsi.phi ();
2113       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2114                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2115         return PHI_RESULT (phi);
2116     }
2117   return NULL_TREE;
2118 }
2119
2120 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2121    from SECOND/FIRST and puts it at the original loop's preheader/exit
2122    edge, the two loops are arranged as below:
2123
2124        preheader_a:
2125      first_loop:
2126        header_a:
2127          i_1 = PHI<i_0, i_2>;
2128          ...
2129          i_2 = i_1 + 1;
2130          if (cond_a)
2131            goto latch_a;
2132          else
2133            goto between_bb;
2134        latch_a:
2135          goto header_a;
2136
2137        between_bb:
2138          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2139
2140      second_loop:
2141        header_b:
2142          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2143                                  or with i_2 if no LCSSA phi is created
2144                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2145          ...
2146          i_4 = i_3 + 1;
2147          if (cond_b)
2148            goto latch_b;
2149          else
2150            goto exit_bb;
2151        latch_b:
2152          goto header_b;
2153
2154        exit_bb:
2155
2156    This function creates loop closed SSA for the first loop; update the
2157    second loop's PHI nodes by replacing argument on incoming edge with the
2158    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2159    is false, Loop closed ssa phis will only be created for non-iv phis for
2160    the first loop.
2161
2162    This function assumes exit bb of the first loop is preheader bb of the
2163    second loop, i.e, between_bb in the example code.  With PHIs updated,
2164    the second loop will execute rest iterations of the first.  */
2165
2166 static void
2167 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2168                                    class loop *first, class loop *second,
2169                                    bool create_lcssa_for_iv_phis)
2170 {
2171   gphi_iterator gsi_update, gsi_orig;
2172   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2173
2174   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2175   edge second_preheader_e = loop_preheader_edge (second);
2176   basic_block between_bb = single_exit (first)->dest;
2177
2178   gcc_assert (between_bb == second_preheader_e->src);
2179   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2180   /* Either the first loop or the second is the loop to be vectorized.  */
2181   gcc_assert (loop == first || loop == second);
2182
2183   for (gsi_orig = gsi_start_phis (first->header),
2184        gsi_update = gsi_start_phis (second->header);
2185        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2186        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2187     {
2188       gphi *orig_phi = gsi_orig.phi ();
2189       gphi *update_phi = gsi_update.phi ();
2190
2191       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2192       /* Generate lcssa PHI node for the first loop.  */
2193       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2194       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2195       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2196         {
2197           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2198           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2199           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2200           arg = new_res;
2201         }
2202
2203       /* Update PHI node in the second loop by replacing arg on the loop's
2204          incoming edge.  */
2205       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2206     }
2207
2208   /* For epilogue peeling we have to make sure to copy all LC PHIs
2209      for correct vectorization of live stmts.  */
2210   if (loop == first)
2211     {
2212       basic_block orig_exit = single_exit (second)->dest;
2213       for (gsi_orig = gsi_start_phis (orig_exit);
2214            !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2215         {
2216           gphi *orig_phi = gsi_orig.phi ();
2217           tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2218           if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2219             continue;
2220
2221           /* Already created in the above loop.   */
2222           if (find_guard_arg (first, second, orig_phi))
2223             continue;
2224
2225           tree new_res = copy_ssa_name (orig_arg);
2226           gphi *lcphi = create_phi_node (new_res, between_bb);
2227           add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2228         }
2229     }
2230 }
2231
2232 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2233    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2234    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2235    appear like below:
2236
2237        guard_bb:
2238          if (cond)
2239            goto merge_bb;
2240          else
2241            goto skip_loop;
2242
2243      skip_loop:
2244        header_a:
2245          i_1 = PHI<i_0, i_2>;
2246          ...
2247          i_2 = i_1 + 1;
2248          if (cond_a)
2249            goto latch_a;
2250          else
2251            goto exit_a;
2252        latch_a:
2253          goto header_a;
2254
2255        exit_a:
2256          i_5 = PHI<i_2>;
2257
2258        merge_bb:
2259          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2260
2261      update_loop:
2262        header_b:
2263          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2264          ...
2265          i_4 = i_3 + 1;
2266          if (cond_b)
2267            goto latch_b;
2268          else
2269            goto exit_bb;
2270        latch_b:
2271          goto header_b;
2272
2273        exit_bb:
2274
2275    This function creates PHI nodes at merge_bb and replaces the use of i_5
2276    in the update_loop's PHI node with the result of new PHI result.  */
2277
2278 static void
2279 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2280                                     class loop *update_loop,
2281                                     edge guard_edge, edge merge_edge)
2282 {
2283   location_t merge_loc, guard_loc;
2284   edge orig_e = loop_preheader_edge (skip_loop);
2285   edge update_e = loop_preheader_edge (update_loop);
2286   gphi_iterator gsi_orig, gsi_update;
2287
2288   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2289         gsi_update = gsi_start_phis (update_loop->header));
2290        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2291        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2292     {
2293       gphi *orig_phi = gsi_orig.phi ();
2294       gphi *update_phi = gsi_update.phi ();
2295
2296       /* Generate new phi node at merge bb of the guard.  */
2297       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2298       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2299
2300       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2301          args in NEW_PHI for these edges.  */
2302       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2303       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2304       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2305       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2306       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2307       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2308
2309       /* Update phi in UPDATE_PHI.  */
2310       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2311     }
2312 }
2313
2314 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2315    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2316    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2317    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2318    The CFG looks like:
2319
2320      loop:
2321        header_a:
2322          i_1 = PHI<i_0, i_2>;
2323          ...
2324          i_2 = i_1 + 1;
2325          if (cond_a)
2326            goto latch_a;
2327          else
2328            goto exit_a;
2329        latch_a:
2330          goto header_a;
2331
2332        exit_a:
2333
2334        guard_bb:
2335          if (cond)
2336            goto merge_bb;
2337          else
2338            goto epilog_loop;
2339
2340        ;; fall_through_bb
2341
2342      epilog_loop:
2343        header_b:
2344          i_3 = PHI<i_2, i_4>;
2345          ...
2346          i_4 = i_3 + 1;
2347          if (cond_b)
2348            goto latch_b;
2349          else
2350            goto merge_bb;
2351        latch_b:
2352          goto header_b;
2353
2354        merge_bb:
2355          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2356
2357        exit_bb:
2358          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2359
2360    For each name used out side EPILOG (i.e - for each name that has a lcssa
2361    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2362    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2363    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2364    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2365    in exit_bb will also be updated.  */
2366
2367 static void
2368 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2369                                     edge guard_edge, edge merge_edge)
2370 {
2371   gphi_iterator gsi;
2372   basic_block merge_bb = guard_edge->dest;
2373
2374   gcc_assert (single_succ_p (merge_bb));
2375   edge e = single_succ_edge (merge_bb);
2376   basic_block exit_bb = e->dest;
2377   gcc_assert (single_pred_p (exit_bb));
2378   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2379
2380   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2381     {
2382       gphi *update_phi = gsi.phi ();
2383       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2384
2385       tree merge_arg = NULL_TREE;
2386
2387       /* If the old argument is a SSA_NAME use its current_def.  */
2388       if (TREE_CODE (old_arg) == SSA_NAME)
2389         merge_arg = get_current_def (old_arg);
2390       /* If it's a constant or doesn't have a current_def, just use the old
2391          argument.  */
2392       if (!merge_arg)
2393         merge_arg = old_arg;
2394
2395       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2396       /* If the var is live after loop but not a reduction, we simply
2397          use the old arg.  */
2398       if (!guard_arg)
2399         guard_arg = old_arg;
2400
2401       /* Create new phi node in MERGE_BB:  */
2402       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2403       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2404
2405       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2406          the two PHI args in merge_phi for these edges.  */
2407       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2408       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2409
2410       /* Update the original phi in exit_bb.  */
2411       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2412     }
2413 }
2414
2415 /* EPILOG loop is duplicated from the original loop for vectorizing,
2416    the arg of its loop closed ssa PHI needs to be updated.  */
2417
2418 static void
2419 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2420 {
2421   gphi_iterator gsi;
2422   basic_block exit_bb = single_exit (epilog)->dest;
2423
2424   gcc_assert (single_pred_p (exit_bb));
2425   edge e = EDGE_PRED (exit_bb, 0);
2426   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2427     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2428 }
2429
2430 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2431    iterate exactly CONST_NITERS times.  Make a final decision about
2432    whether the epilogue loop should be used, returning true if so.  */
2433
2434 static bool
2435 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2436                              unsigned HOST_WIDE_INT const_niters)
2437 {
2438   /* Avoid wrap-around when computing const_niters - 1.  Also reject
2439      using an epilogue loop for a single scalar iteration, even if
2440      we could in principle implement that using partial vectors.  */
2441   unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2442   if (const_niters <= gap_niters + 1)
2443     return false;
2444
2445   /* Install the number of iterations.  */
2446   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2447   tree niters_tree = build_int_cst (niters_type, const_niters);
2448   tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2449
2450   LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2451   LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2452
2453   /* Decide what to do if the number of epilogue iterations is not
2454      a multiple of the epilogue loop's vectorization factor.  */
2455   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2456 }
2457
2458 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2459    Return a value that equals:
2460
2461    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2462    - SKIP_VALUE when the main loop is skipped.  */
2463
2464 tree
2465 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2466                            tree skip_value)
2467 {
2468   gcc_assert (loop_vinfo->main_loop_edge);
2469
2470   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2471   basic_block bb = loop_vinfo->main_loop_edge->dest;
2472   gphi *new_phi = create_phi_node (phi_result, bb);
2473   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2474                UNKNOWN_LOCATION);
2475   add_phi_arg (new_phi, skip_value,
2476                loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2477   return phi_result;
2478 }
2479
2480 /* Function vect_do_peeling.
2481
2482    Input:
2483    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2484
2485        preheader:
2486      LOOP:
2487        header_bb:
2488          loop_body
2489          if (exit_loop_cond) goto exit_bb
2490          else                goto header_bb
2491        exit_bb:
2492
2493    - NITERS: The number of iterations of the loop.
2494    - NITERSM1: The number of iterations of the loop's latch.
2495    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2496    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2497                               CHECK_PROFITABILITY is true.
2498    Output:
2499    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2500      iterate after vectorization; see vect_set_loop_condition for details.
2501    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2502      should be set to the number of scalar iterations handled by the
2503      vector loop.  The SSA name is only used on exit from the loop.
2504
2505    This function peels prolog and epilog from the loop, adds guards skipping
2506    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2507    would look like:
2508
2509        guard_bb_1:
2510          if (prefer_scalar_loop) goto merge_bb_1
2511          else                    goto guard_bb_2
2512
2513        guard_bb_2:
2514          if (skip_prolog) goto merge_bb_2
2515          else             goto prolog_preheader
2516
2517        prolog_preheader:
2518      PROLOG:
2519        prolog_header_bb:
2520          prolog_body
2521          if (exit_prolog_cond) goto prolog_exit_bb
2522          else                  goto prolog_header_bb
2523        prolog_exit_bb:
2524
2525        merge_bb_2:
2526
2527        vector_preheader:
2528      VECTOR LOOP:
2529        vector_header_bb:
2530          vector_body
2531          if (exit_vector_cond) goto vector_exit_bb
2532          else                  goto vector_header_bb
2533        vector_exit_bb:
2534
2535        guard_bb_3:
2536          if (skip_epilog) goto merge_bb_3
2537          else             goto epilog_preheader
2538
2539        merge_bb_1:
2540
2541        epilog_preheader:
2542      EPILOG:
2543        epilog_header_bb:
2544          epilog_body
2545          if (exit_epilog_cond) goto merge_bb_3
2546          else                  goto epilog_header_bb
2547
2548        merge_bb_3:
2549
2550    Note this function peels prolog and epilog only if it's necessary,
2551    as well as guards.
2552    This function returns the epilogue loop if a decision was made to vectorize
2553    it, otherwise NULL.
2554
2555    The analysis resulting in this epilogue loop's loop_vec_info was performed
2556    in the same vect_analyze_loop call as the main loop's.  At that time
2557    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2558    vectorization factors than the main loop.  This list is stored in the main
2559    loop's loop_vec_info in the 'epilogue_vinfos' member.  Everytime we decide to
2560    vectorize the epilogue loop for a lower vectorization factor,  the
2561    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2562    updated and linked to the epilogue loop.  This is later used to vectorize
2563    the epilogue.  The reason the loop_vec_info needs updating is that it was
2564    constructed based on the original main loop, and the epilogue loop is a
2565    copy of this loop, so all links pointing to statements in the original loop
2566    need updating.  Furthermore, these loop_vec_infos share the
2567    data_reference's records, which will also need to be updated.
2568
2569    TODO: Guard for prefer_scalar_loop should be emitted along with
2570    versioning conditions if loop versioning is needed.  */
2571
2572
2573 class loop *
2574 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2575                  tree *niters_vector, tree *step_vector,
2576                  tree *niters_vector_mult_vf_var, int th,
2577                  bool check_profitability, bool niters_no_overflow,
2578                  tree *advance)
2579 {
2580   edge e, guard_e;
2581   tree type = TREE_TYPE (niters), guard_cond;
2582   basic_block guard_bb, guard_to;
2583   profile_probability prob_prolog, prob_vector, prob_epilog;
2584   int estimated_vf;
2585   int prolog_peeling = 0;
2586   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2587   bool vect_epilogues_updated_niters = false;
2588   /* We currently do not support prolog peeling if the target alignment is not
2589      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2590      target alignment being constant.  */
2591   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2592   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2593     return NULL;
2594
2595   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2596     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2597
2598   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2599   poly_uint64 bound_epilog = 0;
2600   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2601       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602     bound_epilog += vf - 1;
2603   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2604     bound_epilog += 1;
2605   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2606   poly_uint64 bound_scalar = bound_epilog;
2607
2608   if (!prolog_peeling && !epilog_peeling)
2609     return NULL;
2610
2611   /* Before doing any peeling make sure to reset debug binds outside of
2612      the loop refering to defs not in LC SSA.  */
2613   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2614   for (unsigned i = 0; i < loop->num_nodes; ++i)
2615     {
2616       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2617       imm_use_iterator ui;
2618       gimple *use_stmt;
2619       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2620            gsi_next (&gsi))
2621         {
2622           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2623             if (gimple_debug_bind_p (use_stmt)
2624                 && loop != gimple_bb (use_stmt)->loop_father
2625                 && !flow_loop_nested_p (loop,
2626                                         gimple_bb (use_stmt)->loop_father))
2627               {
2628                 gimple_debug_bind_reset_value (use_stmt);
2629                 update_stmt (use_stmt);
2630               }
2631         }
2632       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2633            gsi_next (&gsi))
2634         {
2635           ssa_op_iter op_iter;
2636           def_operand_p def_p;
2637           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2638             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2639               if (gimple_debug_bind_p (use_stmt)
2640                   && loop != gimple_bb (use_stmt)->loop_father
2641                   && !flow_loop_nested_p (loop,
2642                                           gimple_bb (use_stmt)->loop_father))
2643                 {
2644                   gimple_debug_bind_reset_value (use_stmt);
2645                   update_stmt (use_stmt);
2646                 }
2647         }
2648     }
2649
2650   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2651   estimated_vf = vect_vf_for_cost (loop_vinfo);
2652   if (estimated_vf == 2)
2653     estimated_vf = 3;
2654   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2655                         .apply_scale (estimated_vf - 1, estimated_vf);
2656
2657   class loop *prolog, *epilog = NULL;
2658   class loop *first_loop = loop;
2659   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2660
2661   /* We might have a queued need to update virtual SSA form.  As we
2662      delete the update SSA machinery below after doing a regular
2663      incremental SSA update during loop copying make sure we don't
2664      lose that fact.
2665      ???  Needing to update virtual SSA form by renaming is unfortunate
2666      but not all of the vectorizer code inserting new loads / stores
2667      properly assigns virtual operands to those statements.  */
2668   update_ssa (TODO_update_ssa_only_virtuals);
2669
2670   create_lcssa_for_virtual_phi (loop);
2671
2672   /* If we're vectorizing an epilogue loop, the update_ssa above will
2673      have ensured that the virtual operand is in SSA form throughout the
2674      vectorized main loop.  Normally it is possible to trace the updated
2675      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2676      back to scalar-stmt vuses, meaning that the effect of the SSA update
2677      remains local to the main loop.  However, there are rare cases in
2678      which the vectorized loop has vdefs even when the original scalar
2679      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2680      introduces clobbers of the temporary vector array, which in turn
2681      needs new vdefs.  If the scalar loop doesn't write to memory, these
2682      new vdefs will be the only ones in the vector loop.
2683
2684      In that case, update_ssa will have added a new virtual phi to the
2685      main loop, which previously didn't need one.  Ensure that we (locally)
2686      maintain LCSSA form for the virtual operand, just as we would have
2687      done if the virtual phi had existed from the outset.  This makes it
2688      easier to duplicate the scalar epilogue loop below.  */
2689   tree vop_to_rename = NULL_TREE;
2690   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2691     {
2692       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2693       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2694     }
2695
2696   if (MAY_HAVE_DEBUG_BIND_STMTS)
2697     {
2698       gcc_assert (!adjust_vec.exists ());
2699       adjust_vec.create (32);
2700     }
2701   initialize_original_copy_tables ();
2702
2703   /* Record the anchor bb at which the guard should be placed if the scalar
2704      loop might be preferred.  */
2705   basic_block anchor = loop_preheader_edge (loop)->src;
2706
2707   /* Generate the number of iterations for the prolog loop.  We do this here
2708      so that we can also get the upper bound on the number of iterations.  */
2709   tree niters_prolog;
2710   int bound_prolog = 0;
2711   if (prolog_peeling)
2712     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2713                                                   &bound_prolog);
2714   else
2715     niters_prolog = build_int_cst (type, 0);
2716
2717   loop_vec_info epilogue_vinfo = NULL;
2718   if (vect_epilogues)
2719     {
2720       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2721       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2722     }
2723
2724   tree niters_vector_mult_vf = NULL_TREE;
2725   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2726   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2727   edge update_e = NULL, skip_e = NULL;
2728   unsigned int lowest_vf = constant_lower_bound (vf);
2729   /* If we know the number of scalar iterations for the main loop we should
2730      check whether after the main loop there are enough iterations left over
2731      for the epilogue.  */
2732   if (vect_epilogues
2733       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2734       && prolog_peeling >= 0
2735       && known_eq (vf, lowest_vf))
2736     {
2737       unsigned HOST_WIDE_INT eiters
2738         = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2739            - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2740
2741       eiters -= prolog_peeling;
2742       eiters
2743         = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2744
2745       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2746         {
2747           delete epilogue_vinfo;
2748           epilogue_vinfo = NULL;
2749           if (loop_vinfo->epilogue_vinfos.length () == 0)
2750             {
2751               vect_epilogues = false;
2752               break;
2753             }
2754           epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2755           loop_vinfo->epilogue_vinfos.ordered_remove (0);
2756         }
2757       vect_epilogues_updated_niters = true;
2758     }
2759   /* Prolog loop may be skipped.  */
2760   bool skip_prolog = (prolog_peeling != 0);
2761   /* Skip this loop to epilog when there are not enough iterations to enter this
2762      vectorized loop.  If true we should perform runtime checks on the NITERS
2763      to check whether we should skip the current vectorized loop.  If we know
2764      the number of scalar iterations we may choose to add a runtime check if
2765      this number "maybe" smaller than the number of iterations required
2766      when we know the number of scalar iterations may potentially
2767      be smaller than the number of iterations required to enter this loop, for
2768      this we use the upper bounds on the prolog and epilog peeling.  When we
2769      don't know the number of iterations and don't require versioning it is
2770      because we have asserted that there are enough scalar iterations to enter
2771      the main loop, so this skip is not necessary.  When we are versioning then
2772      we only add such a skip if we have chosen to vectorize the epilogue.  */
2773   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2774                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2775                                   bound_prolog + bound_epilog)
2776                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2777                          || vect_epilogues));
2778   /* Epilog loop must be executed if the number of iterations for epilog
2779      loop is known at compile time, otherwise we need to add a check at
2780      the end of vector loop and skip to the end of epilog loop.  */
2781   bool skip_epilog = (prolog_peeling < 0
2782                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2783                       || !vf.is_constant ());
2784   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2785   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2786     skip_epilog = false;
2787
2788   if (skip_vector)
2789     {
2790       split_edge (loop_preheader_edge (loop));
2791
2792       /* Due to the order in which we peel prolog and epilog, we first
2793          propagate probability to the whole loop.  The purpose is to
2794          avoid adjusting probabilities of both prolog and vector loops
2795          separately.  Note in this case, the probability of epilog loop
2796          needs to be scaled back later.  */
2797       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2798       if (prob_vector.initialized_p ())
2799         {
2800           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2801           scale_loop_profile (loop, prob_vector, 0);
2802         }
2803     }
2804
2805   dump_user_location_t loop_loc = find_loop_location (loop);
2806   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2807   if (vect_epilogues)
2808     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2809        use the original scalar loop as remaining epilogue if necessary.  */
2810     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2811       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2812
2813   if (prolog_peeling)
2814     {
2815       e = loop_preheader_edge (loop);
2816       if (!slpeel_can_duplicate_loop_p (loop, e))
2817         {
2818           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2819                            "loop can't be duplicated to preheader edge.\n");
2820           gcc_unreachable ();
2821         }
2822       /* Peel prolog and put it on preheader edge of loop.  */
2823       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2824       if (!prolog)
2825         {
2826           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2827                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2828           gcc_unreachable ();
2829         }
2830       prolog->force_vectorize = false;
2831       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2832       first_loop = prolog;
2833       reset_original_copy_tables ();
2834
2835       /* Update the number of iterations for prolog loop.  */
2836       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2837       vect_set_loop_condition (prolog, NULL, niters_prolog,
2838                                step_prolog, NULL_TREE, false);
2839
2840       /* Skip the prolog loop.  */
2841       if (skip_prolog)
2842         {
2843           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2844                                     niters_prolog, build_int_cst (type, 0));
2845           guard_bb = loop_preheader_edge (prolog)->src;
2846           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2847           guard_to = split_edge (loop_preheader_edge (loop));
2848           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2849                                            guard_to, guard_bb,
2850                                            prob_prolog.invert (),
2851                                            irred_flag);
2852           e = EDGE_PRED (guard_to, 0);
2853           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2854           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2855
2856           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2857           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2858         }
2859
2860       /* Update init address of DRs.  */
2861       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2862       /* Update niters for vector loop.  */
2863       LOOP_VINFO_NITERS (loop_vinfo)
2864         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2865       LOOP_VINFO_NITERSM1 (loop_vinfo)
2866         = fold_build2 (MINUS_EXPR, type,
2867                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2868       bool new_var_p = false;
2869       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2870       /* It's guaranteed that vector loop bound before vectorization is at
2871          least VF, so set range information for newly generated var.  */
2872       if (new_var_p)
2873         set_range_info (niters, VR_RANGE,
2874                         wi::to_wide (build_int_cst (type, vf)),
2875                         wi::to_wide (TYPE_MAX_VALUE (type)));
2876
2877       /* Prolog iterates at most bound_prolog times, latch iterates at
2878          most bound_prolog - 1 times.  */
2879       record_niter_bound (prolog, bound_prolog - 1, false, true);
2880       delete_update_ssa ();
2881       adjust_vec_debug_stmts ();
2882       scev_reset ();
2883     }
2884
2885   if (epilog_peeling)
2886     {
2887       e = single_exit (loop);
2888       if (!slpeel_can_duplicate_loop_p (loop, e))
2889         {
2890           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2891                            "loop can't be duplicated to exit edge.\n");
2892           gcc_unreachable ();
2893         }
2894       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2895          said epilog then we should use a copy of the main loop as a starting
2896          point.  This loop may have already had some preliminary transformations
2897          to allow for more optimal vectorization, for example if-conversion.
2898          If we are not vectorizing the epilog then we should use the scalar loop
2899          as the transformations mentioned above make less or no sense when not
2900          vectorizing.  */
2901       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2902       if (vop_to_rename)
2903         {
2904           /* Vectorizing the main loop can sometimes introduce a vdef to
2905              a loop that previously didn't have one; see the comment above
2906              the definition of VOP_TO_RENAME for details.  The definition
2907              D that holds on E will then be different from the definition
2908              VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2909              rename VOP_TO_RENAME to D when copying the loop.
2910
2911              The virtual operand is in LCSSA form for the main loop,
2912              and no stmt between the main loop and E needs a vdef,
2913              so we know that D is provided by a phi rather than by a
2914              vdef on a normal gimple stmt.  */
2915           basic_block vdef_bb = e->src;
2916           gphi *vphi;
2917           while (!(vphi = get_virtual_phi (vdef_bb)))
2918             vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2919           gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2920           set_current_def (vop_to_rename, gimple_phi_result (vphi));
2921         }
2922       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2923       if (!epilog)
2924         {
2925           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2926                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2927           gcc_unreachable ();
2928         }
2929       epilog->force_vectorize = false;
2930       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2931
2932       /* Scalar version loop may be preferred.  In this case, add guard
2933          and skip to epilog.  Note this only happens when the number of
2934          iterations of loop is unknown at compile time, otherwise this
2935          won't be vectorized.  */
2936       if (skip_vector)
2937         {
2938           /* Additional epilogue iteration is peeled if gap exists.  */
2939           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2940                                                 bound_prolog, bound_epilog,
2941                                                 th, &bound_scalar,
2942                                                 check_profitability);
2943           /* Build guard against NITERSM1 since NITERS may overflow.  */
2944           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2945           guard_bb = anchor;
2946           guard_to = split_edge (loop_preheader_edge (epilog));
2947           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2948                                            guard_to, guard_bb,
2949                                            prob_vector.invert (),
2950                                            irred_flag);
2951           skip_e = guard_e;
2952           e = EDGE_PRED (guard_to, 0);
2953           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2954           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2955
2956           /* Simply propagate profile info from guard_bb to guard_to which is
2957              a merge point of control flow.  */
2958           guard_to->count = guard_bb->count;
2959
2960           /* Scale probability of epilog loop back.
2961              FIXME: We should avoid scaling down and back up.  Profile may
2962              get lost if we scale down to 0.  */
2963           basic_block *bbs = get_loop_body (epilog);
2964           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2965             bbs[i]->count = bbs[i]->count.apply_scale
2966                                  (bbs[i]->count,
2967                                   bbs[i]->count.apply_probability
2968                                     (prob_vector));
2969           free (bbs);
2970         }
2971
2972       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2973       /* If loop is peeled for non-zero constant times, now niters refers to
2974          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2975          overflows.  */
2976       niters_no_overflow |= (prolog_peeling > 0);
2977       vect_gen_vector_loop_niters (loop_vinfo, niters,
2978                                    niters_vector, step_vector,
2979                                    niters_no_overflow);
2980       if (!integer_onep (*step_vector))
2981         {
2982           /* On exit from the loop we will have an easy way of calcalating
2983              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2984              until then.  */
2985           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2986           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2987           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2988         }
2989       else
2990         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2991                                              &niters_vector_mult_vf);
2992       /* Update IVs of original loop as if they were advanced by
2993          niters_vector_mult_vf steps.  */
2994       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2995       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2996       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2997                                         update_e);
2998
2999       if (skip_epilog)
3000         {
3001           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3002                                     niters, niters_vector_mult_vf);
3003           guard_bb = single_exit (loop)->dest;
3004           guard_to = split_edge (single_exit (epilog));
3005           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3006                                            skip_vector ? anchor : guard_bb,
3007                                            prob_epilog.invert (),
3008                                            irred_flag);
3009           if (vect_epilogues)
3010             epilogue_vinfo->skip_this_loop_edge = guard_e;
3011           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3012                                               single_exit (epilog));
3013           /* Only need to handle basic block before epilog loop if it's not
3014              the guard_bb, which is the case when skip_vector is true.  */
3015           if (guard_bb != bb_before_epilog)
3016             {
3017               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3018
3019               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3020             }
3021           scale_loop_profile (epilog, prob_epilog, 0);
3022         }
3023       else
3024         slpeel_update_phi_nodes_for_lcssa (epilog);
3025
3026       unsigned HOST_WIDE_INT bound;
3027       if (bound_scalar.is_constant (&bound))
3028         {
3029           gcc_assert (bound != 0);
3030           /* -1 to convert loop iterations to latch iterations.  */
3031           record_niter_bound (epilog, bound - 1, false, true);
3032         }
3033
3034       delete_update_ssa ();
3035       adjust_vec_debug_stmts ();
3036       scev_reset ();
3037     }
3038
3039   if (vect_epilogues)
3040     {
3041       epilog->aux = epilogue_vinfo;
3042       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3043
3044       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3045
3046       /* We now must calculate the number of NITERS performed by the previous
3047          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3048       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3049                                  niters_prolog, niters_vector_mult_vf);
3050
3051       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3052          determine whether we are coming from the previous vectorized loop
3053          using the update_e edge or the skip_vector basic block using the
3054          skip_e edge.  */
3055       if (skip_vector)
3056         {
3057           gcc_assert (update_e != NULL
3058                       && skip_e != NULL
3059                       && !vect_epilogues_updated_niters);
3060           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3061                                            update_e->dest);
3062           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3063           gimple *stmt = gimple_build_assign (new_ssa, niters);
3064           gimple_stmt_iterator gsi;
3065           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3066               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3067             {
3068               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3069               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3070             }
3071           else
3072             {
3073               gsi = gsi_last_bb (update_e->src);
3074               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3075             }
3076
3077           niters = new_ssa;
3078           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3079           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3080                        UNKNOWN_LOCATION);
3081           niters = PHI_RESULT (new_phi);
3082           epilogue_vinfo->main_loop_edge = update_e;
3083           epilogue_vinfo->skip_main_loop_edge = skip_e;
3084         }
3085
3086       /* Set ADVANCE to the number of iterations performed by the previous
3087          loop and its prologue.  */
3088       *advance = niters;
3089
3090       if (!vect_epilogues_updated_niters)
3091         {
3092           /* Subtract the number of iterations performed by the vectorized loop
3093              from the number of total iterations.  */
3094           tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3095                                               before_loop_niters,
3096                                               niters);
3097
3098           LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3099           LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3100             = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3101                            epilogue_niters,
3102                            build_one_cst (TREE_TYPE (epilogue_niters)));
3103
3104           /* Decide what to do if the number of epilogue iterations is not
3105              a multiple of the epilogue loop's vectorization factor.
3106              We should have rejected the loop during the analysis phase
3107              if this fails.  */
3108           if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3109                                                            true))
3110             gcc_unreachable ();
3111         }
3112     }
3113
3114   adjust_vec.release ();
3115   free_original_copy_tables ();
3116
3117   return vect_epilogues ? epilog : NULL;
3118 }
3119
3120 /* Function vect_create_cond_for_niters_checks.
3121
3122    Create a conditional expression that represents the run-time checks for
3123    loop's niter.  The loop is guaranteed to terminate if the run-time
3124    checks hold.
3125
3126    Input:
3127    COND_EXPR  - input conditional expression.  New conditions will be chained
3128                 with logical AND operation.  If it is NULL, then the function
3129                 is used to return the number of alias checks.
3130    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3131                 to be checked.
3132
3133    Output:
3134    COND_EXPR - conditional expression.
3135
3136    The returned COND_EXPR is the conditional expression to be used in the
3137    if statement that controls which version of the loop gets executed at
3138    runtime.  */
3139
3140 static void
3141 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3142 {
3143   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3144
3145   if (*cond_expr)
3146     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3147                               *cond_expr, part_cond_expr);
3148   else
3149     *cond_expr = part_cond_expr;
3150 }
3151
3152 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3153    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3154
3155 static void
3156 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3157 {
3158   if (*cond_expr)
3159     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3160                               *cond_expr, part_cond_expr);
3161   else
3162     *cond_expr = part_cond_expr;
3163 }
3164
3165 /* Function vect_create_cond_for_align_checks.
3166
3167    Create a conditional expression that represents the alignment checks for
3168    all of data references (array element references) whose alignment must be
3169    checked at runtime.
3170
3171    Input:
3172    COND_EXPR  - input conditional expression.  New conditions will be chained
3173                 with logical AND operation.
3174    LOOP_VINFO - two fields of the loop information are used.
3175                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3176                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3177
3178    Output:
3179    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3180                          expression.
3181    The returned value is the conditional expression to be used in the if
3182    statement that controls which version of the loop gets executed at runtime.
3183
3184    The algorithm makes two assumptions:
3185      1) The number of bytes "n" in a vector is a power of 2.
3186      2) An address "a" is aligned if a%n is zero and that this
3187         test can be done as a&(n-1) == 0.  For example, for 16
3188         byte vectors the test is a&0xf == 0.  */
3189
3190 static void
3191 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3192                                    tree *cond_expr,
3193                                    gimple_seq *cond_expr_stmt_list)
3194 {
3195   const vec<stmt_vec_info> &may_misalign_stmts
3196     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3197   stmt_vec_info stmt_info;
3198   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3199   tree mask_cst;
3200   unsigned int i;
3201   tree int_ptrsize_type;
3202   char tmp_name[20];
3203   tree or_tmp_name = NULL_TREE;
3204   tree and_tmp_name;
3205   gimple *and_stmt;
3206   tree ptrsize_zero;
3207   tree part_cond_expr;
3208
3209   /* Check that mask is one less than a power of 2, i.e., mask is
3210      all zeros followed by all ones.  */
3211   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3212
3213   int_ptrsize_type = signed_type_for (ptr_type_node);
3214
3215   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3216      of the first vector of the i'th data reference. */
3217
3218   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3219     {
3220       gimple_seq new_stmt_list = NULL;
3221       tree addr_base;
3222       tree addr_tmp_name;
3223       tree new_or_tmp_name;
3224       gimple *addr_stmt, *or_stmt;
3225       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3226       bool negative = tree_int_cst_compare
3227         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3228       tree offset = negative
3229         ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3230
3231       /* create: addr_tmp = (int)(address_of_first_vector) */
3232       addr_base =
3233         vect_create_addr_base_for_vector_ref (loop_vinfo,
3234                                               stmt_info, &new_stmt_list,
3235                                               offset);
3236       if (new_stmt_list != NULL)
3237         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3238
3239       sprintf (tmp_name, "addr2int%d", i);
3240       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3241       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3242       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3243
3244       /* The addresses are OR together.  */
3245
3246       if (or_tmp_name != NULL_TREE)
3247         {
3248           /* create: or_tmp = or_tmp | addr_tmp */
3249           sprintf (tmp_name, "orptrs%d", i);
3250           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3251           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3252                                          or_tmp_name, addr_tmp_name);
3253           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3254           or_tmp_name = new_or_tmp_name;
3255         }
3256       else
3257         or_tmp_name = addr_tmp_name;
3258
3259     } /* end for i */
3260
3261   mask_cst = build_int_cst (int_ptrsize_type, mask);
3262
3263   /* create: and_tmp = or_tmp & mask  */
3264   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3265
3266   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3267                                   or_tmp_name, mask_cst);
3268   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3269
3270   /* Make and_tmp the left operand of the conditional test against zero.
3271      if and_tmp has a nonzero bit then some address is unaligned.  */
3272   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3273   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3274                                 and_tmp_name, ptrsize_zero);
3275   chain_cond_expr (cond_expr, part_cond_expr);
3276 }
3277
3278 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3279    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3280    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3281    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3282
3283 static void
3284 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3285 {
3286   const vec<vec_object_pair> &pairs
3287     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3288   unsigned int i;
3289   vec_object_pair *pair;
3290   FOR_EACH_VEC_ELT (pairs, i, pair)
3291     {
3292       tree addr1 = build_fold_addr_expr (pair->first);
3293       tree addr2 = build_fold_addr_expr (pair->second);
3294       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3295                                          addr1, addr2);
3296       chain_cond_expr (cond_expr, part_cond_expr);
3297     }
3298 }
3299
3300 /* Create an expression that is true when all lower-bound conditions for
3301    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3302
3303 static void
3304 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3305 {
3306   const vec<vec_lower_bound> &lower_bounds
3307     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3308   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3309     {
3310       tree expr = lower_bounds[i].expr;
3311       tree type = unsigned_type_for (TREE_TYPE (expr));
3312       expr = fold_convert (type, expr);
3313       poly_uint64 bound = lower_bounds[i].min_value;
3314       if (!lower_bounds[i].unsigned_p)
3315         {
3316           expr = fold_build2 (PLUS_EXPR, type, expr,
3317                               build_int_cstu (type, bound - 1));
3318           bound += bound - 1;
3319         }
3320       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3321                                          build_int_cstu (type, bound));
3322       chain_cond_expr (cond_expr, part_cond_expr);
3323     }
3324 }
3325
3326 /* Function vect_create_cond_for_alias_checks.
3327
3328    Create a conditional expression that represents the run-time checks for
3329    overlapping of address ranges represented by a list of data references
3330    relations passed as input.
3331
3332    Input:
3333    COND_EXPR  - input conditional expression.  New conditions will be chained
3334                 with logical AND operation.  If it is NULL, then the function
3335                 is used to return the number of alias checks.
3336    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3337                 to be checked.
3338
3339    Output:
3340    COND_EXPR - conditional expression.
3341
3342    The returned COND_EXPR is the conditional expression to be used in the if
3343    statement that controls which version of the loop gets executed at runtime.
3344 */
3345
3346 void
3347 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3348 {
3349   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3350     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3351
3352   if (comp_alias_ddrs.is_empty ())
3353     return;
3354
3355   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3356                                &comp_alias_ddrs, cond_expr);
3357   if (dump_enabled_p ())
3358     dump_printf_loc (MSG_NOTE, vect_location,
3359                      "created %u versioning for alias checks.\n",
3360                      comp_alias_ddrs.length ());
3361 }
3362
3363
3364 /* Function vect_loop_versioning.
3365
3366    If the loop has data references that may or may not be aligned or/and
3367    has data reference relations whose independence was not proven then
3368    two versions of the loop need to be generated, one which is vectorized
3369    and one which isn't.  A test is then generated to control which of the
3370    loops is executed.  The test checks for the alignment of all of the
3371    data references that may or may not be aligned.  An additional
3372    sequence of runtime tests is generated for each pairs of DDRs whose
3373    independence was not proven.  The vectorized version of loop is
3374    executed only if both alias and alignment tests are passed.
3375
3376    The test generated to check which version of loop is executed
3377    is modified to also check for profitability as indicated by the
3378    cost model threshold TH.
3379
3380    The versioning precondition(s) are placed in *COND_EXPR and
3381    *COND_EXPR_STMT_LIST.  */
3382
3383 class loop *
3384 vect_loop_versioning (loop_vec_info loop_vinfo,
3385                       gimple *loop_vectorized_call)
3386 {
3387   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3388   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3389   basic_block condition_bb;
3390   gphi_iterator gsi;
3391   gimple_stmt_iterator cond_exp_gsi;
3392   basic_block merge_bb;
3393   basic_block new_exit_bb;
3394   edge new_exit_e, e;
3395   gphi *orig_phi, *new_phi;
3396   tree cond_expr = NULL_TREE;
3397   gimple_seq cond_expr_stmt_list = NULL;
3398   tree arg;
3399   profile_probability prob = profile_probability::likely ();
3400   gimple_seq gimplify_stmt_list = NULL;
3401   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3402   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3403   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3404   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3405   poly_uint64 versioning_threshold
3406     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3407   tree version_simd_if_cond
3408     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3409   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3410
3411   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3412       && !ordered_p (th, versioning_threshold))
3413     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3414                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3415                                             th - 1));
3416   if (maybe_ne (versioning_threshold, 0U))
3417     {
3418       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3419                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3420                                               versioning_threshold - 1));
3421       if (cond_expr)
3422         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3423                                  expr, cond_expr);
3424       else
3425         cond_expr = expr;
3426     }
3427
3428   if (version_niter)
3429     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3430
3431   if (cond_expr)
3432     cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3433                                         &cond_expr_stmt_list,
3434                                         is_gimple_condexpr, NULL_TREE);
3435
3436   if (version_align)
3437     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3438                                        &cond_expr_stmt_list);
3439
3440   if (version_alias)
3441     {
3442       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3443       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3444       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3445     }
3446
3447   if (version_simd_if_cond)
3448     {
3449       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3450       if (flag_checking)
3451         if (basic_block bb
3452             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3453           gcc_assert (bb != loop->header
3454                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3455                       && (scalar_loop == NULL
3456                           || (bb != scalar_loop->header
3457                               && dominated_by_p (CDI_DOMINATORS,
3458                                                  scalar_loop->header, bb))));
3459       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3460       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3461                             version_simd_if_cond, zero);
3462       if (cond_expr)
3463         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3464                                  c, cond_expr);
3465       else
3466         cond_expr = c;
3467       if (dump_enabled_p ())
3468         dump_printf_loc (MSG_NOTE, vect_location,
3469                          "created versioning for simd if condition check.\n");
3470     }
3471
3472   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3473                                       &gimplify_stmt_list,
3474                                       is_gimple_condexpr, NULL_TREE);
3475   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3476
3477   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3478      invariant in.  */
3479   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3480   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3481        !gsi_end_p (gsi); gsi_next (&gsi))
3482     {
3483       gimple *stmt = gsi_stmt (gsi);
3484       update_stmt (stmt);
3485       ssa_op_iter iter;
3486       use_operand_p use_p;
3487       basic_block def_bb;
3488       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3489         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3490             && flow_bb_inside_loop_p (outermost, def_bb))
3491           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3492     }
3493
3494   /* Search for the outermost loop we can version.  Avoid versioning of
3495      non-perfect nests but allow if-conversion versioned loops inside.  */
3496   class loop *loop_to_version = loop;
3497   if (flow_loop_nested_p (outermost, loop))
3498     {
3499       if (dump_enabled_p ())
3500         dump_printf_loc (MSG_NOTE, vect_location,
3501                          "trying to apply versioning to outer loop %d\n",
3502                          outermost->num);
3503       if (outermost->num == 0)
3504         outermost = superloop_at_depth (loop, 1);
3505       /* And avoid applying versioning on non-perfect nests.  */
3506       while (loop_to_version != outermost
3507              && single_exit (loop_outer (loop_to_version))
3508              && (!loop_outer (loop_to_version)->inner->next
3509                  || vect_loop_vectorized_call (loop_to_version))
3510              && (!loop_outer (loop_to_version)->inner->next
3511                  || !loop_outer (loop_to_version)->inner->next->next))
3512         loop_to_version = loop_outer (loop_to_version);
3513     }
3514
3515   /* Apply versioning.  If there is already a scalar version created by
3516      if-conversion re-use that.  Note we cannot re-use the copy of
3517      an if-converted outer-loop when vectorizing the inner loop only.  */
3518   gcond *cond;
3519   if ((!loop_to_version->inner || loop == loop_to_version)
3520       && loop_vectorized_call)
3521     {
3522       gcc_assert (scalar_loop);
3523       condition_bb = gimple_bb (loop_vectorized_call);
3524       cond = as_a <gcond *> (last_stmt (condition_bb));
3525       gimple_cond_set_condition_from_tree (cond, cond_expr);
3526       update_stmt (cond);
3527
3528       if (cond_expr_stmt_list)
3529         {
3530           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3531           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3532                                  GSI_SAME_STMT);
3533         }
3534
3535       /* if-conversion uses profile_probability::always () for both paths,
3536          reset the paths probabilities appropriately.  */
3537       edge te, fe;
3538       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3539       te->probability = prob;
3540       fe->probability = prob.invert ();
3541       /* We can scale loops counts immediately but have to postpone
3542          scaling the scalar loop because we re-use it during peeling.  */
3543       scale_loop_frequencies (loop_to_version, te->probability);
3544       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3545
3546       nloop = scalar_loop;
3547       if (dump_enabled_p ())
3548         dump_printf_loc (MSG_NOTE, vect_location,
3549                          "reusing %sloop version created by if conversion\n",
3550                          loop_to_version != loop ? "outer " : "");
3551     }
3552   else
3553     {
3554       if (loop_to_version != loop
3555           && dump_enabled_p ())
3556         dump_printf_loc (MSG_NOTE, vect_location,
3557                          "applying loop versioning to outer loop %d\n",
3558                          loop_to_version->num);
3559
3560       initialize_original_copy_tables ();
3561       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3562                             prob, prob.invert (), prob, prob.invert (), true);
3563       gcc_assert (nloop);
3564       nloop = get_loop_copy (loop);
3565
3566       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3567          reap those otherwise;  they also refer to the original
3568          loops.  */
3569       class loop *l = loop;
3570       while (gimple *call = vect_loop_vectorized_call (l))
3571         {
3572           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3573           fold_loop_internal_call (call, boolean_false_node);
3574           l = loop_outer (l);
3575         }
3576       free_original_copy_tables ();
3577
3578       if (cond_expr_stmt_list)
3579         {
3580           cond_exp_gsi = gsi_last_bb (condition_bb);
3581           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3582                                  GSI_SAME_STMT);
3583         }
3584
3585       /* Loop versioning violates an assumption we try to maintain during
3586          vectorization - that the loop exit block has a single predecessor.
3587          After versioning, the exit block of both loop versions is the same
3588          basic block (i.e. it has two predecessors). Just in order to simplify
3589          following transformations in the vectorizer, we fix this situation
3590          here by adding a new (empty) block on the exit-edge of the loop,
3591          with the proper loop-exit phis to maintain loop-closed-form.
3592          If loop versioning wasn't done from loop, but scalar_loop instead,
3593          merge_bb will have already just a single successor.  */
3594
3595       merge_bb = single_exit (loop_to_version)->dest;
3596       if (EDGE_COUNT (merge_bb->preds) >= 2)
3597         {
3598           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3599           new_exit_bb = split_edge (single_exit (loop_to_version));
3600           new_exit_e = single_exit (loop_to_version);
3601           e = EDGE_SUCC (new_exit_bb, 0);
3602
3603           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3604                gsi_next (&gsi))
3605             {
3606               tree new_res;
3607               orig_phi = gsi.phi ();
3608               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3609               new_phi = create_phi_node (new_res, new_exit_bb);
3610               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3611               add_phi_arg (new_phi, arg, new_exit_e,
3612                            gimple_phi_arg_location_from_edge (orig_phi, e));
3613               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3614             }
3615         }
3616
3617       update_ssa (TODO_update_ssa);
3618     }
3619
3620   if (version_niter)
3621     {
3622       /* The versioned loop could be infinite, we need to clear existing
3623          niter information which is copied from the original loop.  */
3624       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3625       vect_free_loop_info_assumptions (nloop);
3626     }
3627
3628   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3629       && dump_enabled_p ())
3630     {
3631       if (version_alias)
3632         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3633                          vect_location,
3634                          "loop versioned for vectorization because of "
3635                          "possible aliasing\n");
3636       if (version_align)
3637         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3638                          vect_location,
3639                          "loop versioned for vectorization to enhance "
3640                          "alignment\n");
3641
3642     }
3643
3644   return nloop;
3645 }