gcc/tree-vect-loop-manip.c (official-gcc.git)
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260 the mask should have during the first iteration and NEXT_MASK is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265 tree next_mask)
267 gphi *phi = create_phi_node (mask, loop->header);
268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
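/* As an illustration of the selector built below: with 8-element vectors,
   OFFSET 0 gives the low-interleave permutation { 0, 8, 1, 9, 2, 10, 3, 11 },
   while OFFSET 1 gives the high-interleave permutation
   { 4, 12, 5, 13, 6, 14, 7, 15 }.  */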
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324 rgroup_masks *src_rgm)
326 tree src_masktype = src_rgm->mask_type;
327 tree dest_masktype = dest_rgm->mask_type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
343 tree src = src_rgm->masks[i / 2];
344 tree dest = dest_rgm->masks[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
376 tree src = src_rgm->masks[i / 2];
377 tree dest = dest_rgm->masks[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
387 /* Helper for vect_set_loop_condition_masked. Generate definitions for
388 all the masks in RGM and return a mask that is nonzero when the loop
389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
390 Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGM belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGM->max_nscalars_per_iter
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGM->max_nscalars_per_iter
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGM. */
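/* As a concrete illustration of the quantities involved: with VF == 8,
   RGM->max_nscalars_per_iter == 2 and NITERS == 21, the rgroup handles
   42 scalar values in total and 16 per vector iteration, so the IV created
   below counts 0, 16, 32, ... and the masks for the final iteration cover
   only the remaining 10 values.  */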
416 static tree
417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_masks *rgm, tree niters, tree niters_skip,
421 bool might_wrap_p)
423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425 tree mask_type = rgm->mask_type;
426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
439 /* We checked before choosing to use a fully-masked loop that these
440 multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
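/* For instance, if NSCALARS_TOTAL == 10, NSCALARS_STEP == 16 and
   NSCALARS_SKIP == 3, then the adjustment is 13 and the MAX/MINUS pair
   below computes MAX (10, 13) - 13 == 0, saturating at zero instead of
   letting the unsigned subtraction wrap around.  */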
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
524 else
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
535 first_limit = test_limit;
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
544 /* Provide a definition of each mask in the group. */
545 tree next_mask = NULL_TREE;
546 tree mask;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
550 /* Previous masks will cover BIAS scalars. This mask covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_mask * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
556 /* See whether the first iteration of the vector loop is known
557 to have a full mask. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565 and adjust the bound down by BIAS. */
566 tree this_test_limit = test_limit;
567 if (i != 0)
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
577 /* Create the initial mask. First include all scalars that
578 are within the loop limit. */
579 tree init_mask = NULL_TREE;
580 if (!first_iteration_full)
582 tree start, end;
583 if (first_limit == test_limit)
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
592 else
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this mask. */
597 start = bias_tree;
598 end = first_limit;
601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_mask, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614 bias_tree, nscalars_skip);
615 if (init_mask)
616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617 init_mask, unskipped_mask);
618 else
619 init_mask = unskipped_mask;
622 if (!init_mask)
623 /* First iteration is full. */
624 init_mask = build_minus_one_cst (mask_type);
626 /* Get the mask value for the next iteration of the loop. */
627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
631 vect_set_loop_mask (loop, mask, init_mask, next_mask);
633 return next_mask;
636 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638 number of iterations of the original scalar loop that should be
639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640 as for vect_set_loop_condition.
642 Insert the branch-back condition before LOOP_COND_GSI and return the
643 final gcond. */
645 static gcond *
646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647 tree niters, tree final_iv,
648 bool niters_maybe_zero,
649 gimple_stmt_iterator loop_cond_gsi)
651 gimple_seq preheader_seq = NULL;
652 gimple_seq header_seq = NULL;
654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655 unsigned int compare_precision = TYPE_PRECISION (compare_type);
656 tree orig_niters = niters;
658 /* Type of the initial value of NITERS. */
659 tree ni_actual_type = TREE_TYPE (niters);
660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
663 /* Convert NITERS to the same size as the compare. */
664 if (compare_precision > ni_actual_precision
665 && niters_maybe_zero)
667 /* We know that there is always at least one iteration, so if the
668 count is zero then it must have wrapped. Cope with this by
669 subtracting 1 before the conversion and adding 1 to the result. */
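/* For example, if the original counter is an unsigned 8-bit value and
   NITERS reads as 0, the loop actually runs 256 times: 0 - 1 == 255 in
   8 bits, widening preserves 255, and adding 1 back yields 256 in the
   wider COMPARE_TYPE.  */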
670 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672 niters, build_minus_one_cst (ni_actual_type));
673 niters = gimple_convert (&preheader_seq, compare_type, niters);
674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675 niters, build_one_cst (compare_type));
677 else
678 niters = gimple_convert (&preheader_seq, compare_type, niters);
680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
682 /* Iterate over all the rgroups and fill in their masks. We could use
683 the first mask from any rgroup for the loop condition; here we
684 arbitrarily pick the last. */
685 tree test_mask = NULL_TREE;
686 rgroup_masks *rgm;
687 unsigned int i;
688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689 FOR_EACH_VEC_ELT (*masks, i, rgm)
690 if (!rgm->masks.is_empty ())
692 /* First try using permutes. This adds a single vector
693 instruction to the loop for each mask, but needs no extra
694 loop invariants or IVs. */
695 unsigned int nmasks = i + 1;
696 if ((nmasks & 1) == 0)
698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699 if (!half_rgm->masks.is_empty ()
700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701 continue;
704 /* See whether zero-based IV would ever generate all-false masks
705 before wrapping around. */
706 bool might_wrap_p
707 = (iv_limit == -1
708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709 UNSIGNED)
710 > compare_precision));
712 /* Set up all masks for this group. */
713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714 &preheader_seq,
715 loop_cond_gsi, rgm,
716 niters, niters_skip,
717 might_wrap_p);
720 /* Emit all accumulated statements. */
721 add_preheader_seq (loop, preheader_seq);
722 add_header_seq (loop, header_seq);
724 /* Get a boolean result that tells us whether to iterate. */
725 edge exit_edge = single_exit (loop);
726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729 NULL_TREE, NULL_TREE);
730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
732 /* The loop iterates (NITERS - 1) / VF + 1 times.
733 Subtract one from this to get the latch count. */
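/* E.g. for NITERS == 10 and VF == 4 the vector loop runs
   (10 - 1) / 4 + 1 == 3 times and the latch count recorded below is
   (10 - 1) / 4 == 2.  */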
734 tree step = build_int_cst (compare_type,
735 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737 build_minus_one_cst (compare_type));
738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739 niters_minus_one, step);
741 if (final_iv)
743 gassign *assign = gimple_build_assign (final_iv, orig_niters);
744 gsi_insert_on_edge_immediate (single_exit (loop), assign);
747 return cond_stmt;
750 /* Like vect_set_loop_condition, but handle the case in which there
751 are no loop masks. */
753 static gcond *
754 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755 tree step, tree final_iv,
756 bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
776 /* In this case we can use a simple 0-based IV:
779 x = 0;
783 x += 1;
785 while (x < NITERS); */
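/* E.g. for NITERS == 3 the IV is tested at the values 1, 2 and 3, so the
   loop body executes exactly 3 times.  */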
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
790 else
792 /* The following works for all values of NITERS except 0:
795 x = 0;
799 x += STEP;
801 while (x <= NITERS - STEP);
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
817 x = -1;
821 x += STEP;
823 while (x < NITERS - STEP);
825 In both cases the loop limit is NITERS - STEP. */
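/* For example, in case B with NITERS == 8 and STEP == 4 the limit is 4;
   the IV is tested at the values 4 and 8, so the loop body runs
   (8 - 4) / 4 + 1 == 2 times.  */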
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
834 if (niters_maybe_zero)
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
840 else
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
874 if (final_iv)
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
881 return cond_stmt;
884 /* If we're using fully-masked loops, make LOOP iterate:
886 N == (NITERS - 1) / STEP + 1
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
892 If we're not using fully-masked loops, make LOOP iterate:
894 N == (NITERS - STEP) / STEP + 1
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
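/* For instance, a fully-masked loop with NITERS == 10 and STEP == 4
   iterates (10 - 1) / 4 + 1 == 3 times, whereas an unmasked loop with
   NITERS == 8 and STEP == 4 iterates (8 - 4) / 4 + 1 == 2 times.  */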
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
924 /* Remove old loop exit test. */
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
937 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
938 For all PHI arguments in FROM->dest and TO->dest from those
 939      edges, ensure that the TO->dest PHI arguments have their current_def
 940      set to the corresponding current_def in FROM.  */
942 static void
943 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
945 gimple_stmt_iterator gsi_from, gsi_to;
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
957 gsi_next (&gsi_from);
958 continue;
960 if (virtual_operand_p (to_arg))
962 gsi_next (&gsi_to);
963 continue;
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
970 if (get_current_def (to_arg) == NULL_TREE)
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
990 /* Given LOOP this function generates a new copy of it and puts it
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
996 class loop *
997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
1000 class loop *new_loop;
1001 basic_block *new_bbs, *bbs, *pbbs;
1002 bool at_exit;
1003 bool was_imm_dom;
1004 basic_block exit_dest;
1005 edge exit, new_exit;
1006 bool duplicate_outer_loop = false;
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
1022 /* Check whether duplication is possible. */
1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1025 free (bbs);
1026 return NULL;
1029 /* Generate new loop structure. */
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
1033 exit_dest = exit->dest;
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
1036 true : false);
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042 edge entry_e = single_pred_edge (preheader);
1043 bbs[0] = preheader;
1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048 &exit, 1, &new_exit, NULL,
1049 at_exit ? loop->latch : e->src, true);
1050 exit = single_exit (loop);
1051 basic_block new_preheader = new_bbs[0];
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1055 if (scalar_loop != loop)
1057 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1058 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1059 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1060 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1061 header) to have current_def set, so copy them over. */
1062 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1063 exit);
1064 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1066 EDGE_SUCC (loop->latch, 0));
1069 if (at_exit) /* Add the loop copy at exit. */
1071 if (scalar_loop != loop)
1073 gphi_iterator gsi;
1074 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1076 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1077 gsi_next (&gsi))
1079 gphi *phi = gsi.phi ();
1080 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1081 location_t orig_locus
1082 = gimple_phi_arg_location_from_edge (phi, e);
1084 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1087 redirect_edge_and_branch_force (e, new_preheader);
1088 flush_pending_stmts (e);
1089 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1090 if (was_imm_dom || duplicate_outer_loop)
1091 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1093 /* And remove the non-necessary forwarder again. Keep the other
1094 one so we have a proper pre-header for the loop at the exit edge. */
1095 redirect_edge_pred (single_succ_edge (preheader),
1096 single_pred (preheader));
1097 delete_basic_block (preheader);
1098 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1099 loop_preheader_edge (scalar_loop)->src);
1101 else /* Add the copy at entry. */
1103 if (scalar_loop != loop)
1105 /* Remove the non-necessary forwarder of scalar_loop again. */
1106 redirect_edge_pred (single_succ_edge (preheader),
1107 single_pred (preheader));
1108 delete_basic_block (preheader);
1109 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1110 loop_preheader_edge (scalar_loop)->src);
1111 preheader = split_edge (loop_preheader_edge (loop));
1112 entry_e = single_pred_edge (preheader);
1115 redirect_edge_and_branch_force (entry_e, new_preheader);
1116 flush_pending_stmts (entry_e);
1117 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1119 redirect_edge_and_branch_force (new_exit, preheader);
1120 flush_pending_stmts (new_exit);
1121 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1123 /* And remove the non-necessary forwarder again. Keep the other
1124 one so we have a proper pre-header for the loop at the exit edge. */
1125 redirect_edge_pred (single_succ_edge (new_preheader),
1126 single_pred (new_preheader));
1127 delete_basic_block (new_preheader);
1128 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1129 loop_preheader_edge (new_loop)->src);
1132 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1133 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1134 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1136 if (scalar_loop != loop)
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
1140 gphi_iterator gsi_orig, gsi_new;
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1159 free (new_bbs);
1160 free (bbs);
1162 checking_verify_dominators (CDI_DOMINATORS);
1164 return new_loop;
1168 /* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
1174 static edge
1175 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176 basic_block guard_to, basic_block dom_bb,
1177 profile_probability probability, bool irreducible_p)
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
1181 gcond *cond_stmt;
1182 gimple_seq gimplify_stmt_list = NULL;
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
1191 if (gimplify_stmt_list)
1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1201 new_e->probability = probability;
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1205 enter_e->probability = probability.invert ();
1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1212 return new_e;
1216 /* This function verifies that the following restrictions apply to LOOP:
1217    (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1218        for an innermost loop, or of 5 basic blocks for an outer loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
1224 bool
1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
1229 gcond *orig_cond = get_loop_exit_condition (loop);
1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231 unsigned int num_bb = loop->inner? 5 : 2;
1233 /* All loops have an outer scope; the only case loop->outer is NULL is for
1234 the function itself. */
1235 if (!loop_outer (loop)
1236 || loop->num_nodes != num_bb
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1244 return true;
1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251    in the same form).  Doing this early simplifies checking which
1252    uses should be renamed.
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
1257 static tree
1258 create_lcssa_for_virtual_phi (class loop *loop)
1260 gphi_iterator gsi;
1261 edge exit_e = single_exit (loop);
1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1266 gphi *phi = gsi.phi ();
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270 break;
1271 if (gsi_end_p (gsi))
1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
1277 gimple *stmt;
1278 use_operand_p use_p;
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1292 break;
1294 return NULL_TREE;
1297 /* Function vect_get_loop_location.
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302    Return the loop location on success and NULL if not.  */
1304 dump_user_location_t
1305 find_loop_location (class loop *loop)
1307 gimple *stmt = NULL;
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1311 if (!loop)
1312 return dump_user_location_t ();
1314 stmt = get_loop_exit_condition (loop);
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318 return stmt;
1320 /* If we got here the loop is probably not "well formed",
1321 try to estimate the loop location */
1323 if (!loop->header)
1324 return dump_user_location_t ();
1326 bb = loop->header;
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1330 stmt = gsi_stmt (si);
1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332 return stmt;
1335 return dump_user_location_t ();
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1352 return true;
1355 /* Function vect_can_advance_ivs_p
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1369 gphi_iterator gsi;
1371 /* Analyze phi functions of the loop header. */
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1377 tree evolution_part;
1379 gphi *phi = gsi.phi ();
1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi_info))
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1397 /* Analyze the evolution function. */
1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400 if (evolution_part == NULL_TREE)
1402 if (dump_enabled_p ())
1403 dump_printf (MSG_MISSED_OPTIMIZATION,
1404 "No access function or evolution.\n");
1405 return false;
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 which evolution functions are not invariants in the loop. */
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1419 /* FORNOW: We do not transform initial conditions of IVs
1420 which evolution functions are a polynomial of degree >= 2. */
1422 if (tree_is_chrec (evolution_part))
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1431 return true;
1435 /* Function vect_update_ivs_after_vectorizer.
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451 vectorized). i.e, the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473      needs to have its phis updated.  */
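/* For example, an IV with initial value INIT and step S has the value
   INIT + NITERS * S after NITERS scalar iterations, so with INIT == 3,
   S == 2 and NITERS == 10 the epilog loop must start from the value 23.  */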
1476 static void
1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
1480 gphi_iterator gsi, gsi1;
1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482 basic_block update_bb = update_e->dest;
1483 basic_block exit_bb = single_exit (loop)->dest;
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1493 tree init_expr;
1494 tree step_expr, off;
1495 tree type;
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1506 /* Skip reduction and virtual phis. */
1507 if (!iv_phi_p (phi_info))
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
1512 continue;
1515 type = TREE_TYPE (gimple_phi_result (phi));
1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517 step_expr = unshare_expr (step_expr);
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
1521 gcc_assert (!tree_is_chrec (step_expr));
1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
1528 if (POINTER_TYPE_P (type))
1529 ni = fold_build_pointer_plus (init_expr, off);
1530 else
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
1534 var = create_tmp_var (type, "tmp");
1536 last_gsi = gsi_last_bb (exit_bb);
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1545 /* Fix phi expressions in the successor bb. */
1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1550 /* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
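/* For example, with a 16-byte target alignment and 4-byte elements, an
   address ending in 0x8 gives misalign_in_bytes == (addr & 15) == 8 and
   therefore a misalignment of 8 >> 2 == 2 vector elements.  */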
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559 stmt_vec_info stmt_info = dr_info->stmt;
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
1571 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1572 stmt_info, seq,
1573 offset);
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1575 if (target_align.is_constant (&target_align_c))
1576 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1577 else
1579 tree vla = build_int_cst (type, target_align);
1580 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1581 fold_build2 (MINUS_EXPR, type,
1582 build_int_cst (type, 0), vla));
1583 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1584 build_int_cst (type, 1));
1587 HOST_WIDE_INT elem_size
1588 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1589 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1591 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1592 tree int_start_addr = fold_convert (type, start_addr);
1593 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1594 target_align_minus_1);
1596 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1597 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1598 elem_size_log);
1600 return misalign_in_elems;
1603 /* Function vect_gen_prolog_loop_niters
1605 Generate the number of iterations which should be peeled as prolog for the
1606 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1607 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1608 As a result, after the execution of this loop, the data reference DR will
1609 refer to an aligned location. The following computation is generated:
1611 If the misalignment of DR is known at compile time:
1612 addr_mis = int mis = DR_MISALIGNMENT (dr);
1613 Else, compute address misalignment in bytes:
1614 addr_mis = addr & (target_align - 1)
1616 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1618 (elem_size = element type size; an element is the scalar element whose type
1619 is the inner type of the vectype)
1621 The computations will be emitted at the end of BB. We also compute and
1622 store upper bound (included) of the result in BOUND.
1624 When the step of the data-ref in the loop is not 1 (as in interleaved data
1625 and SLP), the number of iterations of the prolog must be divided by the step
1626 (which is equal to the size of interleaved group).
1628 The above formulas assume that VF == number of elements in the vector. This
1629 may not hold when there are multiple-types in the loop.
1630 In this case, for some data-references in the loop the VF does not represent
1631 the number of elements that fit in the vector. Therefore, instead of VF we
1632 use TYPE_VECTOR_SUBPARTS. */
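/* For example, with 4-byte elements, a 16-byte target alignment (so 4
   elements per vector), step 1 and an address that is 8 bytes past an
   alignment boundary: addr_mis is 2 elements, so the prolog must peel
   ((4 - 2) & 3) / 1 == 2 scalar iterations before DR becomes aligned.  */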
1634 static tree
1635 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1636 basic_block bb, int *bound)
1638 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1639 tree var;
1640 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1641 gimple_seq stmts = NULL, new_stmts = NULL;
1642 tree iters, iters_name;
1643 stmt_vec_info stmt_info = dr_info->stmt;
1644 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1645 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1647 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1649 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "known peeling = %d.\n", npeel);
1655 iters = build_int_cst (niters_type, npeel);
1656 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1658 else
1660 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1661 tree type = TREE_TYPE (misalign_in_elems);
1662 HOST_WIDE_INT elem_size
1663 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1664 /* We only do prolog peeling if the target alignment is known at compile
1665 time. */
1666 poly_uint64 align_in_elems =
1667 exact_div (target_align, elem_size);
1668 tree align_in_elems_minus_1 =
1669 build_int_cst (type, align_in_elems - 1);
1670 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1672 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1673 & (align_in_elems - 1)). */
1674 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1675 size_zero_node) < 0;
1676 if (negative)
1677 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1678 align_in_elems_tree);
1679 else
1680 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1681 misalign_in_elems);
1682 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1683 iters = fold_convert (niters_type, iters);
1684 unsigned HOST_WIDE_INT align_in_elems_c;
1685 if (align_in_elems.is_constant (&align_in_elems_c))
1686 *bound = align_in_elems_c - 1;
1687 else
1688 *bound = -1;
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "niters for prolog loop: %T\n", iters);
1695 var = create_tmp_var (niters_type, "prolog_loop_niters");
1696 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1698 if (new_stmts)
1699 gimple_seq_add_seq (&stmts, new_stmts);
1700 if (stmts)
1702 gcc_assert (single_succ_p (bb));
1703 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1704 if (gsi_end_p (gsi))
1705 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1706 else
1707 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1709 return iters_name;
1713 /* Function vect_update_init_of_dr
1715 If CODE is PLUS, the vector loop starts NITERS iterations after the
1716 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1717 iterations before the scalar one (using masking to skip inactive
1718 elements). This function updates the information recorded in DR to
1719 account for the difference. Specifically, it updates the OFFSET
1720 field of DR_INFO. */
1722 static void
1723 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1725 struct data_reference *dr = dr_info->dr;
1726 tree offset = dr_info->offset;
1727 if (!offset)
1728 offset = build_zero_cst (sizetype);
1730 niters = fold_build2 (MULT_EXPR, sizetype,
1731 fold_convert (sizetype, niters),
1732 fold_convert (sizetype, DR_STEP (dr)));
1733 offset = fold_build2 (code, sizetype,
1734 fold_convert (sizetype, offset), niters);
1735 dr_info->offset = offset;
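/* Illustrative sketch (not part of GCC): what the OFFSET update above amounts
   to in plain arithmetic.  Names are hypothetical; NITERS and STEP stand for
   the peeled iteration count and DR_STEP, and PLUS_P selects whether the
   vector loop starts after (PLUS_EXPR) or before (MINUS_EXPR) the scalar
   one.  */

static long
updated_dr_offset_sketch (long offset, long niters, long step, int plus_p)
{
  long delta = niters * step;
  return plus_p ? offset + delta : offset - delta;
}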
1739 /* Function vect_update_inits_of_drs
1741 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1742 CODE and NITERS are as for vect_update_init_of_dr. */
1744 void
1745 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1746 tree_code code)
1748 unsigned int i;
1749 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1750 struct data_reference *dr;
1752 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1754 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1755 here, but since we might use these niters to update the epilogues niters
1756 and data references we can't insert them here as this definition might not
1757 always dominate its uses. */
1758 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1759 niters = fold_convert (sizetype, niters);
1761 FOR_EACH_VEC_ELT (datarefs, i, dr)
1763 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1764 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1765 vect_update_init_of_dr (dr_info, niters, code);
1769 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770 by masking. This involves calculating the number of iterations to
1771 be peeled and then aligning all memory references appropriately. */
1773 void
1774 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1776 tree misalign_in_elems;
1777 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1779 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1781 /* From the information recorded in LOOP_VINFO get the number of iterations
1782 that need to be skipped via masking. */
1783 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1785 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787 misalign_in_elems = build_int_cst (type, misalign);
1789 else
1791 gimple_seq seq1 = NULL, seq2 = NULL;
1792 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793 misalign_in_elems = fold_convert (type, misalign_in_elems);
1794 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795 &seq2, true, NULL_TREE);
1796 gimple_seq_add_seq (&seq1, seq2);
1797 if (seq1)
1799 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801 gcc_assert (!new_bb);
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "misalignment for fully-masked loop: %T\n",
1808 misalign_in_elems);
1810 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1812 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
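/* Illustrative sketch (not part of GCC): for fully-masked peeling with a
   known peeling amount, the number of inactive leading elements is simply
   VF - PEELING_FOR_ALIGNMENT.  E.g. with VF == 8 and a peeling amount of 3,
   the first 8 - 3 = 5 lanes of the first vector iteration are masked off.
   The name and values are hypothetical.  */

static unsigned
masked_skip_niters_sketch (unsigned vf, unsigned peeling_for_alignment)
{
  return vf - peeling_for_alignment;
}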
1815 /* This function builds ni_name = number of iterations. Statements
1816 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1817 it to TRUE if new ssa_var is generated. */
1819 tree
1820 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1822 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1823 if (TREE_CODE (ni) == INTEGER_CST)
1824 return ni;
1825 else
1827 tree ni_name, var;
1828 gimple_seq stmts = NULL;
1829 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1831 var = create_tmp_var (TREE_TYPE (ni), "niters");
1832 ni_name = force_gimple_operand (ni, &stmts, false, var);
1833 if (stmts)
1835 gsi_insert_seq_on_edge_immediate (pe, stmts);
1836 if (new_var_p != NULL)
1837 *new_var_p = true;
1840 return ni_name;
1844 /* Calculate the number of iterations above which the vectorized loop will be
1845 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1846 of prolog loop. If it's integer const, the integer number is also passed
1847 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1848 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1849 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1850 threshold below which the scalar (rather than vectorized) loop will be
1851 executed. This function stores the upper bound (inclusive) of the result
1852 in BOUND_SCALAR. */
1854 static tree
1855 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856 int bound_prolog, poly_int64 bound_epilog, int th,
1857 poly_uint64 *bound_scalar,
1858 bool check_profitability)
1860 tree type = TREE_TYPE (niters_prolog);
1861 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862 build_int_cst (type, bound_epilog));
1864 *bound_scalar = bound_prolog + bound_epilog;
1865 if (check_profitability)
1867 /* TH indicates the minimum niters of vectorized loop, while we
1868 compute the maximum niters of scalar loop. */
1869 th--;
1870 /* Peeling a constant number of times. */
1871 if (int_niters_prolog >= 0)
1873 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1874 return build_int_cst (type, *bound_scalar);
1876 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1877 and BOUND_EPILOG are inclusive upper bounds. */
1878 if (known_ge (th, bound_prolog + bound_epilog))
1880 *bound_scalar = th;
1881 return build_int_cst (type, th);
1883 /* Need to do runtime comparison. */
1884 else if (maybe_gt (th, bound_epilog))
1886 *bound_scalar = upper_bound (*bound_scalar, th);
1887 return fold_build2 (MAX_EXPR, type,
1888 build_int_cst (type, th), niters);
1891 return niters;
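/* Illustrative sketch (not part of GCC): the profitability clamp above for
   the case where the prolog peels a known constant number of iterations.
   TH is the threshold after the decrement above, i.e. the largest iteration
   count for which the scalar loop is still preferred.  Names and values are
   hypothetical.  */

static unsigned
scalar_niters_bound_sketch (unsigned niters_prolog, unsigned bound_epilog,
			    unsigned th)
{
  unsigned bound = niters_prolog + bound_epilog;
  return bound > th ? bound : th;	/* upper_bound (bound, th)  */
}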
1894 /* NITERS is the number of times that the original scalar loop executes
1895 after peeling. Work out the maximum number of iterations N that can
1896 be handled by the vectorized form of the loop and then either:
1898 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1900 niters_vector = N
1902 b) set *STEP_VECTOR_PTR to one and generate:
1904 niters_vector = N / vf
1906 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1908 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1910 void
1911 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912 tree *niters_vector_ptr, tree *step_vector_ptr,
1913 bool niters_no_overflow)
1915 tree ni_minus_gap, var;
1916 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1918 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919 tree log_vf = NULL_TREE;
1921 /* If epilogue loop is required because of data accesses with gaps, we
1922 subtract one iteration from the total number of iterations here for
1923 correct calculation of RATIO. */
1924 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1926 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927 build_one_cst (type));
1928 if (!is_gimple_val (ni_minus_gap))
1930 var = create_tmp_var (type, "ni_gap");
1931 gimple *stmts = NULL;
1932 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1933 true, var);
1934 gsi_insert_seq_on_edge_immediate (pe, stmts);
1937 else
1938 ni_minus_gap = niters;
1940 unsigned HOST_WIDE_INT const_vf;
1941 if (vf.is_constant (&const_vf)
1942 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1944 /* Create: niters >> log2(vf) */
1945 /* If it's known that niters == number of latch executions + 1 doesn't
1946 overflow, we can generate niters >> log2(vf); otherwise we generate
1947 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1948 will be at least one. */
1949 log_vf = build_int_cst (type, exact_log2 (const_vf));
1950 if (niters_no_overflow)
1951 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1952 else
1953 niters_vector
1954 = fold_build2 (PLUS_EXPR, type,
1955 fold_build2 (RSHIFT_EXPR, type,
1956 fold_build2 (MINUS_EXPR, type,
1957 ni_minus_gap,
1958 build_int_cst (type, vf)),
1959 log_vf),
1960 build_int_cst (type, 1));
1961 step_vector = build_one_cst (type);
1963 else
1965 niters_vector = ni_minus_gap;
1966 step_vector = build_int_cst (type, vf);
1969 if (!is_gimple_val (niters_vector))
1971 var = create_tmp_var (type, "bnd");
1972 gimple_seq stmts = NULL;
1973 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1974 gsi_insert_seq_on_edge_immediate (pe, stmts);
1975 /* The peeling algorithm guarantees that the vector loop bound is at least
1976 ONE, so we set range information to make the niters analyzer's life easier. */
1977 if (stmts != NULL && log_vf)
1978 set_range_info (niters_vector, VR_RANGE,
1979 wi::to_wide (build_int_cst (type, 1)),
1980 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1981 TYPE_MAX_VALUE (type),
1982 log_vf)));
1984 *niters_vector_ptr = niters_vector;
1985 *step_vector_ptr = step_vector;
1987 return;
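/* Illustrative sketch (not part of GCC): the two ways of computing the
   vector iteration count above, assuming a constant power-of-two VF and
   hypothetical values.  When NITERS is known not to overflow we can use a
   plain shift; otherwise we rely on NITERS >= VF and compute
   ((NITERS - VF) >> log2 (VF)) + 1, which gives the same result without
   depending on the wrapped value.  */

static unsigned
vector_niters_sketch (unsigned niters, unsigned log2_vf, int no_overflow)
{
  unsigned vf = 1u << log2_vf;
  if (no_overflow)
    return niters >> log2_vf;
  return ((niters - vf) >> log2_vf) + 1;
}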
1990 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1991 loop specified by LOOP_VINFO after vectorization, compute the number
1992 of iterations before vectorization (niters_vector * vf) and store it
1993 to NITERS_VECTOR_MULT_VF_PTR. */
1995 static void
1996 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1997 tree niters_vector,
1998 tree *niters_vector_mult_vf_ptr)
2000 /* We should be using a step_vector of VF if VF is variable. */
2001 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2002 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2003 tree type = TREE_TYPE (niters_vector);
2004 tree log_vf = build_int_cst (type, exact_log2 (vf));
2005 basic_block exit_bb = single_exit (loop)->dest;
2007 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2008 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2009 niters_vector, log_vf);
2010 if (!is_gimple_val (niters_vector_mult_vf))
2012 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2013 gimple_seq stmts = NULL;
2014 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2015 &stmts, true, var);
2016 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2017 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2019 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
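/* Illustrative sketch (not part of GCC): converting the vector iteration
   count back to the number of scalar iterations it covers is the inverse
   shift for a constant power-of-two VF.  Names are hypothetical.  */

static unsigned
scalar_iters_covered_sketch (unsigned niters_vector, unsigned log2_vf)
{
  return niters_vector << log2_vf;
}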
2022 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2023 this function searches for the corresponding lcssa phi node in exit
2024 bb of LOOP. If it is found, return the phi result; otherwise return
2025 NULL. */
2027 static tree
2028 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2029 gphi *lcssa_phi)
2031 gphi_iterator gsi;
2032 edge e = single_exit (loop);
2034 gcc_assert (single_pred_p (e->dest));
2035 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2037 gphi *phi = gsi.phi ();
2038 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2039 PHI_ARG_DEF (lcssa_phi, 0), 0))
2040 return PHI_RESULT (phi);
2042 return NULL_TREE;
2045 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2046 from SECOND/FIRST and puts it at the original loop's preheader/exit
2047 edge, the two loops are arranged as below:
2049 preheader_a:
2050 first_loop:
2051 header_a:
2052 i_1 = PHI<i_0, i_2>;
2054 i_2 = i_1 + 1;
2055 if (cond_a)
2056 goto latch_a;
2057 else
2058 goto between_bb;
2059 latch_a:
2060 goto header_a;
2062 between_bb:
2063 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2065 second_loop:
2066 header_b:
2067 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2068 or with i_2 if no LCSSA phi is created
2069 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2071 i_4 = i_3 + 1;
2072 if (cond_b)
2073 goto latch_b;
2074 else
2075 goto exit_bb;
2076 latch_b:
2077 goto header_b;
2079 exit_bb:
2081 This function creates loop-closed SSA for the first loop and updates the
2082 second loop's PHI nodes by replacing the argument on the incoming edge with
2083 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2084 is false, loop-closed ssa phis will only be created for non-iv phis for
2085 the first loop.
2087 This function assumes the exit bb of the first loop is the preheader bb of
2088 the second loop, i.e., between_bb in the example code. With PHIs updated,
2089 the second loop will execute the remaining iterations of the first. */
2091 static void
2092 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2093 class loop *first, class loop *second,
2094 bool create_lcssa_for_iv_phis)
2096 gphi_iterator gsi_update, gsi_orig;
2097 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2099 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2100 edge second_preheader_e = loop_preheader_edge (second);
2101 basic_block between_bb = single_exit (first)->dest;
2103 gcc_assert (between_bb == second_preheader_e->src);
2104 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2105 /* Either the first loop or the second is the loop to be vectorized. */
2106 gcc_assert (loop == first || loop == second);
2108 for (gsi_orig = gsi_start_phis (first->header),
2109 gsi_update = gsi_start_phis (second->header);
2110 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2111 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2113 gphi *orig_phi = gsi_orig.phi ();
2114 gphi *update_phi = gsi_update.phi ();
2116 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2117 /* Generate lcssa PHI node for the first loop. */
2118 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2119 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2120 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2122 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2123 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2124 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2125 arg = new_res;
2128 /* Update PHI node in the second loop by replacing arg on the loop's
2129 incoming edge. */
2130 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2133 /* For epilogue peeling we have to make sure to copy all LC PHIs
2134 for correct vectorization of live stmts. */
2135 if (loop == first)
2137 basic_block orig_exit = single_exit (second)->dest;
2138 for (gsi_orig = gsi_start_phis (orig_exit);
2139 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2141 gphi *orig_phi = gsi_orig.phi ();
2142 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2143 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2144 continue;
2146 /* Already created in the above loop. */
2147 if (find_guard_arg (first, second, orig_phi))
2148 continue;
2150 tree new_res = copy_ssa_name (orig_arg);
2151 gphi *lcphi = create_phi_node (new_res, between_bb);
2152 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2157 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2158 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2159 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2160 appear like below:
2162 guard_bb:
2163 if (cond)
2164 goto merge_bb;
2165 else
2166 goto skip_loop;
2168 skip_loop:
2169 header_a:
2170 i_1 = PHI<i_0, i_2>;
2172 i_2 = i_1 + 1;
2173 if (cond_a)
2174 goto latch_a;
2175 else
2176 goto exit_a;
2177 latch_a:
2178 goto header_a;
2180 exit_a:
2181 i_5 = PHI<i_2>;
2183 merge_bb:
2184 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2186 update_loop:
2187 header_b:
2188 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2190 i_4 = i_3 + 1;
2191 if (cond_b)
2192 goto latch_b;
2193 else
2194 goto exit_bb;
2195 latch_b:
2196 goto header_b;
2198 exit_bb:
2200 This function creates PHI nodes at merge_bb and replaces the use of i_5
2201 in the update_loop's PHI node with the new PHI result. */
2203 static void
2204 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2205 class loop *update_loop,
2206 edge guard_edge, edge merge_edge)
2208 location_t merge_loc, guard_loc;
2209 edge orig_e = loop_preheader_edge (skip_loop);
2210 edge update_e = loop_preheader_edge (update_loop);
2211 gphi_iterator gsi_orig, gsi_update;
2213 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2214 gsi_update = gsi_start_phis (update_loop->header));
2215 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2216 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2218 gphi *orig_phi = gsi_orig.phi ();
2219 gphi *update_phi = gsi_update.phi ();
2221 /* Generate new phi node at merge bb of the guard. */
2222 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2223 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2225 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2226 args in NEW_PHI for these edges. */
2227 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2228 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2229 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2230 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2231 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2232 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2234 /* Update phi in UPDATE_PHI. */
2235 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2239 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2240 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2241 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2242 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2243 The CFG looks like:
2245 loop:
2246 header_a:
2247 i_1 = PHI<i_0, i_2>;
2249 i_2 = i_1 + 1;
2250 if (cond_a)
2251 goto latch_a;
2252 else
2253 goto exit_a;
2254 latch_a:
2255 goto header_a;
2257 exit_a:
2259 guard_bb:
2260 if (cond)
2261 goto merge_bb;
2262 else
2263 goto epilog_loop;
2265 ;; fall_through_bb
2267 epilog_loop:
2268 header_b:
2269 i_3 = PHI<i_2, i_4>;
2271 i_4 = i_3 + 1;
2272 if (cond_b)
2273 goto latch_b;
2274 else
2275 goto merge_bb;
2276 latch_b:
2277 goto header_b;
2279 merge_bb:
2280 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2282 exit_bb:
2283 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2285 For each name used outside EPILOG (i.e. for each name that has an lcssa
2286 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2287 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2288 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2289 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2290 in exit_bb will also be updated. */
2292 static void
2293 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2294 edge guard_edge, edge merge_edge)
2296 gphi_iterator gsi;
2297 basic_block merge_bb = guard_edge->dest;
2299 gcc_assert (single_succ_p (merge_bb));
2300 edge e = single_succ_edge (merge_bb);
2301 basic_block exit_bb = e->dest;
2302 gcc_assert (single_pred_p (exit_bb));
2303 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2305 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2307 gphi *update_phi = gsi.phi ();
2308 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2310 tree merge_arg = NULL_TREE;
2312 /* If the old argument is a SSA_NAME use its current_def. */
2313 if (TREE_CODE (old_arg) == SSA_NAME)
2314 merge_arg = get_current_def (old_arg);
2315 /* If it's a constant or doesn't have a current_def, just use the old
2316 argument. */
2317 if (!merge_arg)
2318 merge_arg = old_arg;
2320 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2321 /* If the var is live after loop but not a reduction, we simply
2322 use the old arg. */
2323 if (!guard_arg)
2324 guard_arg = old_arg;
2326 /* Create new phi node in MERGE_BB: */
2327 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2328 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2330 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2331 the two PHI args in merge_phi for these edges. */
2332 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2333 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2335 /* Update the original phi in exit_bb. */
2336 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2340 /* EPILOG loop is duplicated from the original loop for vectorizing,
2341 the arg of its loop closed ssa PHI needs to be updated. */
2343 static void
2344 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2346 gphi_iterator gsi;
2347 basic_block exit_bb = single_exit (epilog)->dest;
2349 gcc_assert (single_pred_p (exit_bb));
2350 edge e = EDGE_PRED (exit_bb, 0);
2351 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2352 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2355 /* Function vect_do_peeling.
2357 Input:
2358 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2360 preheader:
2361 LOOP:
2362 header_bb:
2363 loop_body
2364 if (exit_loop_cond) goto exit_bb
2365 else goto header_bb
2366 exit_bb:
2368 - NITERS: The number of iterations of the loop.
2369 - NITERSM1: The number of iterations of the loop's latch.
2370 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2371 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2372 CHECK_PROFITABILITY is true.
2373 Output:
2374 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2375 iterate after vectorization; see vect_set_loop_condition for details.
2376 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2377 should be set to the number of scalar iterations handled by the
2378 vector loop. The SSA name is only used on exit from the loop.
2380 This function peels prolog and epilog from the loop, adds guards skipping
2381 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2382 would look like:
2384 guard_bb_1:
2385 if (prefer_scalar_loop) goto merge_bb_1
2386 else goto guard_bb_2
2388 guard_bb_2:
2389 if (skip_prolog) goto merge_bb_2
2390 else goto prolog_preheader
2392 prolog_preheader:
2393 PROLOG:
2394 prolog_header_bb:
2395 prolog_body
2396 if (exit_prolog_cond) goto prolog_exit_bb
2397 else goto prolog_header_bb
2398 prolog_exit_bb:
2400 merge_bb_2:
2402 vector_preheader:
2403 VECTOR LOOP:
2404 vector_header_bb:
2405 vector_body
2406 if (exit_vector_cond) goto vector_exit_bb
2407 else goto vector_header_bb
2408 vector_exit_bb:
2410 guard_bb_3:
2411 if (skip_epilog) goto merge_bb_3
2412 else goto epilog_preheader
2414 merge_bb_1:
2416 epilog_preheader:
2417 EPILOG:
2418 epilog_header_bb:
2419 epilog_body
2420 if (exit_epilog_cond) goto merge_bb_3
2421 else goto epilog_header_bb
2423 merge_bb_3:
2425 Note this function peels the prolog and epilog, and adds the corresponding
2426 guards, only when necessary.
2427 This function returns the epilogue loop if a decision was made to vectorize
2428 it, otherwise NULL.
2430 The analysis resulting in this epilogue loop's loop_vec_info was performed
2431 in the same vect_analyze_loop call as the main loop's. At that time
2432 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2433 vectorization factors than the main loop. This list is stored in the main
2434 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2435 vectorize the epilogue loop for a lower vectorization factor, the
2436 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2437 updated and linked to the epilogue loop. This is later used to vectorize
2438 the epilogue. The reason the loop_vec_info needs updating is that it was
2439 constructed based on the original main loop, and the epilogue loop is a
2440 copy of this loop, so all links pointing to statements in the original loop
2441 need updating. Furthermore, these loop_vec_infos share the
2442 data_reference's records, which will also need to be updated.
2444 TODO: Guard for prefer_scalar_loop should be emitted along with
2445 versioning conditions if loop versioning is needed. */
2448 class loop *
2449 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2450 tree *niters_vector, tree *step_vector,
2451 tree *niters_vector_mult_vf_var, int th,
2452 bool check_profitability, bool niters_no_overflow,
2453 tree *advance)
2455 edge e, guard_e;
2456 tree type = TREE_TYPE (niters), guard_cond;
2457 basic_block guard_bb, guard_to;
2458 profile_probability prob_prolog, prob_vector, prob_epilog;
2459 int estimated_vf;
2460 int prolog_peeling = 0;
2461 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2462 /* We currently do not support prolog peeling if the target alignment is not
2463 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2464 target alignment being constant. */
2465 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2466 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2467 return NULL;
2469 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2470 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2472 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2473 poly_uint64 bound_epilog = 0;
2474 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2475 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2476 bound_epilog += vf - 1;
2477 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2478 bound_epilog += 1;
2479 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2480 poly_uint64 bound_scalar = bound_epilog;
2482 if (!prolog_peeling && !epilog_peeling)
2483 return NULL;
2485 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2486 estimated_vf = vect_vf_for_cost (loop_vinfo);
2487 if (estimated_vf == 2)
2488 estimated_vf = 3;
2489 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2490 .apply_scale (estimated_vf - 1, estimated_vf);
2492 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2493 class loop *first_loop = loop;
2494 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2496 /* We might have a queued need to update virtual SSA form. As we
2497 delete the update SSA machinery below after doing a regular
2498 incremental SSA update during loop copying make sure we don't
2499 lose that fact.
2500 ??? Needing to update virtual SSA form by renaming is unfortunate
2501 but not all of the vectorizer code inserting new loads / stores
2502 properly assigns virtual operands to those statements. */
2503 update_ssa (TODO_update_ssa_only_virtuals);
2505 create_lcssa_for_virtual_phi (loop);
2507 /* If we're vectorizing an epilogue loop, the update_ssa above will
2508 have ensured that the virtual operand is in SSA form throughout the
2509 vectorized main loop. Normally it is possible to trace the updated
2510 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2511 back to scalar-stmt vuses, meaning that the effect of the SSA update
2512 remains local to the main loop. However, there are rare cases in
2513 which the vectorized loop has vdefs even when the original scalar
2514 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2515 introduces clobbers of the temporary vector array, which in turn
2516 needs new vdefs. If the scalar loop doesn't write to memory, these
2517 new vdefs will be the only ones in the vector loop.
2519 In that case, update_ssa will have added a new virtual phi to the
2520 main loop, which previously didn't need one. Ensure that we (locally)
2521 maintain LCSSA form for the virtual operand, just as we would have
2522 done if the virtual phi had existed from the outset. This makes it
2523 easier to duplicate the scalar epilogue loop below. */
2524 tree vop_to_rename = NULL_TREE;
2525 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2527 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2528 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2531 if (MAY_HAVE_DEBUG_BIND_STMTS)
2533 gcc_assert (!adjust_vec.exists ());
2534 adjust_vec.create (32);
2536 initialize_original_copy_tables ();
2538 /* Record the anchor bb at which the guard should be placed if the scalar
2539 loop might be preferred. */
2540 basic_block anchor = loop_preheader_edge (loop)->src;
2542 /* Generate the number of iterations for the prolog loop. We do this here
2543 so that we can also get the upper bound on the number of iterations. */
2544 tree niters_prolog;
2545 int bound_prolog = 0;
2546 if (prolog_peeling)
2547 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2548 &bound_prolog);
2549 else
2550 niters_prolog = build_int_cst (type, 0);
2552 loop_vec_info epilogue_vinfo = NULL;
2553 if (vect_epilogues)
2555 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2556 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2559 tree niters_vector_mult_vf = NULL_TREE;
2560 /* Saving NITERs before the loop, as this may be changed by prologue. */
2561 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2562 edge update_e = NULL, skip_e = NULL;
2563 unsigned int lowest_vf = constant_lower_bound (vf);
2564 /* If we know the number of scalar iterations for the main loop we should
2565 check whether after the main loop there are enough iterations left over
2566 for the epilogue. */
2567 if (vect_epilogues
2568 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2569 && prolog_peeling >= 0
2570 && known_eq (vf, lowest_vf))
2572 unsigned HOST_WIDE_INT eiters
2573 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2574 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2576 eiters -= prolog_peeling;
2577 eiters
2578 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2580 unsigned int ratio;
2581 unsigned int epilogue_gaps
2582 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2583 while (!(constant_multiple_p
2584 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2585 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2586 && eiters >= lowest_vf / ratio + epilogue_gaps))
2588 delete epilogue_vinfo;
2589 epilogue_vinfo = NULL;
2590 if (loop_vinfo->epilogue_vinfos.length () == 0)
2592 vect_epilogues = false;
2593 break;
2595 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2596 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2597 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2600 /* Prolog loop may be skipped. */
2601 bool skip_prolog = (prolog_peeling != 0);
2602 /* Skip the vectorized loop and go straight to the epilog when there are not
2603 enough iterations to enter the vectorized loop; if so we perform runtime
2604 checks on NITERS to decide whether to skip it. If we know the number of
2605 scalar iterations, we only add such a runtime check when that number may
2606 potentially be smaller than the number of iterations required to enter
2607 the vectorized loop; for this we use the upper bounds on the prolog and
2608 epilog peeling.
2609 When we don't know the number of iterations and don't require versioning,
2610 it is because we have asserted that there are enough scalar iterations to
2611 enter the main loop, so this skip is not necessary. When we are versioning
2612 then we only add such a skip if we have chosen to vectorize the
2613 epilogue. */
2614 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2615 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2616 bound_prolog + bound_epilog)
2617 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2618 || vect_epilogues));
2619 /* Epilog loop must be executed if the number of iterations for epilog
2620 loop is known at compile time, otherwise we need to add a check at
2621 the end of vector loop and skip to the end of epilog loop. */
2622 bool skip_epilog = (prolog_peeling < 0
2623 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624 || !vf.is_constant ());
2625 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2626 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2627 skip_epilog = false;
2629 if (skip_vector)
2631 split_edge (loop_preheader_edge (loop));
2633 /* Due to the order in which we peel prolog and epilog, we first
2634 propagate probability to the whole loop. The purpose is to
2635 avoid adjusting probabilities of both prolog and vector loops
2636 separately. Note in this case, the probability of epilog loop
2637 needs to be scaled back later. */
2638 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2639 if (prob_vector.initialized_p ())
2641 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2642 scale_loop_profile (loop, prob_vector, 0);
2646 dump_user_location_t loop_loc = find_loop_location (loop);
2647 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2648 if (vect_epilogues)
2649 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2650 use the original scalar loop as remaining epilogue if necessary. */
2651 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2652 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2654 if (prolog_peeling)
2656 e = loop_preheader_edge (loop);
2657 if (!slpeel_can_duplicate_loop_p (loop, e))
2659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2660 "loop can't be duplicated to preheader edge.\n");
2661 gcc_unreachable ();
2663 /* Peel prolog and put it on preheader edge of loop. */
2664 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2665 if (!prolog)
2667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2668 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2669 gcc_unreachable ();
2671 prolog->force_vectorize = false;
2672 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2673 first_loop = prolog;
2674 reset_original_copy_tables ();
2676 /* Update the number of iterations for prolog loop. */
2677 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2678 vect_set_loop_condition (prolog, NULL, niters_prolog,
2679 step_prolog, NULL_TREE, false);
2681 /* Skip the prolog loop. */
2682 if (skip_prolog)
2684 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2685 niters_prolog, build_int_cst (type, 0));
2686 guard_bb = loop_preheader_edge (prolog)->src;
2687 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2688 guard_to = split_edge (loop_preheader_edge (loop));
2689 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2690 guard_to, guard_bb,
2691 prob_prolog.invert (),
2692 irred_flag);
2693 e = EDGE_PRED (guard_to, 0);
2694 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2695 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2697 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2698 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2701 /* Update init address of DRs. */
2702 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2703 /* Update niters for vector loop. */
2704 LOOP_VINFO_NITERS (loop_vinfo)
2705 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2706 LOOP_VINFO_NITERSM1 (loop_vinfo)
2707 = fold_build2 (MINUS_EXPR, type,
2708 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2709 bool new_var_p = false;
2710 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2711 /* It's guaranteed that vector loop bound before vectorization is at
2712 least VF, so set range information for newly generated var. */
2713 if (new_var_p)
2714 set_range_info (niters, VR_RANGE,
2715 wi::to_wide (build_int_cst (type, vf)),
2716 wi::to_wide (TYPE_MAX_VALUE (type)));
2718 /* Prolog iterates at most bound_prolog times, latch iterates at
2719 most bound_prolog - 1 times. */
2720 record_niter_bound (prolog, bound_prolog - 1, false, true);
2721 delete_update_ssa ();
2722 adjust_vec_debug_stmts ();
2723 scev_reset ();
2726 if (epilog_peeling)
2728 e = single_exit (loop);
2729 if (!slpeel_can_duplicate_loop_p (loop, e))
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2732 "loop can't be duplicated to exit edge.\n");
2733 gcc_unreachable ();
2735 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2736 said epilog then we should use a copy of the main loop as a starting
2737 point. This loop may have already had some preliminary transformations
2738 to allow for more optimal vectorization, for example if-conversion.
2739 If we are not vectorizing the epilog then we should use the scalar loop
2740 as the transformations mentioned above make less or no sense when not
2741 vectorizing. */
2742 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2743 if (vop_to_rename)
2745 /* Vectorizing the main loop can sometimes introduce a vdef to
2746 a loop that previously didn't have one; see the comment above
2747 the definition of VOP_TO_RENAME for details. The definition
2748 D that holds on E will then be different from the definition
2749 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2750 rename VOP_TO_RENAME to D when copying the loop.
2752 The virtual operand is in LCSSA form for the main loop,
2753 and no stmt between the main loop and E needs a vdef,
2754 so we know that D is provided by a phi rather than by a
2755 vdef on a normal gimple stmt. */
2756 basic_block vdef_bb = e->src;
2757 gphi *vphi;
2758 while (!(vphi = get_virtual_phi (vdef_bb)))
2759 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2760 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2761 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2763 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2764 if (!epilog)
2766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2767 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2768 gcc_unreachable ();
2770 epilog->force_vectorize = false;
2771 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2773 /* Scalar version loop may be preferred. In this case, add guard
2774 and skip to epilog. Note this only happens when the number of
2775 iterations of loop is unknown at compile time, otherwise this
2776 won't be vectorized. */
2777 if (skip_vector)
2779 /* Additional epilogue iteration is peeled if gap exists. */
2780 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2781 bound_prolog, bound_epilog,
2782 th, &bound_scalar,
2783 check_profitability);
2784 /* Build guard against NITERSM1 since NITERS may overflow. */
2785 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2786 guard_bb = anchor;
2787 guard_to = split_edge (loop_preheader_edge (epilog));
2788 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2789 guard_to, guard_bb,
2790 prob_vector.invert (),
2791 irred_flag);
2792 skip_e = guard_e;
2793 e = EDGE_PRED (guard_to, 0);
2794 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2795 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2797 /* Simply propagate profile info from guard_bb to guard_to which is
2798 a merge point of control flow. */
2799 guard_to->count = guard_bb->count;
2801 /* Scale probability of epilog loop back.
2802 FIXME: We should avoid scaling down and back up. Profile may
2803 get lost if we scale down to 0. */
2804 basic_block *bbs = get_loop_body (epilog);
2805 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2806 bbs[i]->count = bbs[i]->count.apply_scale
2807 (bbs[i]->count,
2808 bbs[i]->count.apply_probability
2809 (prob_vector));
2810 free (bbs);
2813 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2814 /* If the loop is peeled a non-zero constant number of times, niters now
2815 refers to orig_niters - prolog_peeling; it won't overflow even if
2816 orig_niters overflows. */
2817 niters_no_overflow |= (prolog_peeling > 0);
2818 vect_gen_vector_loop_niters (loop_vinfo, niters,
2819 niters_vector, step_vector,
2820 niters_no_overflow);
2821 if (!integer_onep (*step_vector))
2823 /* On exit from the loop we will have an easy way of calculating
2824 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2825 until then. */
2826 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2827 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2828 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2830 else
2831 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2832 &niters_vector_mult_vf);
2833 /* Update IVs of original loop as if they were advanced by
2834 niters_vector_mult_vf steps. */
2835 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2836 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2837 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2838 update_e);
2840 if (skip_epilog)
2842 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2843 niters, niters_vector_mult_vf);
2844 guard_bb = single_exit (loop)->dest;
2845 guard_to = split_edge (single_exit (epilog));
2846 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2847 skip_vector ? anchor : guard_bb,
2848 prob_epilog.invert (),
2849 irred_flag);
2850 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2851 single_exit (epilog));
2852 /* Only need to handle basic block before epilog loop if it's not
2853 the guard_bb, which is the case when skip_vector is true. */
2854 if (guard_bb != bb_before_epilog)
2856 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2858 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2860 scale_loop_profile (epilog, prob_epilog, 0);
2862 else
2863 slpeel_update_phi_nodes_for_lcssa (epilog);
2865 unsigned HOST_WIDE_INT bound;
2866 if (bound_scalar.is_constant (&bound))
2868 gcc_assert (bound != 0);
2869 /* -1 to convert loop iterations to latch iterations. */
2870 record_niter_bound (epilog, bound - 1, false, true);
2873 delete_update_ssa ();
2874 adjust_vec_debug_stmts ();
2875 scev_reset ();
2878 if (vect_epilogues)
2880 epilog->aux = epilogue_vinfo;
2881 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2883 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2885 /* We now must calculate NITERS, the number of iterations performed by the
2886 previous loop, and EPILOGUE_NITERS to be performed by the epilogue. */
2887 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2888 niters_prolog, niters_vector_mult_vf);
2890 /* If skip_vector we may skip the previous loop, so we insert a phi-node to
2891 determine whether we are coming from the previous vectorized loop
2892 using the update_e edge or the skip_vector basic block using the
2893 skip_e edge. */
2894 if (skip_vector)
2896 gcc_assert (update_e != NULL && skip_e != NULL);
2897 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2898 update_e->dest);
2899 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2900 gimple *stmt = gimple_build_assign (new_ssa, niters);
2901 gimple_stmt_iterator gsi;
2902 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2903 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2905 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2906 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2908 else
2910 gsi = gsi_last_bb (update_e->src);
2911 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2914 niters = new_ssa;
2915 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2916 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2917 UNKNOWN_LOCATION);
2918 niters = PHI_RESULT (new_phi);
2921 /* Subtract the number of iterations performed by the vectorized loop
2922 from the number of total iterations. */
2923 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2924 before_loop_niters,
2925 niters);
2927 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2928 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2929 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2930 epilogue_niters,
2931 build_one_cst (TREE_TYPE (epilogue_niters)));
2933 /* Set ADVANCE to the number of iterations performed by the previous
2934 loop and its prologue. */
2935 *advance = niters;
2937 /* Redo the peeling for niter analysis as the NITERs and alignment
2938 may have been updated to take the main loop into account. */
2939 determine_peel_for_niter (epilogue_vinfo);
2942 adjust_vec.release ();
2943 free_original_copy_tables ();
2945 return vect_epilogues ? epilog : NULL;
2948 /* Function vect_create_cond_for_niters_checks.
2950 Create a conditional expression that represents the run-time checks for
2951 loop's niter. The loop is guaranteed to terminate if the run-time
2952 checks hold.
2954 Input:
2955 COND_EXPR - input conditional expression. New conditions will be chained
2956 with logical AND operation. If it is NULL, the niters
2957 assumptions become the sole condition.
2958 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2959 assumptions to be checked.
2961 Output:
2962 COND_EXPR - conditional expression.
2964 The returned COND_EXPR is the conditional expression to be used in the
2965 if statement that controls which version of the loop gets executed at
2966 runtime. */
2968 static void
2969 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2971 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2973 if (*cond_expr)
2974 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2975 *cond_expr, part_cond_expr);
2976 else
2977 *cond_expr = part_cond_expr;
2980 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2981 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2983 static void
2984 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2986 if (*cond_expr)
2987 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2988 *cond_expr, part_cond_expr);
2989 else
2990 *cond_expr = part_cond_expr;
2993 /* Function vect_create_cond_for_align_checks.
2995 Create a conditional expression that represents the alignment checks for
2996 all of data references (array element references) whose alignment must be
2997 checked at runtime.
2999 Input:
3000 COND_EXPR - input conditional expression. New conditions will be chained
3001 with logical AND operation.
3002 LOOP_VINFO - two fields of the loop information are used.
3003 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3004 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3006 Output:
3007 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3008 expression.
3009 The returned value is the conditional expression to be used in the if
3010 statement that controls which version of the loop gets executed at runtime.
3012 The algorithm makes two assumptions:
3013 1) The number of bytes "n" in a vector is a power of 2.
3014 2) An address "a" is aligned if a%n is zero and that this
3015 test can be done as a&(n-1) == 0. For example, for 16
3016 byte vectors the test is a&0xf == 0. */
3018 static void
3019 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3020 tree *cond_expr,
3021 gimple_seq *cond_expr_stmt_list)
3023 vec<stmt_vec_info> may_misalign_stmts
3024 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3025 stmt_vec_info stmt_info;
3026 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3027 tree mask_cst;
3028 unsigned int i;
3029 tree int_ptrsize_type;
3030 char tmp_name[20];
3031 tree or_tmp_name = NULL_TREE;
3032 tree and_tmp_name;
3033 gimple *and_stmt;
3034 tree ptrsize_zero;
3035 tree part_cond_expr;
3037 /* Check that mask is one less than a power of 2, i.e., mask is
3038 all zeros followed by all ones. */
3039 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3041 int_ptrsize_type = signed_type_for (ptr_type_node);
3043 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3044 of the first vector of the i'th data reference. */
3046 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3048 gimple_seq new_stmt_list = NULL;
3049 tree addr_base;
3050 tree addr_tmp_name;
3051 tree new_or_tmp_name;
3052 gimple *addr_stmt, *or_stmt;
3053 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3054 bool negative = tree_int_cst_compare
3055 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3056 tree offset = negative
3057 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3059 /* create: addr_tmp = (int)(address_of_first_vector) */
3060 addr_base =
3061 vect_create_addr_base_for_vector_ref (loop_vinfo,
3062 stmt_info, &new_stmt_list,
3063 offset);
3064 if (new_stmt_list != NULL)
3065 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3067 sprintf (tmp_name, "addr2int%d", i);
3068 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3069 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3070 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3072 /* The addresses are ORed together. */
3074 if (or_tmp_name != NULL_TREE)
3076 /* create: or_tmp = or_tmp | addr_tmp */
3077 sprintf (tmp_name, "orptrs%d", i);
3078 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3079 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3080 or_tmp_name, addr_tmp_name);
3081 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3082 or_tmp_name = new_or_tmp_name;
3084 else
3085 or_tmp_name = addr_tmp_name;
3087 } /* end for i */
3089 mask_cst = build_int_cst (int_ptrsize_type, mask);
3091 /* create: and_tmp = or_tmp & mask */
3092 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3094 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3095 or_tmp_name, mask_cst);
3096 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3098 /* Make and_tmp the left operand of the conditional test against zero.
3099 If and_tmp has a nonzero bit then some address is unaligned. */
3100 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3101 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3102 and_tmp_name, ptrsize_zero);
3103 chain_cond_expr (cond_expr, part_cond_expr);
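/* Illustrative sketch (not part of GCC): the runtime alignment test built
   above, written as ordinary C for two data references.  The address bits
   are ORed together and tested against the mask, relying on the vector size
   being a power of two; the pointers and the mask value are hypothetical,
   and the cast assumes pointers fit in unsigned long.  */

static int
all_aligned_sketch (const void *a, const void *b, unsigned long mask)
{
  unsigned long bits = ((unsigned long) a | (unsigned long) b) & mask;
  return bits == 0;	/* True when both addresses are aligned.  */
}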
3106 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3107 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3108 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3109 and this new condition are true. Treat a null *COND_EXPR as "true". */
3111 static void
3112 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3114 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3115 unsigned int i;
3116 vec_object_pair *pair;
3117 FOR_EACH_VEC_ELT (pairs, i, pair)
3119 tree addr1 = build_fold_addr_expr (pair->first);
3120 tree addr2 = build_fold_addr_expr (pair->second);
3121 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3122 addr1, addr2);
3123 chain_cond_expr (cond_expr, part_cond_expr);
3127 /* Create an expression that is true when all lower-bound conditions for
3128 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3130 static void
3131 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3133 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3134 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3136 tree expr = lower_bounds[i].expr;
3137 tree type = unsigned_type_for (TREE_TYPE (expr));
3138 expr = fold_convert (type, expr);
3139 poly_uint64 bound = lower_bounds[i].min_value;
3140 if (!lower_bounds[i].unsigned_p)
3142 expr = fold_build2 (PLUS_EXPR, type, expr,
3143 build_int_cstu (type, bound - 1));
3144 bound += bound - 1;
3146 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3147 build_int_cstu (type, bound));
3148 chain_cond_expr (cond_expr, part_cond_expr);
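/* Illustrative sketch (not part of GCC): the biasing trick used above for
   signed expressions.  Adding MIN_VALUE - 1 and comparing against
   2 * MIN_VALUE - 1 as unsigned values checks |EXPR| >= MIN_VALUE with a
   single comparison: values in (-MIN_VALUE, MIN_VALUE) map onto
   [0, 2 * MIN_VALUE - 2] and fail, everything else passes.  The name and
   values are hypothetical; MIN_VALUE is assumed to be positive.  */

static int
lower_bound_check_sketch (long expr, unsigned long min_value)
{
  unsigned long biased = (unsigned long) expr + (min_value - 1);
  return biased >= 2 * min_value - 1;
}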
3152 /* Function vect_create_cond_for_alias_checks.
3154 Create a conditional expression that represents the run-time checks for
3155 overlapping of address ranges represented by a list of data references
3156 relations passed as input.
3158 Input:
3159 COND_EXPR - input conditional expression. New conditions will be chained
3160 with logical AND operation. If it is NULL, then the function
3161 is used to return the number of alias checks.
3162 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3163 to be checked.
3165 Output:
3166 COND_EXPR - conditional expression.
3168 The returned COND_EXPR is the conditional expression to be used in the if
3169 statement that controls which version of the loop gets executed at runtime.
3172 void
3173 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3175 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3176 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3178 if (comp_alias_ddrs.is_empty ())
3179 return;
3181 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3182 &comp_alias_ddrs, cond_expr);
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "created %u versioning for alias checks.\n",
3186 comp_alias_ddrs.length ());
3190 /* Function vect_loop_versioning.
3192 If the loop has data references that may or may not be aligned and/or
3193 has data reference relations whose independence was not proven then
3194 two versions of the loop need to be generated, one which is vectorized
3195 and one which isn't. A test is then generated to control which of the
3196 loops is executed. The test checks for the alignment of all of the
3197 data references that may or may not be aligned. An additional
3198 sequence of runtime tests is generated for each pair of DDRs whose
3199 independence was not proven. The vectorized version of the loop is
3200 executed only if both alias and alignment tests are passed.
3202 The test generated to check which version of the loop is executed
3203 is modified to also check for profitability as indicated by the
3204 cost model threshold TH.
3206 The versioning precondition(s) are placed in *COND_EXPR and
3207 *COND_EXPR_STMT_LIST. */
3209 class loop *
3210 vect_loop_versioning (loop_vec_info loop_vinfo,
3211 gimple *loop_vectorized_call)
3213 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3214 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3215 basic_block condition_bb;
3216 gphi_iterator gsi;
3217 gimple_stmt_iterator cond_exp_gsi;
3218 basic_block merge_bb;
3219 basic_block new_exit_bb;
3220 edge new_exit_e, e;
3221 gphi *orig_phi, *new_phi;
3222 tree cond_expr = NULL_TREE;
3223 gimple_seq cond_expr_stmt_list = NULL;
3224 tree arg;
3225 profile_probability prob = profile_probability::likely ();
3226 gimple_seq gimplify_stmt_list = NULL;
3227 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3228 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3229 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3230 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3231 poly_uint64 versioning_threshold
3232 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3233 tree version_simd_if_cond
3234 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3235 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3237 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3238 && !ordered_p (th, versioning_threshold))
3239 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3240 build_int_cst (TREE_TYPE (scalar_loop_iters),
3241 th - 1));
3242 if (maybe_ne (versioning_threshold, 0U))
3244 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3245 build_int_cst (TREE_TYPE (scalar_loop_iters),
3246 versioning_threshold - 1));
3247 if (cond_expr)
3248 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3249 expr, cond_expr);
3250 else
3251 cond_expr = expr;
3254 if (version_niter)
3255 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3257 if (cond_expr)
3258 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3259 &cond_expr_stmt_list,
3260 is_gimple_condexpr, NULL_TREE);
3262 if (version_align)
3263 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3264 &cond_expr_stmt_list);
3266 if (version_alias)
3268 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3269 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3270 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3273 if (version_simd_if_cond)
3275 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3276 if (flag_checking)
3277 if (basic_block bb
3278 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3279 gcc_assert (bb != loop->header
3280 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3281 && (scalar_loop == NULL
3282 || (bb != scalar_loop->header
3283 && dominated_by_p (CDI_DOMINATORS,
3284 scalar_loop->header, bb))));
3285 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3286 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3287 version_simd_if_cond, zero);
3288 if (cond_expr)
3289 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3290 c, cond_expr);
3291 else
3292 cond_expr = c;
3293 if (dump_enabled_p ())
3294 dump_printf_loc (MSG_NOTE, vect_location,
3295 "created versioning for simd if condition check.\n");
3298 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3299 &gimplify_stmt_list,
3300 is_gimple_condexpr, NULL_TREE);
3301 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3303 /* Compute the outermost loop in which cond_expr and
3304 cond_expr_stmt_list are invariant. */
3305 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3306 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3307 !gsi_end_p (gsi); gsi_next (&gsi))
3309 gimple *stmt = gsi_stmt (gsi);
3310 update_stmt (stmt);
3311 ssa_op_iter iter;
3312 use_operand_p use_p;
3313 basic_block def_bb;
3314 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3315 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3316 && flow_bb_inside_loop_p (outermost, def_bb))
3317 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3320 /* Search for the outermost loop we can version. Avoid versioning
3321 non-perfect nests, but allow if-conversion-versioned loops inside. */
3322 class loop *loop_to_version = loop;
3323 if (flow_loop_nested_p (outermost, loop))
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_NOTE, vect_location,
3327 "trying to apply versioning to outer loop %d\n",
3328 outermost->num);
3329 if (outermost->num == 0)
3330 outermost = superloop_at_depth (loop, 1);
3331 /* And avoid applying versioning on non-perfect nests. */
3332 while (loop_to_version != outermost
3333 && single_exit (loop_outer (loop_to_version))
3334 && (!loop_outer (loop_to_version)->inner->next
3335 || vect_loop_vectorized_call (loop_to_version))
3336 && (!loop_outer (loop_to_version)->inner->next
3337 || !loop_outer (loop_to_version)->inner->next->next))
3338 loop_to_version = loop_outer (loop_to_version);
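/* E.g. for a perfect nest

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         a[i][j] = b[i][j] + c[i][j];

   whose runtime checks only use values invariant in the i-loop, the
   walk above moves LOOP_TO_VERSION out to the i-loop, so the guard is
   evaluated once for the whole nest instead of once per outer
   iteration (an illustration of the intent, not of the exact
   conditions involved).  */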
3341 /* Apply versioning. If there is already a scalar version created by
3342 if-conversion, re-use that. Note that we cannot re-use the copy of
3343 an if-converted outer loop when vectorizing only the inner loop. */
3344 gcond *cond;
3345 if ((!loop_to_version->inner || loop == loop_to_version)
3346 && loop_vectorized_call)
3348 gcc_assert (scalar_loop);
3349 condition_bb = gimple_bb (loop_vectorized_call);
3350 cond = as_a <gcond *> (last_stmt (condition_bb));
3351 gimple_cond_set_condition_from_tree (cond, cond_expr);
3352 update_stmt (cond);
3354 if (cond_expr_stmt_list)
3356 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3357 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3358 GSI_SAME_STMT);
3361 /* if-conversion uses profile_probability::always () for both paths,
3362 so reset the probabilities of both paths appropriately. */
3363 edge te, fe;
3364 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3365 te->probability = prob;
3366 fe->probability = prob.invert ();
3367 /* We can scale the counts of the versioned loop immediately but have
3368 to postpone scaling the scalar loop because we re-use it during peeling. */
3369 scale_loop_frequencies (loop_to_version, te->probability);
3370 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3372 nloop = scalar_loop;
3373 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "reusing %sloop version created by if conversion\n",
3376 loop_to_version != loop ? "outer " : "");
3378 else
3380 if (loop_to_version != loop
3381 && dump_enabled_p ())
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "applying loop versioning to outer loop %d\n",
3384 loop_to_version->num);
3386 initialize_original_copy_tables ();
3387 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3388 prob, prob.invert (), prob, prob.invert (), true);
3389 gcc_assert (nloop);
3390 nloop = get_loop_copy (loop);
3392 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3393 reap them otherwise, and they also refer to the original
3394 loops. */
3395 class loop *l = loop;
3396 while (gimple *call = vect_loop_vectorized_call (l))
3398 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3399 fold_loop_internal_call (call, boolean_false_node);
3400 l = loop_outer (l);
3402 free_original_copy_tables ();
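/* For illustration, an IFN_LOOP_VECTORIZED guard in GIMPLE looks
   roughly like

     _1 = .LOOP_VECTORIZED (3, 4);
     if (_1 != 0) goto <candidate loop>; else goto <fallback loop>;

   and folding the copied call to boolean_false_node makes the copy
   always take the fallback path, so later cleanup can delete the dead
   candidate (a conceptual sketch, not taken from an actual dump).  */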
3404 if (cond_expr_stmt_list)
3406 cond_exp_gsi = gsi_last_bb (condition_bb);
3407 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3408 GSI_SAME_STMT);
3411 /* Loop versioning violates an assumption we try to maintain during
3412 vectorization - that the loop exit block has a single predecessor.
3413 After versioning, the exit block of both loop versions is the same
3414 basic block (i.e. it has two predecessors). To simplify the
3415 following transformations in the vectorizer, we fix this situation
3416 here by adding a new (empty) block on the exit edge of the loop,
3417 with the proper loop-exit phis to maintain loop-closed form.
3418 If loop versioning wasn't done from loop, but from scalar_loop instead,
3419 merge_bb will already have just a single predecessor. */
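/* Schematically, the fix-up below turns

     to-be-vectorized loop exit ---+
                                   +--> merge_bb   (two predecessors)
     scalar loop exit -------------+

   into

     to-be-vectorized loop exit --> new_exit_bb --> merge_bb

   with new PHIs in NEW_EXIT_BB carrying the loop-closed SSA values
   (a sketch of the CFG shape only).  */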
3421 merge_bb = single_exit (loop_to_version)->dest;
3422 if (EDGE_COUNT (merge_bb->preds) >= 2)
3424 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3425 new_exit_bb = split_edge (single_exit (loop_to_version));
3426 new_exit_e = single_exit (loop_to_version);
3427 e = EDGE_SUCC (new_exit_bb, 0);
3429 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3430 gsi_next (&gsi))
3432 tree new_res;
3433 orig_phi = gsi.phi ();
3434 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3435 new_phi = create_phi_node (new_res, new_exit_bb);
3436 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3437 add_phi_arg (new_phi, arg, new_exit_e,
3438 gimple_phi_arg_location_from_edge (orig_phi, e));
3439 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3443 update_ssa (TODO_update_ssa);
3446 if (version_niter)
3448 /* The versioned loop could be infinite; we need to clear the existing
3449 niter information, which was copied from the original loop. */
3450 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3451 vect_free_loop_info_assumptions (nloop);
3452 /* And set the constraint LOOP_C_INFINITE for the niter analyzer. */
3453 loop_constraint_set (loop, LOOP_C_INFINITE);
3456 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3457 && dump_enabled_p ())
3459 if (version_alias)
3460 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3461 vect_location,
3462 "loop versioned for vectorization because of "
3463 "possible aliasing\n");
3464 if (version_align)
3465 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3466 vect_location,
3467 "loop versioned for vectorization to enhance "
3468 "alignment\n");
3472 return nloop;
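/* Usage note (illustrative): the remarks above are what a user sees
   through the opt-info machinery, e.g. compiling with

     gcc -O3 -fopt-info-vec-optimized foo.c

   may print "loop versioned for vectorization because of possible
   aliasing" for a loop that reaches this point with VERSION_ALIAS
   set.  */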