1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
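/* For example, with an 8-element vector, OFFSET 0 asks for the low
interleave selector { 0, 8, 1, 9, 2, 10, 3, 11 } and OFFSET 1 for the
high selector { 4, 12, 5, 13, 6, 14, 7, 15 }; only the first three index
pairs are pushed explicitly, the stepped encoding extends each series to
the full vector length.  */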
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
360 gimple_seq_add_stmt (seq, stmt);
362 return true;
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
382 return true;
384 return false;
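/* For example, in the interleaving case a single source mask
<a,b,c,d,e,f,g,h> gives dest[2j] = <a,a,b,b,c,c,d,d> (low interleave of the
mask with itself) and dest[2j+1] = <e,e,f,f,g,g,h,h> (high interleave), so
each source element ends up covering two destination positions.  */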
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
400 It is known that:
402 NITERS * RGC->max_nscalars_per_iter
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
407 VF * RGC->max_nscalars_per_iter
409 might overflow before hitting a value above:
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGC. */
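/* Worked example, for illustration only: with a single rgroup, VF = 4,
RGC->max_nscalars_per_iter = 1, NITERS = 10 and no NITERS_SKIP, the IV takes
the values 0, 4, 8, ... and the WHILE_ULT comparisons produce the masks
{1,1,1,1}, {1,1,1,1}, {1,1,0,0} for the three vector iterations.  With
NITERS_SKIP = 2 the limit becomes 12 and the first mask is instead
{0,0,1,1}.  */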
416 static tree
417 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_controls *rgc, tree niters,
421 tree niters_skip, bool might_wrap_p)
423 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
425 tree ctrl_type = rgc->type;
426 unsigned int nscalars_per_iter = rgc->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
439 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
440 these multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
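/* Numeric check of the reassociation above: with NSCALARS_TOTAL = 100,
NSCALARS_SKIP = 2 and NSCALARS_STEP = 8 both forms give 94; with
NSCALARS_TOTAL = 3 both saturate to 0.  The equality relies on
NSCALARS_SKIP < NSCALARS_STEP.  */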
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
524 else
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
535 first_limit = test_limit;
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
544 /* Provide a definition of each control in the group. */
545 tree next_ctrl = NULL_TREE;
546 tree ctrl;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
550 /* Previous controls will cover BIAS scalars. This control covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_ctrl * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
556 /* See whether the first iteration of the vector loop is known
557 to have a full control. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_ctrl));
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each control
565 and adjust the bound down by BIAS. */
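/* For example, with two 4-element controls and TEST_LIMIT = 6, control 0
keeps the limit 6 while control 1 uses MAX (6, 4) - 4 = 2, so its WHILE_ULT
test enables only the first two elements, giving 6 active scalars in
total.  */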
566 tree this_test_limit = test_limit;
567 if (i != 0)
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
577 /* Create the initial control. First include all scalars that
578 are within the loop limit. */
579 tree init_ctrl = NULL_TREE;
580 if (!first_iteration_full)
582 tree start, end;
583 if (first_limit == test_limit)
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
592 else
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this control. */
597 start = bias_tree;
598 end = first_limit;
601 init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_ctrl, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
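/* For example, with BIAS 0, NSCALARS_SKIP = 3 and a 4-element control,
vect_gen_while_not produces {0,0,0,1}, clearing the three skipped elements
from the initial control.  */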
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
614 bias_tree, nscalars_skip);
615 if (init_ctrl)
616 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
617 init_ctrl, unskipped_mask);
618 else
619 init_ctrl = unskipped_mask;
622 if (!init_ctrl)
623 /* First iteration is full. */
624 init_ctrl = build_minus_one_cst (ctrl_type);
626 /* Get the control value for the next iteration of the loop. */
627 next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
631 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
633 return next_ctrl;
636 /* Set up the iteration condition and rgroup controls for LOOP, given
637 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
638 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
639 the number of iterations of the original scalar loop that should be
640 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
641 for vect_set_loop_condition.
643 Insert the branch-back condition before LOOP_COND_GSI and return the
644 final gcond. */
646 static gcond *
647 vect_set_loop_condition_partial_vectors (class loop *loop,
648 loop_vec_info loop_vinfo, tree niters,
649 tree final_iv, bool niters_maybe_zero,
650 gimple_stmt_iterator loop_cond_gsi)
652 gimple_seq preheader_seq = NULL;
653 gimple_seq header_seq = NULL;
655 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
656 unsigned int compare_precision = TYPE_PRECISION (compare_type);
657 tree orig_niters = niters;
659 /* Type of the initial value of NITERS. */
660 tree ni_actual_type = TREE_TYPE (niters);
661 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
662 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
664 /* Convert NITERS to the same size as the compare. */
665 if (compare_precision > ni_actual_precision
666 && niters_maybe_zero)
668 /* We know that there is always at least one iteration, so if the
669 count is zero then it must have wrapped. Cope with this by
670 subtracting 1 before the conversion and adding 1 to the result. */
671 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
672 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
673 niters, build_minus_one_cst (ni_actual_type));
674 niters = gimple_convert (&preheader_seq, compare_type, niters);
675 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
676 niters, build_one_cst (compare_type));
678 else
679 niters = gimple_convert (&preheader_seq, compare_type, niters);
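/* For example, if NITERS is an 8-bit value and the scalar loop executed 256
times, NITERS reads as 0: subtracting 1 gives 255, widening to the compare
type keeps 255, and adding 1 back yields the intended 256.  */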
681 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
683 /* Iterate over all the rgroups and fill in their controls. We could use
684 the first control from any rgroup for the loop condition; here we
685 arbitrarily pick the last. */
686 tree test_ctrl = NULL_TREE;
687 rgroup_controls *rgc;
688 unsigned int i;
689 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
690 FOR_EACH_VEC_ELT (*masks, i, rgc)
691 if (!rgc->controls.is_empty ())
693 /* First try using permutes. This adds a single vector
694 instruction to the loop for each mask, but needs no extra
695 loop invariants or IVs. */
696 unsigned int nmasks = i + 1;
697 if ((nmasks & 1) == 0)
699 rgroup_controls *half_rgc = &(*masks)[nmasks / 2 - 1];
700 if (!half_rgc->controls.is_empty ()
701 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
702 continue;
705 /* See whether zero-based IV would ever generate all-false masks
706 before wrapping around. */
707 bool might_wrap_p
708 = (iv_limit == -1
709 || (wi::min_precision (iv_limit * rgc->max_nscalars_per_iter,
710 UNSIGNED)
711 > compare_precision));
713 /* Set up all controls for this group. */
714 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
715 &preheader_seq,
716 loop_cond_gsi, rgc,
717 niters, niters_skip,
718 might_wrap_p);
721 /* Emit all accumulated statements. */
722 add_preheader_seq (loop, preheader_seq);
723 add_header_seq (loop, header_seq);
725 /* Get a boolean result that tells us whether to iterate. */
726 edge exit_edge = single_exit (loop);
727 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
728 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
729 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
730 NULL_TREE, NULL_TREE);
731 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
733 /* The loop iterates (NITERS - 1) / VF + 1 times.
734 Subtract one from this to get the latch count. */
735 tree step = build_int_cst (compare_type,
736 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
737 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
738 build_minus_one_cst (compare_type));
739 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
740 niters_minus_one, step);
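/* For example, NITERS = 10 and VF = 4 give nb_iterations = (10 - 1) / 4 = 2
latch iterations, i.e. the vector body executes 3 times.  */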
742 if (final_iv)
744 gassign *assign = gimple_build_assign (final_iv, orig_niters);
745 gsi_insert_on_edge_immediate (single_exit (loop), assign);
748 return cond_stmt;
751 /* Like vect_set_loop_condition, but handle the case in which the vector
752 loop handles exactly VF scalars per iteration. */
754 static gcond *
755 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
756 tree final_iv, bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
776 /* In this case we can use a simple 0-based IV:
778 A:
779 x = 0;
780 do
781 {
782 ...
783 x += 1;
784 }
785 while (x < NITERS); */
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
790 else
792 /* The following works for all values of NITERS except 0:
794 B:
795 x = 0;
796 do
797 {
798 ...
799 x += STEP;
800 }
801 while (x <= NITERS - STEP);
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
816 C:
817 x = -1;
818 do
819 {
820 ...
821 x += STEP;
822 }
823 while (x < NITERS - STEP);
825 In both cases the loop limit is NITERS - STEP. */
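/* For example, in case B with NITERS = 12 and STEP = 4 the limit is 8 and x
takes the values 4, 8, 12 after the increments, so the body runs three times
and the loop exits once x exceeds 8.  Case C behaves the same except that x
starts at -1 and is tested with '<'.  */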
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
834 if (niters_maybe_zero)
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
840 else
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
874 if (final_iv)
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
881 return cond_stmt;
884 /* If we're using fully-masked loops, make LOOP iterate:
886 N == (NITERS - 1) / STEP + 1
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
892 If we're not using fully-masked loops, make LOOP iterate:
894 N == (NITERS - STEP) / STEP + 1
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
915 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
917 niters, final_iv,
918 niters_maybe_zero,
919 loop_cond_gsi);
920 else
921 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
922 niters_maybe_zero,
923 loop_cond_gsi);
925 /* Remove old loop exit test. */
926 stmt_vec_info orig_cond_info;
927 if (loop_vinfo
928 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
929 loop_vinfo->remove_stmt (orig_cond_info);
930 else
931 gsi_remove (&loop_cond_gsi, true);
933 if (dump_enabled_p ())
934 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
935 cond_stmt);
938 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
939 For all PHI arguments in FROM->dest and TO->dest from those
940 edges ensure that each TO->dest PHI argument has its current_def
941 set to the current_def of the corresponding FROM->dest argument. */
943 static void
944 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
946 gimple_stmt_iterator gsi_from, gsi_to;
948 for (gsi_from = gsi_start_phis (from->dest),
949 gsi_to = gsi_start_phis (to->dest);
950 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
952 gimple *from_phi = gsi_stmt (gsi_from);
953 gimple *to_phi = gsi_stmt (gsi_to);
954 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
955 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
956 if (virtual_operand_p (from_arg))
958 gsi_next (&gsi_from);
959 continue;
961 if (virtual_operand_p (to_arg))
963 gsi_next (&gsi_to);
964 continue;
966 if (TREE_CODE (from_arg) != SSA_NAME)
967 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
968 else if (TREE_CODE (to_arg) == SSA_NAME
969 && from_arg != to_arg)
971 if (get_current_def (to_arg) == NULL_TREE)
973 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
974 TREE_TYPE (get_current_def
975 (from_arg))));
976 set_current_def (to_arg, get_current_def (from_arg));
979 gsi_next (&gsi_from);
980 gsi_next (&gsi_to);
983 gphi *from_phi = get_virtual_phi (from->dest);
984 gphi *to_phi = get_virtual_phi (to->dest);
985 if (from_phi)
986 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
987 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
991 /* Given LOOP this function generates a new copy of it and puts it
992 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
993 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
994 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
995 entry or exit of LOOP. */
997 class loop *
998 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
999 class loop *scalar_loop, edge e)
1001 class loop *new_loop;
1002 basic_block *new_bbs, *bbs, *pbbs;
1003 bool at_exit;
1004 bool was_imm_dom;
1005 basic_block exit_dest;
1006 edge exit, new_exit;
1007 bool duplicate_outer_loop = false;
1009 exit = single_exit (loop);
1010 at_exit = (e == exit);
1011 if (!at_exit && e != loop_preheader_edge (loop))
1012 return NULL;
1014 if (scalar_loop == NULL)
1015 scalar_loop = loop;
1017 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1018 pbbs = bbs + 1;
1019 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1020 /* Allow duplication of outer loops. */
1021 if (scalar_loop->inner)
1022 duplicate_outer_loop = true;
1023 /* Check whether duplication is possible. */
1024 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1026 free (bbs);
1027 return NULL;
1030 /* Generate new loop structure. */
1031 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1032 duplicate_subloops (scalar_loop, new_loop);
1034 exit_dest = exit->dest;
1035 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1036 exit_dest) == loop->header ?
1037 true : false);
1039 /* Also copy the pre-header, this avoids jumping through hoops to
1040 duplicate the loop entry PHI arguments. Create an empty
1041 pre-header unconditionally for this. */
1042 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1043 edge entry_e = single_pred_edge (preheader);
1044 bbs[0] = preheader;
1045 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1047 exit = single_exit (scalar_loop);
1048 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1049 &exit, 1, &new_exit, NULL,
1050 at_exit ? loop->latch : e->src, true);
1051 exit = single_exit (loop);
1052 basic_block new_preheader = new_bbs[0];
1054 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1056 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1057 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1058 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1060 if (scalar_loop != loop)
1062 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1063 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1064 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1065 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1066 header) to have current_def set, so copy them over. */
1067 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1068 exit);
1069 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch, 0),
1071 EDGE_SUCC (loop->latch, 0));
1074 if (at_exit) /* Add the loop copy at exit. */
1076 if (scalar_loop != loop)
1078 gphi_iterator gsi;
1079 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1081 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1082 gsi_next (&gsi))
1084 gphi *phi = gsi.phi ();
1085 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1086 location_t orig_locus
1087 = gimple_phi_arg_location_from_edge (phi, e);
1089 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1092 redirect_edge_and_branch_force (e, new_preheader);
1093 flush_pending_stmts (e);
1094 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1095 if (was_imm_dom || duplicate_outer_loop)
1096 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1098 /* And remove the non-necessary forwarder again. Keep the other
1099 one so we have a proper pre-header for the loop at the exit edge. */
1100 redirect_edge_pred (single_succ_edge (preheader),
1101 single_pred (preheader));
1102 delete_basic_block (preheader);
1103 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104 loop_preheader_edge (scalar_loop)->src);
1106 else /* Add the copy at entry. */
1108 if (scalar_loop != loop)
1110 /* Remove the non-necessary forwarder of scalar_loop again. */
1111 redirect_edge_pred (single_succ_edge (preheader),
1112 single_pred (preheader));
1113 delete_basic_block (preheader);
1114 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1115 loop_preheader_edge (scalar_loop)->src);
1116 preheader = split_edge (loop_preheader_edge (loop));
1117 entry_e = single_pred_edge (preheader);
1120 redirect_edge_and_branch_force (entry_e, new_preheader);
1121 flush_pending_stmts (entry_e);
1122 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1124 redirect_edge_and_branch_force (new_exit, preheader);
1125 flush_pending_stmts (new_exit);
1126 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1128 /* And remove the non-necessary forwarder again. Keep the other
1129 one so we have a proper pre-header for the loop at the exit edge. */
1130 redirect_edge_pred (single_succ_edge (new_preheader),
1131 single_pred (new_preheader));
1132 delete_basic_block (new_preheader);
1133 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1134 loop_preheader_edge (new_loop)->src);
1137 if (scalar_loop != loop)
1139 /* Update new_loop->header PHIs, so that on the preheader
1140 edge they are the ones from loop rather than scalar_loop. */
1141 gphi_iterator gsi_orig, gsi_new;
1142 edge orig_e = loop_preheader_edge (loop);
1143 edge new_e = loop_preheader_edge (new_loop);
1145 for (gsi_orig = gsi_start_phis (loop->header),
1146 gsi_new = gsi_start_phis (new_loop->header);
1147 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1148 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1150 gphi *orig_phi = gsi_orig.phi ();
1151 gphi *new_phi = gsi_new.phi ();
1152 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1153 location_t orig_locus
1154 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1156 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1160 free (new_bbs);
1161 free (bbs);
1163 checking_verify_dominators (CDI_DOMINATORS);
1165 return new_loop;
1169 /* Given the condition expression COND, put it as the last statement of
1170 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1171 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1172 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1173 new edge as irreducible if IRREDUCIBLE_P is true. */
1175 static edge
1176 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1177 basic_block guard_to, basic_block dom_bb,
1178 profile_probability probability, bool irreducible_p)
1180 gimple_stmt_iterator gsi;
1181 edge new_e, enter_e;
1182 gcond *cond_stmt;
1183 gimple_seq gimplify_stmt_list = NULL;
1185 enter_e = EDGE_SUCC (guard_bb, 0);
1186 enter_e->flags &= ~EDGE_FALLTHRU;
1187 enter_e->flags |= EDGE_FALSE_VALUE;
1188 gsi = gsi_last_bb (guard_bb);
1190 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1191 NULL_TREE);
1192 if (gimplify_stmt_list)
1193 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1195 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1196 gsi = gsi_last_bb (guard_bb);
1197 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1199 /* Add new edge to connect guard block to the merge/loop-exit block. */
1200 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1202 new_e->probability = probability;
1203 if (irreducible_p)
1204 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1206 enter_e->probability = probability.invert ();
1207 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1209 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1210 if (enter_e->dest->loop_father->header == enter_e->dest)
1211 split_edge (enter_e);
1213 return new_e;
1217 /* This function verifies that the following restrictions apply to LOOP:
1218 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1219 for an innermost loop, or of 5 basic blocks for an outer loop.
1220 (2) it is single entry, single exit
1221 (3) its exit condition is the last stmt in the header
1222 (4) E is the entry/exit edge of LOOP. */
1225 bool
1226 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1228 edge exit_e = single_exit (loop);
1229 edge entry_e = loop_preheader_edge (loop);
1230 gcond *orig_cond = get_loop_exit_condition (loop);
1231 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1232 unsigned int num_bb = loop->inner? 5 : 2;
1234 /* All loops have an outer scope; the only case loop->outer is NULL is for
1235 the function itself. */
1236 if (!loop_outer (loop)
1237 || loop->num_nodes != num_bb
1238 || !empty_block_p (loop->latch)
1239 || !single_exit (loop)
1240 /* Verify that new loop exit condition can be trivially modified. */
1241 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1242 || (e != exit_e && e != entry_e))
1243 return false;
1245 return true;
1248 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1249 in the exit bb and rename all the uses after the loop. This simplifies
1250 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1251 (but normally loop closed SSA form doesn't require virtual PHIs to be
1252 in the same form). Doing this early simplifies the checking what
1253 uses should be renamed.
1255 If we create a new phi after the loop, return the definition that
1256 applies on entry to the loop, otherwise return null. */
1258 static tree
1259 create_lcssa_for_virtual_phi (class loop *loop)
1261 gphi_iterator gsi;
1262 edge exit_e = single_exit (loop);
1264 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1265 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1267 gphi *phi = gsi.phi ();
1268 for (gsi = gsi_start_phis (exit_e->dest);
1269 !gsi_end_p (gsi); gsi_next (&gsi))
1270 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1271 break;
1272 if (gsi_end_p (gsi))
1274 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1275 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1276 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1277 imm_use_iterator imm_iter;
1278 gimple *stmt;
1279 use_operand_p use_p;
1281 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1282 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1283 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1284 gimple_phi_set_result (new_phi, new_vop);
1285 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1286 if (stmt != new_phi
1287 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1288 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1289 SET_USE (use_p, new_vop);
1291 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1293 break;
1295 return NULL_TREE;
1298 /* Function vect_get_loop_location.
1300 Extract the location of the loop in the source code.
1301 If the loop is not well formed for vectorization, an estimated
1302 location is calculated.
1303 Return the loop location on success and an empty location if not. */
1305 dump_user_location_t
1306 find_loop_location (class loop *loop)
1308 gimple *stmt = NULL;
1309 basic_block bb;
1310 gimple_stmt_iterator si;
1312 if (!loop)
1313 return dump_user_location_t ();
1315 stmt = get_loop_exit_condition (loop);
1317 if (stmt
1318 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1319 return stmt;
1321 /* If we got here the loop is probably not "well formed",
1322 try to estimate the loop location */
1324 if (!loop->header)
1325 return dump_user_location_t ();
1327 bb = loop->header;
1329 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1331 stmt = gsi_stmt (si);
1332 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1333 return stmt;
1336 return dump_user_location_t ();
1339 /* Return true if the phi described by STMT_INFO defines an IV of the
1340 loop to be vectorized. */
1342 static bool
1343 iv_phi_p (stmt_vec_info stmt_info)
1345 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1346 if (virtual_operand_p (PHI_RESULT (phi)))
1347 return false;
1349 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1350 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1351 return false;
1353 return true;
1356 /* Function vect_can_advance_ivs_p
1358 In case the number of iterations that LOOP iterates is unknown at compile
1359 time, an epilog loop will be generated, and the loop induction variables
1360 (IVs) will be "advanced" to the value they are supposed to take just before
1361 the epilog loop. Here we check that the access function of the loop IVs
1362 and the expression that represents the loop bound are simple enough.
1363 These restrictions will be relaxed in the future. */
1365 bool
1366 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1368 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1369 basic_block bb = loop->header;
1370 gphi_iterator gsi;
1372 /* Analyze phi functions of the loop header. */
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1376 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1378 tree evolution_part;
1380 gphi *phi = gsi.phi ();
1381 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1382 if (dump_enabled_p ())
1383 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1384 phi_info->stmt);
1386 /* Skip virtual phi's. The data dependences that are associated with
1387 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1389 Skip reduction phis. */
1390 if (!iv_phi_p (phi_info))
1392 if (dump_enabled_p ())
1393 dump_printf_loc (MSG_NOTE, vect_location,
1394 "reduc or virtual phi. skip.\n");
1395 continue;
1398 /* Analyze the evolution function. */
1400 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1401 if (evolution_part == NULL_TREE)
1403 if (dump_enabled_p ())
1404 dump_printf (MSG_MISSED_OPTIMIZATION,
1405 "No access function or evolution.\n");
1406 return false;
1409 /* FORNOW: We do not transform initial conditions of IVs
1410 whose evolution functions are not invariant in the loop. */
1412 if (!expr_invariant_in_loop_p (loop, evolution_part))
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1416 "evolution not invariant in loop.\n");
1417 return false;
1420 /* FORNOW: We do not transform initial conditions of IVs
1421 whose evolution functions are a polynomial of degree >= 2. */
1423 if (tree_is_chrec (evolution_part))
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1427 "evolution is chrec.\n");
1428 return false;
1432 return true;
1436 /* Function vect_update_ivs_after_vectorizer.
1438 "Advance" the induction variables of LOOP to the value they should take
1439 after the execution of LOOP. This is currently necessary because the
1440 vectorizer does not handle induction variables that are used after the
1441 loop. Such a situation occurs when the last iterations of LOOP are
1442 peeled, because:
1443 1. We introduced new uses after LOOP for IVs that were not originally used
1444 after LOOP: the IVs of LOOP are now used by an epilog loop.
1445 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1446 times, whereas the loop IVs should be bumped N times.
1448 Input:
1449 - LOOP - a loop that is going to be vectorized. The last few iterations
1450 of LOOP were peeled.
1451 - NITERS - the number of iterations that LOOP executes (before it is
1452 vectorized). i.e, the number of times the ivs should be bumped.
1453 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1454 coming out from LOOP on which there are uses of the LOOP ivs
1455 (this is the path from LOOP->exit to epilog_loop->preheader).
1457 The new definitions of the ivs are placed in LOOP->exit.
1458 The phi args associated with the edge UPDATE_E in the bb
1459 UPDATE_E->dest are updated accordingly.
1461 Assumption 1: Like the rest of the vectorizer, this function assumes
1462 a single loop exit that has a single predecessor.
1464 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1465 organized in the same order.
1467 Assumption 3: The access function of the ivs is simple enough (see
1468 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1470 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1471 coming out of LOOP on which the ivs of LOOP are used (this is the path
1472 that leads to the epilog loop; other paths skip the epilog loop). This
1473 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1474 needs to have its phis updated. */
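/* For example, an IV with initial value 0 and step 2 in a loop with
NITERS = 10 is advanced to 0 + 10 * 2 = 20; that value is used for the
corresponding PHI argument on UPDATE_E.  */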
1477 static void
1478 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1479 tree niters, edge update_e)
1481 gphi_iterator gsi, gsi1;
1482 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1483 basic_block update_bb = update_e->dest;
1484 basic_block exit_bb = single_exit (loop)->dest;
1486 /* Make sure there exists a single-predecessor exit bb: */
1487 gcc_assert (single_pred_p (exit_bb));
1488 gcc_assert (single_succ_edge (exit_bb) == update_e);
1490 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1491 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1492 gsi_next (&gsi), gsi_next (&gsi1))
1494 tree init_expr;
1495 tree step_expr, off;
1496 tree type;
1497 tree var, ni, ni_name;
1498 gimple_stmt_iterator last_gsi;
1500 gphi *phi = gsi.phi ();
1501 gphi *phi1 = gsi1.phi ();
1502 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1503 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_NOTE, vect_location,
1505 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1507 /* Skip reduction and virtual phis. */
1508 if (!iv_phi_p (phi_info))
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "reduc or virtual phi. skip.\n");
1513 continue;
1516 type = TREE_TYPE (gimple_phi_result (phi));
1517 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1518 step_expr = unshare_expr (step_expr);
1520 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1521 of degree >= 2 or exponential. */
1522 gcc_assert (!tree_is_chrec (step_expr));
1524 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1526 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1527 fold_convert (TREE_TYPE (step_expr), niters),
1528 step_expr);
1529 if (POINTER_TYPE_P (type))
1530 ni = fold_build_pointer_plus (init_expr, off);
1531 else
1532 ni = fold_build2 (PLUS_EXPR, type,
1533 init_expr, fold_convert (type, off));
1535 var = create_tmp_var (type, "tmp");
1537 last_gsi = gsi_last_bb (exit_bb);
1538 gimple_seq new_stmts = NULL;
1539 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1540 /* Exit_bb shouldn't be empty. */
1541 if (!gsi_end_p (last_gsi))
1542 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1543 else
1544 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1546 /* Fix phi expressions in the successor bb. */
1547 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1551 /* Return a gimple value containing the misalignment (measured in vector
1552 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1553 it is away from a perfectly aligned address. Add any new statements
1554 to SEQ. */
1556 static tree
1557 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1559 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1560 stmt_vec_info stmt_info = dr_info->stmt;
1561 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1563 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1564 unsigned HOST_WIDE_INT target_align_c;
1565 tree target_align_minus_1;
1567 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1568 size_zero_node) < 0;
1569 tree offset = (negative
1570 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1571 : size_zero_node);
1572 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1573 stmt_info, seq,
1574 offset);
1575 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1576 if (target_align.is_constant (&target_align_c))
1577 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1578 else
1580 tree vla = build_int_cst (type, target_align);
1581 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1582 fold_build2 (MINUS_EXPR, type,
1583 build_int_cst (type, 0), vla));
1584 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1585 build_int_cst (type, 1));
1588 HOST_WIDE_INT elem_size
1589 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1590 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1592 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1593 tree int_start_addr = fold_convert (type, start_addr);
1594 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1595 target_align_minus_1);
1597 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1598 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1599 elem_size_log);
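/* For example, an address ending in 0x1c with a 32-byte target alignment
and 4-byte elements gives misalign_in_bytes = 0x1c & 31 = 28 and
misalign_in_elems = 28 >> 2 = 7.  */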
1601 return misalign_in_elems;
1604 /* Function vect_gen_prolog_loop_niters
1606 Generate the number of iterations which should be peeled as prolog for the
1607 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1608 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1609 As a result, after the execution of this loop, the data reference DR will
1610 refer to an aligned location. The following computation is generated:
1612 If the misalignment of DR is known at compile time:
1613 addr_mis = int mis = DR_MISALIGNMENT (dr);
1614 Else, compute address misalignment in bytes:
1615 addr_mis = addr & (target_align - 1)
1617 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1619 (elem_size = element type size; an element is the scalar element whose type
1620 is the inner type of the vectype)
1622 The computations will be emitted at the end of BB. We also compute and
1623 store upper bound (included) of the result in BOUND.
1625 When the step of the data-ref in the loop is not 1 (as in interleaved data
1626 and SLP), the number of iterations of the prolog must be divided by the step
1627 (which is equal to the size of interleaved group).
1629 The above formulas assume that VF == number of elements in the vector. This
1630 may not hold when there are multiple types in the loop.
1631 In this case, for some data-references in the loop the VF does not represent
1632 the number of elements that fit in the vector. Therefore, instead of VF we
1633 use TYPE_VECTOR_SUBPARTS. */
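/* For example, assuming a 16-byte target alignment, VF = 4, 4-byte elements
and a step of 1, a DR that starts 8 bytes past an aligned address has
addr_mis/elem_size = 2 and prolog_niters = (4 - 2) & 3 = 2; peeling two
scalar iterations brings DR to an aligned address.  An already-aligned DR
gives 0.  */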
1635 static tree
1636 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1637 basic_block bb, int *bound)
1639 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1640 tree var;
1641 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1642 gimple_seq stmts = NULL, new_stmts = NULL;
1643 tree iters, iters_name;
1644 stmt_vec_info stmt_info = dr_info->stmt;
1645 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1646 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1648 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1650 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_NOTE, vect_location,
1654 "known peeling = %d.\n", npeel);
1656 iters = build_int_cst (niters_type, npeel);
1657 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1659 else
1661 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1662 tree type = TREE_TYPE (misalign_in_elems);
1663 HOST_WIDE_INT elem_size
1664 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1665 /* We only do prolog peeling if the target alignment is known at compile
1666 time. */
1667 poly_uint64 align_in_elems =
1668 exact_div (target_align, elem_size);
1669 tree align_in_elems_minus_1 =
1670 build_int_cst (type, align_in_elems - 1);
1671 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1673 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1674 & (align_in_elems - 1)). */
1675 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1676 size_zero_node) < 0;
1677 if (negative)
1678 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1679 align_in_elems_tree);
1680 else
1681 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1682 misalign_in_elems);
1683 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1684 iters = fold_convert (niters_type, iters);
1685 unsigned HOST_WIDE_INT align_in_elems_c;
1686 if (align_in_elems.is_constant (&align_in_elems_c))
1687 *bound = align_in_elems_c - 1;
1688 else
1689 *bound = -1;
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_NOTE, vect_location,
1694 "niters for prolog loop: %T\n", iters);
1696 var = create_tmp_var (niters_type, "prolog_loop_niters");
1697 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1699 if (new_stmts)
1700 gimple_seq_add_seq (&stmts, new_stmts);
1701 if (stmts)
1703 gcc_assert (single_succ_p (bb));
1704 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1705 if (gsi_end_p (gsi))
1706 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1707 else
1708 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1710 return iters_name;
1714 /* Function vect_update_init_of_dr
1716 If CODE is PLUS, the vector loop starts NITERS iterations after the
1717 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1718 iterations before the scalar one (using masking to skip inactive
1719 elements). This function updates the information recorded in DR to
1720 account for the difference. Specifically, it updates the OFFSET
1721 field of DR_INFO. */
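/* For example (illustrative values only): if NITERS is 2 and DR_STEP is
   4 bytes, a PLUS CODE advances the recorded offset by 2 * 4 = 8 bytes,
   while a MINUS CODE moves it back by the same amount.  */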
1723 static void
1724 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1726 struct data_reference *dr = dr_info->dr;
1727 tree offset = dr_info->offset;
1728 if (!offset)
1729 offset = build_zero_cst (sizetype);
1731 niters = fold_build2 (MULT_EXPR, sizetype,
1732 fold_convert (sizetype, niters),
1733 fold_convert (sizetype, DR_STEP (dr)));
1734 offset = fold_build2 (code, sizetype,
1735 fold_convert (sizetype, offset), niters);
1736 dr_info->offset = offset;
1740 /* Function vect_update_inits_of_drs
1742 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1743 CODE and NITERS are as for vect_update_init_of_dr. */
1745 void
1746 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1747 tree_code code)
1749 unsigned int i;
1750 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1751 struct data_reference *dr;
1753 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1755 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1756 here, but since we might use these niters to update the epilogues niters
1757 and data references we can't insert them here as this definition might not
1758 always dominate its uses. */
1759 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1760 niters = fold_convert (sizetype, niters);
1762 FOR_EACH_VEC_ELT (datarefs, i, dr)
1764 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1765 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1766 vect_update_init_of_dr (dr_info, niters, code);
1770 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1771 by masking. This involves calculating the number of iterations to
1772 be peeled and then aligning all memory references appropriately. */
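/* For example (illustrative values only): with VF = 4 and a known
   peeling-for-alignment of 3, the misalignment is 4 - 3 = 1 element, so
   LOOP_VINFO_MASK_SKIP_NITERS is set to 1, the data references are moved
   back by one iteration, and the first loop mask disables that single
   inactive element.  */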
1774 void
1775 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1777 tree misalign_in_elems;
1778 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1780 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1782 /* From the information recorded in LOOP_VINFO get the number of iterations
1783 that need to be skipped via masking. */
1784 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1786 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1787 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1788 misalign_in_elems = build_int_cst (type, misalign);
1790 else
1792 gimple_seq seq1 = NULL, seq2 = NULL;
1793 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1794 misalign_in_elems = fold_convert (type, misalign_in_elems);
1795 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1796 &seq2, true, NULL_TREE);
1797 gimple_seq_add_seq (&seq1, seq2);
1798 if (seq1)
1800 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1801 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1802 gcc_assert (!new_bb);
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "misalignment for fully-masked loop: %T\n",
1809 misalign_in_elems);
1811 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1813 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1816 /* This function builds ni_name = number of iterations. Statements
1817 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1818 it to TRUE if new ssa_var is generated. */
1820 tree
1821 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1823 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1824 if (TREE_CODE (ni) == INTEGER_CST)
1825 return ni;
1826 else
1828 tree ni_name, var;
1829 gimple_seq stmts = NULL;
1830 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1832 var = create_tmp_var (TREE_TYPE (ni), "niters");
1833 ni_name = force_gimple_operand (ni, &stmts, false, var);
1834 if (stmts)
1836 gsi_insert_seq_on_edge_immediate (pe, stmts);
1837 if (new_var_p != NULL)
1838 *new_var_p = true;
1841 return ni_name;
1845 /* Calculate the number of iterations above which the vectorized loop will be
1846 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1847 of prolog loop. If it's integer const, the integer number is also passed
1848 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1849 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1850 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1851 threshold below which the scalar (rather than vectorized) loop will be
1852 executed. This function stores the upper bound (inclusive) of the result
1853 in BOUND_SCALAR. */
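/* For example (illustrative values only): with CHECK_PROFITABILITY, a
   constant prolog of INT_NITERS_PROLOG = 2, BOUND_EPILOG = 4 and a
   threshold TH = 10, the scalar loop runs at most
   max (2 + 4, 10 - 1) = 9 iterations, so 9 is returned and stored in
   *BOUND_SCALAR.  */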
1855 static tree
1856 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1857 int bound_prolog, poly_int64 bound_epilog, int th,
1858 poly_uint64 *bound_scalar,
1859 bool check_profitability)
1861 tree type = TREE_TYPE (niters_prolog);
1862 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1863 build_int_cst (type, bound_epilog));
1865 *bound_scalar = bound_prolog + bound_epilog;
1866 if (check_profitability)
1868 /* TH indicates the minimum niters of vectorized loop, while we
1869 compute the maximum niters of scalar loop. */
1870 th--;
1871 /* Peeling for constant times. */
1872 if (int_niters_prolog >= 0)
1874 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1875 return build_int_cst (type, *bound_scalar);
1877 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1878 and BOUND_EPILOG are inclusive upper bounds. */
1879 if (known_ge (th, bound_prolog + bound_epilog))
1881 *bound_scalar = th;
1882 return build_int_cst (type, th);
1884 /* Need to do runtime comparison. */
1885 else if (maybe_gt (th, bound_epilog))
1887 *bound_scalar = upper_bound (*bound_scalar, th);
1888 return fold_build2 (MAX_EXPR, type,
1889 build_int_cst (type, th), niters);
1892 return niters;
1895 /* NITERS is the number of times that the original scalar loop executes
1896 after peeling. Work out the maximum number of iterations N that can
1897 be handled by the vectorized form of the loop and then either:
1899 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1901 niters_vector = N
1903 b) set *STEP_VECTOR_PTR to one and generate:
1905 niters_vector = N / vf
1907 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1908 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1909 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
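/* For example (illustrative values only): with NITERS = 17, a constant
   VF = 4, no partial vectors, no peeling for gaps and NITERS_NO_OVERFLOW
   true, case (b) applies: niters_vector = 17 >> 2 = 4 and step_vector = 1.
   With a non-constant VF or partial vectors, case (a) is used instead:
   niters_vector = 17 and step_vector = VF.  */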
1911 void
1912 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1913 tree *niters_vector_ptr, tree *step_vector_ptr,
1914 bool niters_no_overflow)
1916 tree ni_minus_gap, var;
1917 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1918 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1919 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1920 tree log_vf = NULL_TREE;
1922 /* If epilogue loop is required because of data accesses with gaps, we
1923 subtract one iteration from the total number of iterations here for
1924 correct calculation of RATIO. */
1925 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1927 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1928 build_one_cst (type));
1929 if (!is_gimple_val (ni_minus_gap))
1931 var = create_tmp_var (type, "ni_gap");
1932 gimple *stmts = NULL;
1933 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1934 true, var);
1935 gsi_insert_seq_on_edge_immediate (pe, stmts);
1938 else
1939 ni_minus_gap = niters;
1941 unsigned HOST_WIDE_INT const_vf;
1942 if (vf.is_constant (&const_vf)
1943 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1945 /* Create: niters >> log2(vf) */
1946 /* If it's known that niters == number of latch executions + 1 doesn't
1947 overflow, we can generate niters >> log2(vf); otherwise we generate
1948 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1949 will be at least one. */
1950 log_vf = build_int_cst (type, exact_log2 (const_vf));
1951 if (niters_no_overflow)
1952 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1953 else
1954 niters_vector
1955 = fold_build2 (PLUS_EXPR, type,
1956 fold_build2 (RSHIFT_EXPR, type,
1957 fold_build2 (MINUS_EXPR, type,
1958 ni_minus_gap,
1959 build_int_cst (type, vf)),
1960 log_vf),
1961 build_int_cst (type, 1));
1962 step_vector = build_one_cst (type);
1964 else
1966 niters_vector = ni_minus_gap;
1967 step_vector = build_int_cst (type, vf);
1970 if (!is_gimple_val (niters_vector))
1972 var = create_tmp_var (type, "bnd");
1973 gimple_seq stmts = NULL;
1974 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1975 gsi_insert_seq_on_edge_immediate (pe, stmts);
1976 /* The peeling algorithm guarantees that the vector loop bound is at least ONE,
1977 so we set range information to make the niters analyzer's life easier. */
1978 if (stmts != NULL && log_vf)
1979 set_range_info (niters_vector, VR_RANGE,
1980 wi::to_wide (build_int_cst (type, 1)),
1981 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1982 TYPE_MAX_VALUE (type),
1983 log_vf)));
1985 *niters_vector_ptr = niters_vector;
1986 *step_vector_ptr = step_vector;
1988 return;
1991 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1992 loop specified by LOOP_VINFO after vectorization, compute the number
1993 of iterations before vectorization (niters_vector * vf) and store it
1994 to NITERS_VECTOR_MULT_VF_PTR. */
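/* For example (illustrative values only): with niters_vector = 4 and a
   constant VF = 4, the value computed here is 4 << log2 (4) = 16 scalar
   iterations covered by the vector loop.  */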
1996 static void
1997 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1998 tree niters_vector,
1999 tree *niters_vector_mult_vf_ptr)
2001 /* We should be using a step_vector of VF if VF is variable. */
2002 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2003 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2004 tree type = TREE_TYPE (niters_vector);
2005 tree log_vf = build_int_cst (type, exact_log2 (vf));
2006 basic_block exit_bb = single_exit (loop)->dest;
2008 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2009 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2010 niters_vector, log_vf);
2011 if (!is_gimple_val (niters_vector_mult_vf))
2013 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2014 gimple_seq stmts = NULL;
2015 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2016 &stmts, true, var);
2017 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2018 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2020 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2023 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2024 this function searches for the corresponding lcssa phi node in exit
2025 bb of LOOP. If it is found, return the phi result; otherwise return
2026 NULL. */
2028 static tree
2029 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2030 gphi *lcssa_phi)
2032 gphi_iterator gsi;
2033 edge e = single_exit (loop);
2035 gcc_assert (single_pred_p (e->dest));
2036 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2038 gphi *phi = gsi.phi ();
2039 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2040 PHI_ARG_DEF (lcssa_phi, 0), 0))
2041 return PHI_RESULT (phi);
2043 return NULL_TREE;
2046 /* After function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2047 from SECOND/FIRST and puts the copy at the original loop's preheader/exit
2048 edge, the two loops are arranged as below:
2050 preheader_a:
2051 first_loop:
2052 header_a:
2053 i_1 = PHI<i_0, i_2>;
2055 i_2 = i_1 + 1;
2056 if (cond_a)
2057 goto latch_a;
2058 else
2059 goto between_bb;
2060 latch_a:
2061 goto header_a;
2063 between_bb:
2064 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2066 second_loop:
2067 header_b:
2068 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2069 or with i_2 if no LCSSA phi is created
2070 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2072 i_4 = i_3 + 1;
2073 if (cond_b)
2074 goto latch_b;
2075 else
2076 goto exit_bb;
2077 latch_b:
2078 goto header_b;
2080 exit_bb:
2082 This function creates loop-closed SSA form for the first loop and updates
2083 the second loop's PHI nodes by replacing the argument on the incoming edge
2084 with the result of the newly created lcssa PHI nodes. If
2085 CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
2086 created for non-iv phis of the first loop.
2088 This function assumes the exit bb of the first loop is the preheader bb of
2089 the second loop, i.e., between_bb in the example code. With PHIs updated,
2090 the second loop will execute the remaining iterations of the first. */
2092 static void
2093 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2094 class loop *first, class loop *second,
2095 bool create_lcssa_for_iv_phis)
2097 gphi_iterator gsi_update, gsi_orig;
2098 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2100 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2101 edge second_preheader_e = loop_preheader_edge (second);
2102 basic_block between_bb = single_exit (first)->dest;
2104 gcc_assert (between_bb == second_preheader_e->src);
2105 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2106 /* Either the first loop or the second is the loop to be vectorized. */
2107 gcc_assert (loop == first || loop == second);
2109 for (gsi_orig = gsi_start_phis (first->header),
2110 gsi_update = gsi_start_phis (second->header);
2111 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2112 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2114 gphi *orig_phi = gsi_orig.phi ();
2115 gphi *update_phi = gsi_update.phi ();
2117 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2118 /* Generate lcssa PHI node for the first loop. */
2119 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2120 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2121 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2123 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2124 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2125 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2126 arg = new_res;
2129 /* Update PHI node in the second loop by replacing arg on the loop's
2130 incoming edge. */
2131 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2134 /* For epilogue peeling we have to make sure to copy all LC PHIs
2135 for correct vectorization of live stmts. */
2136 if (loop == first)
2138 basic_block orig_exit = single_exit (second)->dest;
2139 for (gsi_orig = gsi_start_phis (orig_exit);
2140 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2142 gphi *orig_phi = gsi_orig.phi ();
2143 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2144 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2145 continue;
2147 /* Already created in the above loop. */
2148 if (find_guard_arg (first, second, orig_phi))
2149 continue;
2151 tree new_res = copy_ssa_name (orig_arg);
2152 gphi *lcphi = create_phi_node (new_res, between_bb);
2153 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2158 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2159 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2160 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2161 appear like below:
2163 guard_bb:
2164 if (cond)
2165 goto merge_bb;
2166 else
2167 goto skip_loop;
2169 skip_loop:
2170 header_a:
2171 i_1 = PHI<i_0, i_2>;
2173 i_2 = i_1 + 1;
2174 if (cond_a)
2175 goto latch_a;
2176 else
2177 goto exit_a;
2178 latch_a:
2179 goto header_a;
2181 exit_a:
2182 i_5 = PHI<i_2>;
2184 merge_bb:
2185 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2187 update_loop:
2188 header_b:
2189 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2191 i_4 = i_3 + 1;
2192 if (cond_b)
2193 goto latch_b;
2194 else
2195 goto exit_bb;
2196 latch_b:
2197 goto header_b;
2199 exit_bb:
2201 This function creates PHI nodes at merge_bb and replaces the use of i_5
2202 in the update_loop's PHI node with the result of the new PHI. */
2204 static void
2205 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2206 class loop *update_loop,
2207 edge guard_edge, edge merge_edge)
2209 location_t merge_loc, guard_loc;
2210 edge orig_e = loop_preheader_edge (skip_loop);
2211 edge update_e = loop_preheader_edge (update_loop);
2212 gphi_iterator gsi_orig, gsi_update;
2214 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2215 gsi_update = gsi_start_phis (update_loop->header));
2216 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2217 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2219 gphi *orig_phi = gsi_orig.phi ();
2220 gphi *update_phi = gsi_update.phi ();
2222 /* Generate new phi node at merge bb of the guard. */
2223 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2224 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2226 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2227 args in NEW_PHI for these edges. */
2228 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2229 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2230 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2231 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2232 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2233 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2235 /* Update phi in UPDATE_PHI. */
2236 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2240 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2241 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2242 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2243 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2244 The CFG looks like:
2246 loop:
2247 header_a:
2248 i_1 = PHI<i_0, i_2>;
2250 i_2 = i_1 + 1;
2251 if (cond_a)
2252 goto latch_a;
2253 else
2254 goto exit_a;
2255 latch_a:
2256 goto header_a;
2258 exit_a:
2260 guard_bb:
2261 if (cond)
2262 goto merge_bb;
2263 else
2264 goto epilog_loop;
2266 ;; fall_through_bb
2268 epilog_loop:
2269 header_b:
2270 i_3 = PHI<i_2, i_4>;
2272 i_4 = i_3 + 1;
2273 if (cond_b)
2274 goto latch_b;
2275 else
2276 goto merge_bb;
2277 latch_b:
2278 goto header_b;
2280 merge_bb:
2281 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2283 exit_bb:
2284 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2286 For each name used outside EPILOG (i.e., for each name that has an lcssa
2287 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2288 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2289 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2290 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2291 in exit_bb will also be updated. */
2293 static void
2294 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2295 edge guard_edge, edge merge_edge)
2297 gphi_iterator gsi;
2298 basic_block merge_bb = guard_edge->dest;
2300 gcc_assert (single_succ_p (merge_bb));
2301 edge e = single_succ_edge (merge_bb);
2302 basic_block exit_bb = e->dest;
2303 gcc_assert (single_pred_p (exit_bb));
2304 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2306 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2308 gphi *update_phi = gsi.phi ();
2309 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2311 tree merge_arg = NULL_TREE;
2313 /* If the old argument is a SSA_NAME use its current_def. */
2314 if (TREE_CODE (old_arg) == SSA_NAME)
2315 merge_arg = get_current_def (old_arg);
2316 /* If it's a constant or doesn't have a current_def, just use the old
2317 argument. */
2318 if (!merge_arg)
2319 merge_arg = old_arg;
2321 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2322 /* If the var is live after loop but not a reduction, we simply
2323 use the old arg. */
2324 if (!guard_arg)
2325 guard_arg = old_arg;
2327 /* Create new phi node in MERGE_BB: */
2328 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2329 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2331 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2332 the two PHI args in merge_phi for these edges. */
2333 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2334 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2336 /* Update the original phi in exit_bb. */
2337 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2341 /* EPILOG loop is duplicated from the original loop for vectorizing,
2342 the arg of its loop closed ssa PHI needs to be updated. */
2344 static void
2345 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2347 gphi_iterator gsi;
2348 basic_block exit_bb = single_exit (epilog)->dest;
2350 gcc_assert (single_pred_p (exit_bb));
2351 edge e = EDGE_PRED (exit_bb, 0);
2352 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2353 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2356 /* Function vect_do_peeling.
2358 Input:
2359 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2361 preheader:
2362 LOOP:
2363 header_bb:
2364 loop_body
2365 if (exit_loop_cond) goto exit_bb
2366 else goto header_bb
2367 exit_bb:
2369 - NITERS: The number of iterations of the loop.
2370 - NITERSM1: The number of iterations of the loop's latch.
2371 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2372 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2373 CHECK_PROFITABILITY is true.
2374 Output:
2375 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2376 iterate after vectorization; see vect_set_loop_condition for details.
2377 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2378 should be set to the number of scalar iterations handled by the
2379 vector loop. The SSA name is only used on exit from the loop.
2381 This function peels prolog and epilog from the loop, adds guards skipping
2382 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2383 would look like:
2385 guard_bb_1:
2386 if (prefer_scalar_loop) goto merge_bb_1
2387 else goto guard_bb_2
2389 guard_bb_2:
2390 if (skip_prolog) goto merge_bb_2
2391 else goto prolog_preheader
2393 prolog_preheader:
2394 PROLOG:
2395 prolog_header_bb:
2396 prolog_body
2397 if (exit_prolog_cond) goto prolog_exit_bb
2398 else goto prolog_header_bb
2399 prolog_exit_bb:
2401 merge_bb_2:
2403 vector_preheader:
2404 VECTOR LOOP:
2405 vector_header_bb:
2406 vector_body
2407 if (exit_vector_cond) goto vector_exit_bb
2408 else goto vector_header_bb
2409 vector_exit_bb:
2411 guard_bb_3:
2412 if (skip_epilog) goto merge_bb_3
2413 else goto epilog_preheader
2415 merge_bb_1:
2417 epilog_preheader:
2418 EPILOG:
2419 epilog_header_bb:
2420 epilog_body
2421 if (exit_epilog_cond) goto merge_bb_3
2422 else goto epilog_header_bb
2424 merge_bb_3:
2426 Note this function peels the prolog and epilog, and adds the corresponding
2427 guards, only when necessary.
2428 This function returns the epilogue loop if a decision was made to vectorize
2429 it, otherwise NULL.
2431 The analysis resulting in this epilogue loop's loop_vec_info was performed
2432 in the same vect_analyze_loop call as the main loop's. At that time
2433 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2434 vectorization factors than the main loop. This list is stored in the main
2435 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2436 vectorize the epilogue loop for a lower vectorization factor, the
2437 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2438 updated and linked to the epilogue loop. This is later used to vectorize
2439 the epilogue. The reason the loop_vec_info needs updating is that it was
2440 constructed based on the original main loop, and the epilogue loop is a
2441 copy of this loop, so all links pointing to statements in the original loop
2442 need updating. Furthermore, these loop_vec_infos share the
2443 data_reference's records, which will also need to be updated.
2445 TODO: Guard for prefer_scalar_loop should be emitted along with
2446 versioning conditions if loop versioning is needed. */
2449 class loop *
2450 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2451 tree *niters_vector, tree *step_vector,
2452 tree *niters_vector_mult_vf_var, int th,
2453 bool check_profitability, bool niters_no_overflow,
2454 tree *advance)
2456 edge e, guard_e;
2457 tree type = TREE_TYPE (niters), guard_cond;
2458 basic_block guard_bb, guard_to;
2459 profile_probability prob_prolog, prob_vector, prob_epilog;
2460 int estimated_vf;
2461 int prolog_peeling = 0;
2462 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2463 /* We currently do not support prolog peeling if the target alignment is not
2464 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2465 target alignment being constant. */
2466 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2467 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2468 return NULL;
2470 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2471 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2473 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2474 poly_uint64 bound_epilog = 0;
2475 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2476 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2477 bound_epilog += vf - 1;
2478 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2479 bound_epilog += 1;
2480 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2481 poly_uint64 bound_scalar = bound_epilog;
2483 if (!prolog_peeling && !epilog_peeling)
2484 return NULL;
2486 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2487 estimated_vf = vect_vf_for_cost (loop_vinfo);
2488 if (estimated_vf == 2)
2489 estimated_vf = 3;
2490 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2491 .apply_scale (estimated_vf - 1, estimated_vf);
2493 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2494 class loop *first_loop = loop;
2495 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2497 /* We might have a queued need to update virtual SSA form. As we
2498 delete the update SSA machinery below after doing a regular
2499 incremental SSA update during loop copying make sure we don't
2500 lose that fact.
2501 ??? Needing to update virtual SSA form by renaming is unfortunate
2502 but not all of the vectorizer code inserting new loads / stores
2503 properly assigns virtual operands to those statements. */
2504 update_ssa (TODO_update_ssa_only_virtuals);
2506 create_lcssa_for_virtual_phi (loop);
2508 /* If we're vectorizing an epilogue loop, the update_ssa above will
2509 have ensured that the virtual operand is in SSA form throughout the
2510 vectorized main loop. Normally it is possible to trace the updated
2511 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2512 back to scalar-stmt vuses, meaning that the effect of the SSA update
2513 remains local to the main loop. However, there are rare cases in
2514 which the vectorized loop has vdefs even when the original scalar
2515 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2516 introduces clobbers of the temporary vector array, which in turn
2517 needs new vdefs. If the scalar loop doesn't write to memory, these
2518 new vdefs will be the only ones in the vector loop.
2520 In that case, update_ssa will have added a new virtual phi to the
2521 main loop, which previously didn't need one. Ensure that we (locally)
2522 maintain LCSSA form for the virtual operand, just as we would have
2523 done if the virtual phi had existed from the outset. This makes it
2524 easier to duplicate the scalar epilogue loop below. */
2525 tree vop_to_rename = NULL_TREE;
2526 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2528 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2529 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2532 if (MAY_HAVE_DEBUG_BIND_STMTS)
2534 gcc_assert (!adjust_vec.exists ());
2535 adjust_vec.create (32);
2537 initialize_original_copy_tables ();
2539 /* Record the anchor bb at which the guard should be placed if the scalar
2540 loop might be preferred. */
2541 basic_block anchor = loop_preheader_edge (loop)->src;
2543 /* Generate the number of iterations for the prolog loop. We do this here
2544 so that we can also get the upper bound on the number of iterations. */
2545 tree niters_prolog;
2546 int bound_prolog = 0;
2547 if (prolog_peeling)
2548 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2549 &bound_prolog);
2550 else
2551 niters_prolog = build_int_cst (type, 0);
2553 loop_vec_info epilogue_vinfo = NULL;
2554 if (vect_epilogues)
2556 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2557 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2560 tree niters_vector_mult_vf = NULL_TREE;
2561 /* Saving NITERs before the loop, as this may be changed by prologue. */
2562 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2563 edge update_e = NULL, skip_e = NULL;
2564 unsigned int lowest_vf = constant_lower_bound (vf);
2565 /* If we know the number of scalar iterations for the main loop we should
2566 check whether after the main loop there are enough iterations left over
2567 for the epilogue. */
2568 if (vect_epilogues
2569 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2570 && prolog_peeling >= 0
2571 && known_eq (vf, lowest_vf))
2573 unsigned HOST_WIDE_INT eiters
2574 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2575 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2577 eiters -= prolog_peeling;
2578 eiters
2579 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2581 unsigned int ratio;
2582 unsigned int epilogue_gaps
2583 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2584 while (!(constant_multiple_p
2585 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2586 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2587 && eiters >= lowest_vf / ratio + epilogue_gaps))
2589 delete epilogue_vinfo;
2590 epilogue_vinfo = NULL;
2591 if (loop_vinfo->epilogue_vinfos.length () == 0)
2593 vect_epilogues = false;
2594 break;
2596 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2597 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2598 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2601 /* Prolog loop may be skipped. */
2602 bool skip_prolog = (prolog_peeling != 0);
2603 /* Skip the vectorized loop and go to the epilog when there are not enough
2604 iterations to enter the vectorized loop; if so we should perform runtime
2605 checks on NITERS to decide whether to skip the current vectorized loop.
2606 When we know the number of scalar iterations, we only add such a skip
2607 when that number may be smaller than the number of iterations required
2608 to enter this loop; for this we use the upper bounds on the prolog and
2609 epilog peeling. When we don't know the number of iterations and don't
2610 require versioning it is because we have asserted that there are enough
2611 scalar iterations to enter the main loop, so this skip is not necessary.
2612 When we are versioning then we only add such a skip if we have chosen
2613 to vectorize the epilogue. */
2615 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2616 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2617 bound_prolog + bound_epilog)
2618 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2619 || vect_epilogues));
2620 /* Epilog loop must be executed if the number of iterations for epilog
2621 loop is known at compile time, otherwise we need to add a check at
2622 the end of vector loop and skip to the end of epilog loop. */
2623 bool skip_epilog = (prolog_peeling < 0
2624 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2625 || !vf.is_constant ());
2626 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2627 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2628 skip_epilog = false;
2630 if (skip_vector)
2632 split_edge (loop_preheader_edge (loop));
2634 /* Due to the order in which we peel prolog and epilog, we first
2635 propagate probability to the whole loop. The purpose is to
2636 avoid adjusting probabilities of both prolog and vector loops
2637 separately. Note in this case, the probability of epilog loop
2638 needs to be scaled back later. */
2639 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2640 if (prob_vector.initialized_p ())
2642 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2643 scale_loop_profile (loop, prob_vector, 0);
2647 dump_user_location_t loop_loc = find_loop_location (loop);
2648 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2649 if (vect_epilogues)
2650 /* Make sure to set the epilogue's scalar loop, such that we can
2651 use the original scalar loop as remaining epilogue if necessary. */
2652 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2653 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2655 if (prolog_peeling)
2657 e = loop_preheader_edge (loop);
2658 if (!slpeel_can_duplicate_loop_p (loop, e))
2660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2661 "loop can't be duplicated to preheader edge.\n");
2662 gcc_unreachable ();
2664 /* Peel prolog and put it on preheader edge of loop. */
2665 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2666 if (!prolog)
2668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2669 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2670 gcc_unreachable ();
2672 prolog->force_vectorize = false;
2673 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2674 first_loop = prolog;
2675 reset_original_copy_tables ();
2677 /* Update the number of iterations for prolog loop. */
2678 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2679 vect_set_loop_condition (prolog, NULL, niters_prolog,
2680 step_prolog, NULL_TREE, false);
2682 /* Skip the prolog loop. */
2683 if (skip_prolog)
2685 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2686 niters_prolog, build_int_cst (type, 0));
2687 guard_bb = loop_preheader_edge (prolog)->src;
2688 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2689 guard_to = split_edge (loop_preheader_edge (loop));
2690 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2691 guard_to, guard_bb,
2692 prob_prolog.invert (),
2693 irred_flag);
2694 e = EDGE_PRED (guard_to, 0);
2695 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2696 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2698 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2699 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2702 /* Update init address of DRs. */
2703 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2704 /* Update niters for vector loop. */
2705 LOOP_VINFO_NITERS (loop_vinfo)
2706 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2707 LOOP_VINFO_NITERSM1 (loop_vinfo)
2708 = fold_build2 (MINUS_EXPR, type,
2709 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2710 bool new_var_p = false;
2711 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2712 /* It's guaranteed that vector loop bound before vectorization is at
2713 least VF, so set range information for newly generated var. */
2714 if (new_var_p)
2715 set_range_info (niters, VR_RANGE,
2716 wi::to_wide (build_int_cst (type, vf)),
2717 wi::to_wide (TYPE_MAX_VALUE (type)));
2719 /* Prolog iterates at most bound_prolog times, latch iterates at
2720 most bound_prolog - 1 times. */
2721 record_niter_bound (prolog, bound_prolog - 1, false, true);
2722 delete_update_ssa ();
2723 adjust_vec_debug_stmts ();
2724 scev_reset ();
2727 if (epilog_peeling)
2729 e = single_exit (loop);
2730 if (!slpeel_can_duplicate_loop_p (loop, e))
2732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2733 "loop can't be duplicated to exit edge.\n");
2734 gcc_unreachable ();
2736 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2737 said epilog then we should use a copy of the main loop as a starting
2738 point. This loop may have already had some preliminary transformations
2739 to allow for more optimal vectorization, for example if-conversion.
2740 If we are not vectorizing the epilog then we should use the scalar loop
2741 as the transformations mentioned above make little or no sense when not
2742 vectorizing. */
2743 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2744 if (vop_to_rename)
2746 /* Vectorizing the main loop can sometimes introduce a vdef to
2747 a loop that previously didn't have one; see the comment above
2748 the definition of VOP_TO_RENAME for details. The definition
2749 D that holds on E will then be different from the definition
2750 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2751 rename VOP_TO_RENAME to D when copying the loop.
2753 The virtual operand is in LCSSA form for the main loop,
2754 and no stmt between the main loop and E needs a vdef,
2755 so we know that D is provided by a phi rather than by a
2756 vdef on a normal gimple stmt. */
2757 basic_block vdef_bb = e->src;
2758 gphi *vphi;
2759 while (!(vphi = get_virtual_phi (vdef_bb)))
2760 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2761 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2762 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2764 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2765 if (!epilog)
2767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2768 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2769 gcc_unreachable ();
2771 epilog->force_vectorize = false;
2772 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2774 /* The scalar version of the loop may be preferred. In this case, add a guard
2775 and skip to epilog. Note this only happens when the number of
2776 iterations of loop is unknown at compile time, otherwise this
2777 won't be vectorized. */
2778 if (skip_vector)
2780 /* Additional epilogue iteration is peeled if gap exists. */
2781 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2782 bound_prolog, bound_epilog,
2783 th, &bound_scalar,
2784 check_profitability);
2785 /* Build guard against NITERSM1 since NITERS may overflow. */
2786 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2787 guard_bb = anchor;
2788 guard_to = split_edge (loop_preheader_edge (epilog));
2789 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2790 guard_to, guard_bb,
2791 prob_vector.invert (),
2792 irred_flag);
2793 skip_e = guard_e;
2794 e = EDGE_PRED (guard_to, 0);
2795 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2796 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2798 /* Simply propagate profile info from guard_bb to guard_to which is
2799 a merge point of control flow. */
2800 guard_to->count = guard_bb->count;
2802 /* Scale probability of epilog loop back.
2803 FIXME: We should avoid scaling down and back up. Profile may
2804 get lost if we scale down to 0. */
2805 basic_block *bbs = get_loop_body (epilog);
2806 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2807 bbs[i]->count = bbs[i]->count.apply_scale
2808 (bbs[i]->count,
2809 bbs[i]->count.apply_probability
2810 (prob_vector));
2811 free (bbs);
2814 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2815 /* If the loop is peeled a non-zero constant number of times, niters now
2816 refers to orig_niters - prolog_peeling, which won't overflow even if
2817 orig_niters overflows. */
2818 niters_no_overflow |= (prolog_peeling > 0);
2819 vect_gen_vector_loop_niters (loop_vinfo, niters,
2820 niters_vector, step_vector,
2821 niters_no_overflow);
2822 if (!integer_onep (*step_vector))
2824 /* On exit from the loop we will have an easy way of calculating
2825 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2826 until then. */
2827 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2828 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2829 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2831 else
2832 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2833 &niters_vector_mult_vf);
2834 /* Update IVs of original loop as if they were advanced by
2835 niters_vector_mult_vf steps. */
2836 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2837 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2838 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2839 update_e);
2841 if (skip_epilog)
2843 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2844 niters, niters_vector_mult_vf);
2845 guard_bb = single_exit (loop)->dest;
2846 guard_to = split_edge (single_exit (epilog));
2847 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2848 skip_vector ? anchor : guard_bb,
2849 prob_epilog.invert (),
2850 irred_flag);
2851 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2852 single_exit (epilog));
2853 /* Only need to handle basic block before epilog loop if it's not
2854 the guard_bb, which is the case when skip_vector is true. */
2855 if (guard_bb != bb_before_epilog)
2857 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2859 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2861 scale_loop_profile (epilog, prob_epilog, 0);
2863 else
2864 slpeel_update_phi_nodes_for_lcssa (epilog);
2866 unsigned HOST_WIDE_INT bound;
2867 if (bound_scalar.is_constant (&bound))
2869 gcc_assert (bound != 0);
2870 /* -1 to convert loop iterations to latch iterations. */
2871 record_niter_bound (epilog, bound - 1, false, true);
2874 delete_update_ssa ();
2875 adjust_vec_debug_stmts ();
2876 scev_reset ();
2879 if (vect_epilogues)
2881 epilog->aux = epilogue_vinfo;
2882 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2884 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2886 /* We now must calculate the number of iterations NITERS performed by the
2887 previous loop and the number EPILOGUE_NITERS to be performed by the epilogue. */
2888 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2889 niters_prolog, niters_vector_mult_vf);
2891 /* If skip_vector, the previous loop may be skipped, so we insert a phi-node to
2892 determine whether we are coming from the previous vectorized loop
2893 using the update_e edge or the skip_vector basic block using the
2894 skip_e edge. */
2895 if (skip_vector)
2897 gcc_assert (update_e != NULL && skip_e != NULL);
2898 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2899 update_e->dest);
2900 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2901 gimple *stmt = gimple_build_assign (new_ssa, niters);
2902 gimple_stmt_iterator gsi;
2903 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2904 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2906 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2907 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2909 else
2911 gsi = gsi_last_bb (update_e->src);
2912 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2915 niters = new_ssa;
2916 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2917 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2918 UNKNOWN_LOCATION);
2919 niters = PHI_RESULT (new_phi);
2922 /* Subtract the number of iterations performed by the vectorized loop
2923 from the number of total iterations. */
2924 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2925 before_loop_niters,
2926 niters);
2928 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2929 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2930 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2931 epilogue_niters,
2932 build_one_cst (TREE_TYPE (epilogue_niters)));
2934 /* Set ADVANCE to the number of iterations performed by the previous
2935 loop and its prologue. */
2936 *advance = niters;
2938 /* Redo the peeling for niter analysis as the NITERs and alignment
2939 may have been updated to take the main loop into account. */
2940 determine_peel_for_niter (epilogue_vinfo);
2943 adjust_vec.release ();
2944 free_original_copy_tables ();
2946 return vect_epilogues ? epilog : NULL;
2949 /* Function vect_create_cond_for_niters_checks.
2951 Create a conditional expression that represents the run-time checks for
2952 loop's niter. The loop is guaranteed to terminate if the run-time
2953 checks hold.
2955 Input:
2956 COND_EXPR - input conditional expression. New conditions will be chained
2957 with logical AND operation; a NULL COND_EXPR is treated
2958 as "true".
2959 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2960 assumptions to be checked.
2962 Output:
2963 COND_EXPR - conditional expression.
2965 The returned COND_EXPR is the conditional expression to be used in the
2966 if statement that controls which version of the loop gets executed at
2967 runtime. */
2969 static void
2970 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2972 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2974 if (*cond_expr)
2975 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2976 *cond_expr, part_cond_expr);
2977 else
2978 *cond_expr = part_cond_expr;
2981 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2982 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2984 static void
2985 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2987 if (*cond_expr)
2988 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2989 *cond_expr, part_cond_expr);
2990 else
2991 *cond_expr = part_cond_expr;
2994 /* Function vect_create_cond_for_align_checks.
2996 Create a conditional expression that represents the alignment checks for
2997 all data references (array element references) whose alignment must be
2998 checked at runtime.
3000 Input:
3001 COND_EXPR - input conditional expression. New conditions will be chained
3002 with logical AND operation.
3003 LOOP_VINFO - two fields of the loop information are used.
3004 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3005 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3007 Output:
3008 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3009 expression.
3010 The returned value is the conditional expression to be used in the if
3011 statement that controls which version of the loop gets executed at runtime.
3013 The algorithm makes two assumptions:
3014 1) The number of bytes "n" in a vector is a power of 2.
3015 2) An address "a" is aligned if a%n is zero and that this
3016 test can be done as a&(n-1) == 0. For example, for 16
3017 byte vectors the test is a&0xf == 0. */
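/* For example (illustrative only): for two data references whose
   first-vector addresses are a1 and a2, and 16-byte vectors
   (LOOP_VINFO_PTR_MASK = 0xf), the condition built below is
   ((a1 | a2) & 0xf) == 0, which holds only when both addresses are
   16-byte aligned.  */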
3019 static void
3020 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3021 tree *cond_expr,
3022 gimple_seq *cond_expr_stmt_list)
3024 vec<stmt_vec_info> may_misalign_stmts
3025 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3026 stmt_vec_info stmt_info;
3027 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3028 tree mask_cst;
3029 unsigned int i;
3030 tree int_ptrsize_type;
3031 char tmp_name[20];
3032 tree or_tmp_name = NULL_TREE;
3033 tree and_tmp_name;
3034 gimple *and_stmt;
3035 tree ptrsize_zero;
3036 tree part_cond_expr;
3038 /* Check that mask is one less than a power of 2, i.e., mask is
3039 all zeros followed by all ones. */
3040 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3042 int_ptrsize_type = signed_type_for (ptr_type_node);
3044 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3045 of the first vector of the i'th data reference. */
3047 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3049 gimple_seq new_stmt_list = NULL;
3050 tree addr_base;
3051 tree addr_tmp_name;
3052 tree new_or_tmp_name;
3053 gimple *addr_stmt, *or_stmt;
3054 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3055 bool negative = tree_int_cst_compare
3056 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3057 tree offset = negative
3058 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3060 /* create: addr_tmp = (int)(address_of_first_vector) */
3061 addr_base =
3062 vect_create_addr_base_for_vector_ref (loop_vinfo,
3063 stmt_info, &new_stmt_list,
3064 offset);
3065 if (new_stmt_list != NULL)
3066 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3068 sprintf (tmp_name, "addr2int%d", i);
3069 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3070 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3071 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3073 /* The addresses are ORed together. */
3075 if (or_tmp_name != NULL_TREE)
3077 /* create: or_tmp = or_tmp | addr_tmp */
3078 sprintf (tmp_name, "orptrs%d", i);
3079 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3080 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3081 or_tmp_name, addr_tmp_name);
3082 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3083 or_tmp_name = new_or_tmp_name;
3085 else
3086 or_tmp_name = addr_tmp_name;
3088 } /* end for i */
3090 mask_cst = build_int_cst (int_ptrsize_type, mask);
3092 /* create: and_tmp = or_tmp & mask */
3093 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3095 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3096 or_tmp_name, mask_cst);
3097 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3099 /* Make and_tmp the left operand of the conditional test against zero.
3100 If and_tmp has a nonzero bit then some address is unaligned. */
3101 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3102 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3103 and_tmp_name, ptrsize_zero);
3104 chain_cond_expr (cond_expr, part_cond_expr);
3107 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3108 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3109 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3110 and this new condition are true. Treat a null *COND_EXPR as "true". */
3112 static void
3113 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3115 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3116 unsigned int i;
3117 vec_object_pair *pair;
3118 FOR_EACH_VEC_ELT (pairs, i, pair)
3120 tree addr1 = build_fold_addr_expr (pair->first);
3121 tree addr2 = build_fold_addr_expr (pair->second);
3122 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3123 addr1, addr2);
3124 chain_cond_expr (cond_expr, part_cond_expr);
3128 /* Create an expression that is true when all lower-bound conditions for
3129 the vectorized loop are met. Chain this condition with *COND_EXPR. */
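/* For example (illustrative only): if a signed expression EXPR must be at
   least 8, the check emitted below is (unsigned) EXPR + 7 >= 15; the added
   7 makes the unsigned comparison also reject negative values of EXPR.  */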
3131 static void
3132 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3134 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3135 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3137 tree expr = lower_bounds[i].expr;
3138 tree type = unsigned_type_for (TREE_TYPE (expr));
3139 expr = fold_convert (type, expr);
3140 poly_uint64 bound = lower_bounds[i].min_value;
3141 if (!lower_bounds[i].unsigned_p)
3143 expr = fold_build2 (PLUS_EXPR, type, expr,
3144 build_int_cstu (type, bound - 1));
3145 bound += bound - 1;
3147 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3148 build_int_cstu (type, bound));
3149 chain_cond_expr (cond_expr, part_cond_expr);
3153 /* Function vect_create_cond_for_alias_checks.
3155 Create a conditional expression that represents the run-time checks for
3156 overlapping of address ranges represented by a list of data references
3157 relations passed as input.
3159 Input:
3160 COND_EXPR - input conditional expression. New conditions will be chained
3161 with logical AND operation. If it is NULL, then the function
3162 is used to return the number of alias checks.
3163 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3164 to be checked.
3166 Output:
3167 COND_EXPR - conditional expression.
3169 The returned COND_EXPR is the conditional expression to be used in the if
3170 statement that controls which version of the loop gets executed at runtime.
3173 void
3174 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3176 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3177 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3179 if (comp_alias_ddrs.is_empty ())
3180 return;
3182 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3183 &comp_alias_ddrs, cond_expr);
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "created %u versioning for alias checks.\n",
3187 comp_alias_ddrs.length ());
3191 /* Function vect_loop_versioning.
3193 If the loop has data references that may or may not be aligned and/or
3194 has data reference relations whose independence was not proven then
3195 two versions of the loop need to be generated, one which is vectorized
3196 and one which isn't. A test is then generated to control which of the
3197 loops is executed. The test checks for the alignment of all of the
3198 data references that may or may not be aligned. An additional
3199 sequence of runtime tests is generated for each pair of DDRs whose
3200 independence was not proven. The vectorized version of loop is
3201 executed only if both alias and alignment tests are passed.
3203 The test generated to check which version of loop is executed
3204 is modified to also check for profitability as indicated by the
3205 cost model threshold TH.
3207 The versioning precondition(s) are placed in *COND_EXPR and
3208 *COND_EXPR_STMT_LIST. */
3210 class loop *
3211 vect_loop_versioning (loop_vec_info loop_vinfo,
3212 gimple *loop_vectorized_call)
3214 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3215 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3216 basic_block condition_bb;
3217 gphi_iterator gsi;
3218 gimple_stmt_iterator cond_exp_gsi;
3219 basic_block merge_bb;
3220 basic_block new_exit_bb;
3221 edge new_exit_e, e;
3222 gphi *orig_phi, *new_phi;
3223 tree cond_expr = NULL_TREE;
3224 gimple_seq cond_expr_stmt_list = NULL;
3225 tree arg;
3226 profile_probability prob = profile_probability::likely ();
3227 gimple_seq gimplify_stmt_list = NULL;
3228 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3229 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3230 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3231 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3232 poly_uint64 versioning_threshold
3233 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3234 tree version_simd_if_cond
3235 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3236 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3238 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3239 && !ordered_p (th, versioning_threshold))
3240 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3241 build_int_cst (TREE_TYPE (scalar_loop_iters),
3242 th - 1));
3243 if (maybe_ne (versioning_threshold, 0U))
3245 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3246 build_int_cst (TREE_TYPE (scalar_loop_iters),
3247 versioning_threshold - 1));
3248 if (cond_expr)
3249 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3250 expr, cond_expr);
3251 else
3252 cond_expr = expr;
3255 if (version_niter)
3256 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3258 if (cond_expr)
3259 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3260 &cond_expr_stmt_list,
3261 is_gimple_condexpr, NULL_TREE);
3263 if (version_align)
3264 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3265 &cond_expr_stmt_list);
3267 if (version_alias)
3269 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3270 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3271 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3274 if (version_simd_if_cond)
3276 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3277 if (flag_checking)
3278 if (basic_block bb
3279 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3280 gcc_assert (bb != loop->header
3281 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3282 && (scalar_loop == NULL
3283 || (bb != scalar_loop->header
3284 && dominated_by_p (CDI_DOMINATORS,
3285 scalar_loop->header, bb))));
3286 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3287 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3288 version_simd_if_cond, zero);
3289 if (cond_expr)
3290 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3291 c, cond_expr);
3292 else
3293 cond_expr = c;
3294 if (dump_enabled_p ())
3295 dump_printf_loc (MSG_NOTE, vect_location,
3296 "created versioning for simd if condition check.\n");
3299 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3300 &gimplify_stmt_list,
3301 is_gimple_condexpr, NULL_TREE);
3302 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3304 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3305 invariant in. */
3306 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3307 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3308 !gsi_end_p (gsi); gsi_next (&gsi))
3310 gimple *stmt = gsi_stmt (gsi);
3311 update_stmt (stmt);
3312 ssa_op_iter iter;
3313 use_operand_p use_p;
3314 basic_block def_bb;
3315 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3316 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3317 && flow_bb_inside_loop_p (outermost, def_bb))
3318 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
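/* Editorial sketch (exposition only): if the versioning condition only uses
   values defined outside some enclosing loop, the runtime check can be
   emitted once for that enclosing loop instead of on every outer iteration.
   With hypothetical arrays a, b, c and bounds m, n:

     for (int j = 0; j < m; j++)        // outer loop: candidate for versioning
       for (int i = 0; i < n; i++)      // inner loop being vectorized
         a[i] = a[i] + b[i] * c[j];

   Here the alias test between a and b is invariant in the j-loop, so
   LOOP_TO_VERSION below may end up being the outer loop.  */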
3321 /* Search for the outermost loop we can version. Avoid versioning of
3322 non-perfect nests but allow if-conversion versioned loops inside. */
3323 class loop *loop_to_version = loop;
3324 if (flow_loop_nested_p (outermost, loop))
3326 if (dump_enabled_p ())
3327 dump_printf_loc (MSG_NOTE, vect_location,
3328 "trying to apply versioning to outer loop %d\n",
3329 outermost->num);
3330 if (outermost->num == 0)
3331 outermost = superloop_at_depth (loop, 1);
3332 /* And avoid applying versioning on non-perfect nests. */
3333 while (loop_to_version != outermost
3334 && single_exit (loop_outer (loop_to_version))
3335 && (!loop_outer (loop_to_version)->inner->next
3336 || vect_loop_vectorized_call (loop_to_version))
3337 && (!loop_outer (loop_to_version)->inner->next
3338 || !loop_outer (loop_to_version)->inner->next->next))
3339 loop_to_version = loop_outer (loop_to_version);
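/* Editorial note (exposition only): the walk above only ascends into an
   enclosing loop that has a single exit and that contains, apart from a
   possible if-conversion guard, just this one loop nest.  A "perfect nest"
   in that sense looks like:

     for (int j = 0; j < m; j++)        // outer body is just the inner loop
       for (int i = 0; i < n; i++)
         a[j * n + i] = 0.0f;

   (a, m and n are hypothetical names used only for illustration).  */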
3342 /* Apply versioning. If there is already a scalar version created by
3343 if-conversion, re-use that. Note that we cannot re-use the copy of
3344 an if-converted outer-loop when vectorizing the inner loop only. */
3345 gcond *cond;
3346 if ((!loop_to_version->inner || loop == loop_to_version)
3347 && loop_vectorized_call)
3349 gcc_assert (scalar_loop);
3350 condition_bb = gimple_bb (loop_vectorized_call);
3351 cond = as_a <gcond *> (last_stmt (condition_bb));
3352 gimple_cond_set_condition_from_tree (cond, cond_expr);
3353 update_stmt (cond);
3355 if (cond_expr_stmt_list)
3357 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3358 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3359 GSI_SAME_STMT);
3362 /* if-conversion uses profile_probability::always () for both paths,
3363 reset both paths' probabilities appropriately. */
3364 edge te, fe;
3365 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3366 te->probability = prob;
3367 fe->probability = prob.invert ();
3368 /* We can scale the loop counts immediately but have to postpone
3369 scaling the scalar loop because we re-use it during peeling. */
3370 scale_loop_frequencies (loop_to_version, te->probability);
3371 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3373 nloop = scalar_loop;
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_NOTE, vect_location,
3376 "reusing %sloop version created by if conversion\n",
3377 loop_to_version != loop ? "outer " : "");
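/* Editorial note (exposition only): when if-conversion already versioned
   the loop it left behind a guard of the form

     if (.LOOP_VECTORIZED (1, 2))       // loop numbers are illustrative
       ... if-converted loop (the copy being vectorized) ...
     else
       ... untouched scalar loop ...

   The code above merely rewrites that guard's condition to the real
   versioning condition instead of creating yet another copy of the loop. */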
3379 else
3381 if (loop_to_version != loop
3382 && dump_enabled_p ())
3383 dump_printf_loc (MSG_NOTE, vect_location,
3384 "applying loop versioning to outer loop %d\n",
3385 loop_to_version->num);
3387 initialize_original_copy_tables ();
3388 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3389 prob, prob.invert (), prob, prob.invert (), true);
3390 gcc_assert (nloop);
3391 nloop = get_loop_copy (loop);
3393 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3394 reap those otherwise, and they also refer to the original
3395 loops. */
3396 class loop *l = loop;
3397 while (gimple *call = vect_loop_vectorized_call (l))
3399 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3400 fold_loop_internal_call (call, boolean_false_node);
3401 l = loop_outer (l);
3403 free_original_copy_tables ();
3405 if (cond_expr_stmt_list)
3407 cond_exp_gsi = gsi_last_bb (condition_bb);
3408 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3409 GSI_SAME_STMT);
3412 /* Loop versioning violates an assumption we try to maintain during
3413 vectorization - that the loop exit block has a single predecessor.
3414 After versioning, the exit block of both loop versions is the same
3415 basic block (i.e. it has two predecessors). To simplify the
3416 following transformations in the vectorizer, we fix this situation
3417 here by adding a new (empty) block on the exit-edge of the loop,
3418 with the proper loop-exit phis to maintain loop-closed-form.
3419 If loop versioning wasn't done from loop but from scalar_loop instead,
3420 merge_bb will already have just a single predecessor. */
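/* Editorial sketch (exposition only): before the fix-up both versions of
   the loop exit into the same block:

     vector-loop exit ---\
                          +--> merge_bb   (two predecessors)
     scalar-loop exit ---/

   split_edge below interposes an empty NEW_EXIT_BB on the vector loop's
   exit edge and gives it copies of merge_bb's PHIs, restoring the
   single-predecessor property on that exit path:

     vector-loop exit --> new_exit_bb ---\
                                          +--> merge_bb
     scalar-loop exit -------------------/
*/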
3422 merge_bb = single_exit (loop_to_version)->dest;
3423 if (EDGE_COUNT (merge_bb->preds) >= 2)
3425 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3426 new_exit_bb = split_edge (single_exit (loop_to_version));
3427 new_exit_e = single_exit (loop_to_version);
3428 e = EDGE_SUCC (new_exit_bb, 0);
3430 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3431 gsi_next (&gsi))
3433 tree new_res;
3434 orig_phi = gsi.phi ();
3435 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3436 new_phi = create_phi_node (new_res, new_exit_bb);
3437 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3438 add_phi_arg (new_phi, arg, new_exit_e,
3439 gimple_phi_arg_location_from_edge (orig_phi, e));
3440 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3444 update_ssa (TODO_update_ssa);
3447 if (version_niter)
3449 /* The versioned loop could be infinite, so we need to clear the existing
3450 niter information, which was copied from the original loop. */
3451 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3452 vect_free_loop_info_assumptions (nloop);
3453 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3454 loop_constraint_set (loop, LOOP_C_INFINITE);
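/* Editorial note (exposition only): the niter information may rely on
   assumptions that were only guaranteed by the versioning condition.  For
   instance, a loop such as

     for (unsigned i = 0; i != n; i += 4)
       ...

   is only known to terminate under the assumption that n is a multiple of
   4; such assumptions must not be carried over unconditionally to the
   versioned copy, hence the reset above.  */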
3457 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3458 && dump_enabled_p ())
3460 if (version_alias)
3461 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3462 vect_location,
3463 "loop versioned for vectorization because of "
3464 "possible aliasing\n");
3465 if (version_align)
3466 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3467 vect_location,
3468 "loop versioned for vectorization to enhance "
3469 "alignment\n");
3473 return nloop;