gcc/tree-vect-loop-manip.cc

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53
  54 /*************************************************************************
  55   Simple Loop Peeling Utilities
  56
  57   Utilities to support loop peeling for vectorization purposes.
  58  *************************************************************************/
  59
  60
  61 /* Renames the use *OP_P.  */
  62
  63 static void
  64 rename_use_op (use_operand_p op_p)
  65 {
  66   tree new_name;
  67
  68   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  69     return;
  70
  71   new_name = get_current_def (USE_FROM_PTR (op_p));
  72
  73   /* Something defined outside of the loop.  */
  74   if (!new_name)
  75     return;
  76
  77   /* An ordinary ssa name defined in the loop.  */
  78
  79   SET_USE (op_p, new_name);
  80 }
  81
  82
  83 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  84    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  85    true.  */
  86
  87 static void
  88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  89 {
  90   gimple *stmt;
  91   use_operand_p use_p;
  92   ssa_op_iter iter;
  93   edge e;
  94   edge_iterator ei;
  95   class loop *loop = bb->loop_father;
  96   class loop *outer_loop = NULL;
  97
  98   if (rename_from_outer_loop)
  99     {
 100       gcc_assert (loop);
 101       outer_loop = loop_outer (loop);
 102     }
 103
 104   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 105        gsi_next (&gsi))
 106     {
 107       stmt = gsi_stmt (gsi);
 108       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 109         rename_use_op (use_p);
 110     }
 111
 112   FOR_EACH_EDGE (e, ei, bb->preds)
 113     {
 114       if (!flow_bb_inside_loop_p (loop, e->src))
 115         {
 116           if (!rename_from_outer_loop)
 117             continue;
 118           if (e->src != outer_loop->header)
 119             {
 120               if (outer_loop->inner->next)
 121                 {
 122                   /* If outer_loop has 2 inner loops, allow there to
 123                      be an extra basic block which decides which of the
 124                      two loops to use using LOOP_VECTORIZED.  */
 125                   if (!single_pred_p (e->src)
 126                       || single_pred (e->src) != outer_loop->header)
 127                     continue;
 128                 }
 129             }
 130         }
 131       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 132            gsi_next (&gsi))
 133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 134     }
 135 }
 136
 137
 138 struct adjust_info
 139 {
 140   tree from, to;
 141   basic_block bb;
 142 };
 143
 144 /* A stack of values to be adjusted in debug stmts.  We have to
 145    process them LIFO, so that the closest substitution applies.  If we
 146    processed them FIFO, without the stack, we might substitute uses
 147    with a PHI DEF that would soon become non-dominant, and when we got
 148    to the suitable one, it wouldn't have anything to substitute any
 149    more.  */
 150 static vec<adjust_info, va_heap> adjust_vec;
 151
 152 /* Adjust any debug stmts that referenced AI->from values to use the
 153    loop-closed AI->to, if the references are dominated by AI->bb and
 154    not by the definition of AI->from.  */
 155
 156 static void
 157 adjust_debug_stmts_now (adjust_info *ai)
 158 {
 159   basic_block bbphi = ai->bb;
 160   tree orig_def = ai->from;
 161   tree new_def = ai->to;
 162   imm_use_iterator imm_iter;
 163   gimple *stmt;
 164   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 165
 166   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 167
 168   /* Adjust any debug stmts that held onto non-loop-closed
 169      references.  */
 170   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 171     {
 172       use_operand_p use_p;
 173       basic_block bbuse;
 174
 175       if (!is_gimple_debug (stmt))
 176         continue;
 177
 178       gcc_assert (gimple_debug_bind_p (stmt));
 179
 180       bbuse = gimple_bb (stmt);
 181
 182       if ((bbuse == bbphi
 183            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 184           && !(bbuse == bbdef
 185                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 186         {
 187           if (new_def)
 188             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 189               SET_USE (use_p, new_def);
 190           else
 191             {
 192               gimple_debug_bind_reset_value (stmt);
 193               update_stmt (stmt);
 194             }
 195         }
 196     }
 197 }
 198
 199 /* Adjust debug stmts as scheduled before.  */
 200
 201 static void
 202 adjust_vec_debug_stmts (void)
 203 {
 204   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 205     return;
 206
 207   gcc_assert (adjust_vec.exists ());
 208
 209   while (!adjust_vec.is_empty ())
 210     {
 211       adjust_debug_stmts_now (&adjust_vec.last ());
 212       adjust_vec.pop ();
 213     }
 214 }
 215
 216 /* Adjust any debug stmts that referenced FROM values to use the
 217    loop-closed TO, if the references are dominated by BB and not by
 218    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 219    will be postponed until adjust_vec_debug_stmts is called.  */
 220
 221 static void
 222 adjust_debug_stmts (tree from, tree to, basic_block bb)
 223 {
 224   adjust_info ai;
 225
 226   if (MAY_HAVE_DEBUG_BIND_STMTS
 227       && TREE_CODE (from) == SSA_NAME
 228       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 229       && ! virtual_operand_p (from))
 230     {
 231       ai.from = from;
 232       ai.to = to;
 233       ai.bb = bb;
 234
 235       if (adjust_vec.exists ())
 236         adjust_vec.safe_push (ai);
 237       else
 238         adjust_debug_stmts_now (&ai);
 239     }
 240 }
 241
 242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 243    to adjust any debug stmts that referenced the old phi arg,
 244    presumably non-loop-closed references left over from other
 245    transformations.  */
 246
 247 static void
 248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 249 {
 250   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 251
 252   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 253
 254   if (MAY_HAVE_DEBUG_BIND_STMTS)
 255     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 256                         gimple_bb (update_phi));
 257 }
 258
 259 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 260    that the control should have during the first iteration and NEXT_CTRL is the
 261    value that it should have on subsequent iterations.  */
 262
 263 static void
 264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 265                        tree next_ctrl)
 266 {
 267   gphi *phi = create_phi_node (ctrl, loop->header);
 268   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 269   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 270 }
 271
 272 /* Add SEQ to the end of LOOP's preheader block.  */
 273
 274 static void
 275 add_preheader_seq (class loop *loop, gimple_seq seq)
 276 {
 277   if (seq)
 278     {
 279       edge pe = loop_preheader_edge (loop);
 280       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 281       gcc_assert (!new_bb);
 282     }
 283 }
 284
 285 /* Add SEQ to the beginning of LOOP's header block.  */
 286
 287 static void
 288 add_header_seq (class loop *loop, gimple_seq seq)
 289 {
 290   if (seq)
 291     {
 292       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 293       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 294     }
 295 }
 296
 297 /* Return true if the target can interleave elements of two vectors.
 298    OFFSET is 0 if the first half of the vectors should be interleaved
 299    or 1 if the second half should.  When returning true, store the
 300    associated permutation in INDICES.  */
 301
 302 static bool
 303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 304                         unsigned int offset)
 305 {
 306   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 307   poly_uint64 base = exact_div (nelts, 2) * offset;
 308   vec_perm_builder sel (nelts, 2, 3);
 309   for (unsigned int i = 0; i < 3; ++i)
 310     {
 311       sel.quick_push (base + i);
 312       sel.quick_push (base + i + nelts);
 313     }
 314   indices->new_vector (sel, 2, nelts);
 315   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 316 }
 317
 318 /* Try to use permutes to define the masks in DEST_RGM using the masks
 319    in SRC_RGM, given that the former has twice as many masks as the
 320    latter.  Return true on success, adding any new statements to SEQ.  */
 321
 322 static bool
 323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 324                                rgroup_controls *src_rgm)
 325 {
 326   tree src_masktype = src_rgm->type;
 327   tree dest_masktype = dest_rgm->type;
 328   machine_mode src_mode = TYPE_MODE (src_masktype);
 329   insn_code icode1, icode2;
 330   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 331       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 332                                   src_mode)) != CODE_FOR_nothing
 333       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 334                                   src_mode)) != CODE_FOR_nothing)
 335     {
 336       /* Unpacking the source masks gives at least as many mask bits as
 337          we need.  We can then VIEW_CONVERT any excess bits away.  */
 338       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 339       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 340       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 341       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 342         {
 343           tree src = src_rgm->controls[i / 2];
 344           tree dest = dest_rgm->controls[i];
 345           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 346                             ? VEC_UNPACK_HI_EXPR
 347                             : VEC_UNPACK_LO_EXPR);
 348           gassign *stmt;
 349           if (dest_masktype == unpack_masktype)
 350             stmt = gimple_build_assign (dest, code, src);
 351           else
 352             {
 353               tree temp = make_ssa_name (unpack_masktype);
 354               stmt = gimple_build_assign (temp, code, src);
 355               gimple_seq_add_stmt (seq, stmt);
 356               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 357                                           build1 (VIEW_CONVERT_EXPR,
 358                                                   dest_masktype, temp));
 359             }
 360           gimple_seq_add_stmt (seq, stmt);
 361         }
 362       return true;
 363     }
 364   vec_perm_indices indices[2];
 365   if (dest_masktype == src_masktype
 366       && interleave_supported_p (&indices[0], src_masktype, 0)
 367       && interleave_supported_p (&indices[1], src_masktype, 1))
 368     {
 369       /* The destination requires twice as many mask bits as the source, so
 370          we can use interleaving permutes to double up the number of bits.  */
 371       tree masks[2];
 372       for (unsigned int i = 0; i < 2; ++i)
 373         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 374       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 375         {
 376           tree src = src_rgm->controls[i / 2];
 377           tree dest = dest_rgm->controls[i];
 378           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 379                                               src, src, masks[i & 1]);
 380           gimple_seq_add_stmt (seq, stmt);
 381         }
 382       return true;
 383     }
 384   return false;
 385 }
 386
 387 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 388    for all the rgroup controls in RGC and return a control that is nonzero
 389    when the loop needs to iterate.  Add any new preheader statements to
 390    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
 391
 392    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 393    times and has been vectorized according to LOOP_VINFO.
 394
 395    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 396    starts with NITERS_SKIP dummy iterations of the scalar loop before
 397    the real work starts.  The mask elements for these dummy iterations
 398    must be 0, to ensure that the extra iterations do not have an effect.
 399
 400    It is known that:
 401
 402      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 403
 404    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 405    variable that starts at 0 and has step:
 406
 407      VF * RGC->max_nscalars_per_iter * RGC->factor
 408
 409    might overflow before hitting a value above:
 410
 411      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 412
 413    This means that we cannot guarantee that such an induction variable
 414    would ever hit a value that produces a set of all-false masks or zero
 415    lengths for RGC.
 416
 417    Note: the cost of the code generated by this function is modeled
 418    by vect_estimate_min_profitable_iters, so changes here may need
 419    corresponding changes there.  */
 420
 421 static tree
 422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 423                                  gimple_seq *preheader_seq,
 424                                  gimple_seq *header_seq,
 425                                  gimple_stmt_iterator loop_cond_gsi,
 426                                  rgroup_controls *rgc, tree niters,
 427                                  tree niters_skip, bool might_wrap_p)
 428 {
 429   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 430   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 431   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 432
 433   tree ctrl_type = rgc->type;
 434   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 435   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 436   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 437   tree length_limit = NULL_TREE;
 438   /* For length, we need length_limit to ensure length in range.  */
 439   if (!use_masks_p)
 440     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 441
 442   /* Calculate the maximum number of item values that the rgroup
 443      handles in total, the number that it handles for each iteration
 444      of the vector loop, and the number that it should skip during the
 445      first iteration of the vector loop.  */
 446   tree nitems_total = niters;
 447   tree nitems_step = build_int_cst (iv_type, vf);
 448   tree nitems_skip = niters_skip;
 449   if (nitems_per_iter != 1)
 450     {
 451       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 452          these multiplications don't overflow.  */
 453       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 454       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 455       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 456                                    nitems_total, compare_factor);
 457       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 458                                   nitems_step, iv_factor);
 459       if (nitems_skip)
 460         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 461                                     nitems_skip, compare_factor);
 462     }
 463
 464   /* Create an induction variable that counts the number of items
 465      processed.  */
 466   tree index_before_incr, index_after_incr;
 467   gimple_stmt_iterator incr_gsi;
 468   bool insert_after;
 469   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 470   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
 471              &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
 472
 473   tree zero_index = build_int_cst (compare_type, 0);
 474   tree test_index, test_limit, first_limit;
 475   gimple_stmt_iterator *test_gsi;
 476   if (might_wrap_p)
 477     {
 478       /* In principle the loop should stop iterating once the incremented
 479          IV reaches a value greater than or equal to:
 480
 481            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 482
 483          However, there's no guarantee that this addition doesn't overflow
 484          the comparison type, or that the IV hits a value above it before
 485          wrapping around.  We therefore adjust the limit down by one
 486          IV step:
 487
 488            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 489            -[infinite-prec] NITEMS_STEP
 490
 491          and compare the IV against this limit _before_ incrementing it.
 492          Since the comparison type is unsigned, we actually want the
 493          subtraction to saturate at zero:
 494
 495            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 496            -[sat] NITEMS_STEP
 497
 498          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 499
 500            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 501
 502          where the rightmost subtraction can be done directly in
 503          COMPARE_TYPE.  */
 504       test_index = index_before_incr;
 505       tree adjust = gimple_convert (preheader_seq, compare_type,
 506                                     nitems_step);
 507       if (nitems_skip)
 508         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 509                                adjust, nitems_skip);
 510       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 511                                  nitems_total, adjust);
 512       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 513                                  test_limit, adjust);
 514       test_gsi = &incr_gsi;
 515
 516       /* Get a safe limit for the first iteration.  */
 517       if (nitems_skip)
 518         {
 519           /* The first vector iteration can handle at most NITEMS_STEP
 520              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 521              NITEMS_SKIP to that cannot overflow.  */
 522           tree const_limit = build_int_cst (compare_type,
 523                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 524                                             * nitems_per_iter);
 525           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 526                                       nitems_total, const_limit);
 527           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 528                                       first_limit, nitems_skip);
 529         }
 530       else
 531         /* For the first iteration it doesn't matter whether the IV hits
 532            a value above NITEMS_TOTAL.  That only matters for the latch
 533            condition.  */
 534         first_limit = nitems_total;
 535     }
 536   else
 537     {
 538       /* Test the incremented IV, which will always hit a value above
 539          the bound before wrapping.  */
 540       test_index = index_after_incr;
 541       test_limit = nitems_total;
 542       if (nitems_skip)
 543         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 544                                    test_limit, nitems_skip);
 545       test_gsi = &loop_cond_gsi;
 546
 547       first_limit = test_limit;
 548     }
 549
 550   /* Convert the IV value to the comparison type (either a no-op or
 551      a demotion).  */
 552   gimple_seq test_seq = NULL;
 553   test_index = gimple_convert (&test_seq, compare_type, test_index);
 554   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 555
 556   /* Provide a definition of each control in the group.  */
 557   tree next_ctrl = NULL_TREE;
 558   tree ctrl;
 559   unsigned int i;
 560   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 561     {
 562       /* Previous controls will cover BIAS items.  This control covers the
 563          next batch.  */
 564       poly_uint64 bias = nitems_per_ctrl * i;
 565       tree bias_tree = build_int_cst (compare_type, bias);
 566
 567       /* See whether the first iteration of the vector loop is known
 568          to have a full control.  */
 569       poly_uint64 const_limit;
 570       bool first_iteration_full
 571         = (poly_int_tree_p (first_limit, &const_limit)
 572            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 573
 574       /* Rather than have a new IV that starts at BIAS and goes up to
 575          TEST_LIMIT, prefer to use the same 0-based IV for each control
 576          and adjust the bound down by BIAS.  */
 577       tree this_test_limit = test_limit;
 578       if (i != 0)
 579         {
 580           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 581                                           compare_type, this_test_limit,
 582                                           bias_tree);
 583           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 584                                           compare_type, this_test_limit,
 585                                           bias_tree);
 586         }
 587
 588       /* Create the initial control.  First include all items that
 589          are within the loop limit.  */
 590       tree init_ctrl = NULL_TREE;
 591       if (!first_iteration_full)
 592         {
 593           tree start, end;
 594           if (first_limit == test_limit)
 595             {
 596               /* Use a natural test between zero (the initial IV value)
 597                  and the loop limit.  The "else" block would be valid too,
 598                  but this choice can avoid the need to load BIAS_TREE into
 599                  a register.  */
 600               start = zero_index;
 601               end = this_test_limit;
 602             }
 603           else
 604             {
 605               /* FIRST_LIMIT is the maximum number of items handled by the
 606                  first iteration of the vector loop.  Test the portion
 607                  associated with this control.  */
 608               start = bias_tree;
 609               end = first_limit;
 610             }
 611
 612           if (use_masks_p)
 613             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 614                                         start, end, "max_mask");
 615           else
 616             {
 617               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 618               gimple_seq seq = vect_gen_len (init_ctrl, start,
 619                                              end, length_limit);
 620               gimple_seq_add_seq (preheader_seq, seq);
 621             }
 622         }
 623
 624       /* Now AND out the bits that are within the number of skipped
 625          items.  */
 626       poly_uint64 const_skip;
 627       if (nitems_skip
 628           && !(poly_int_tree_p (nitems_skip, &const_skip)
 629                && known_le (const_skip, bias)))
 630         {
 631           gcc_assert (use_masks_p);
 632           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 633                                                     bias_tree, nitems_skip);
 634           if (init_ctrl)
 635             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 636                                       init_ctrl, unskipped_mask);
 637           else
 638             init_ctrl = unskipped_mask;
 639         }
 640
 641       if (!init_ctrl)
 642         {
 643           /* First iteration is full.  */
 644           if (use_masks_p)
 645             init_ctrl = build_minus_one_cst (ctrl_type);
 646           else
 647             init_ctrl = length_limit;
 648         }
 649
 650       /* Get the control value for the next iteration of the loop.  */
 651       if (use_masks_p)
 652         {
 653           gimple_seq stmts = NULL;
 654           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 655                                       this_test_limit, "next_mask");
 656           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 657         }
 658       else
 659         {
 660           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 661           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 662                                          length_limit);
 663           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 664         }
 665
 666       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 667     }
 668
 669   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 670   if (partial_load_bias != 0)
 671     {
 672       tree adjusted_len = rgc->bias_adjusted_ctrl;
 673       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
 674                                             rgc->controls[0],
 675                                             build_int_cst
 676                                             (TREE_TYPE (rgc->controls[0]),
 677                                              partial_load_bias));
 678       gimple_seq_add_stmt (header_seq, minus);
 679     }
 680
 681   return next_ctrl;
 682 }
 683
 684 /* Set up the iteration condition and rgroup controls for LOOP, given
 685    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 686    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 687    the number of iterations of the original scalar loop that should be
 688    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 689    for vect_set_loop_condition.
 690
 691    Insert the branch-back condition before LOOP_COND_GSI and return the
 692    final gcond.  */
 693
 694 static gcond *
 695 vect_set_loop_condition_partial_vectors (class loop *loop,
 696                                          loop_vec_info loop_vinfo, tree niters,
 697                                          tree final_iv, bool niters_maybe_zero,
 698                                          gimple_stmt_iterator loop_cond_gsi)
 699 {
 700   gimple_seq preheader_seq = NULL;
 701   gimple_seq header_seq = NULL;
 702
 703   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 704   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 705   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 706   tree orig_niters = niters;
 707
 708   /* Type of the initial value of NITERS.  */
 709   tree ni_actual_type = TREE_TYPE (niters);
 710   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 711   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 712
 713   /* Convert NITERS to the same size as the compare.  */
 714   if (compare_precision > ni_actual_precision
 715       && niters_maybe_zero)
 716     {
 717       /* We know that there is always at least one iteration, so if the
 718          count is zero then it must have wrapped.  Cope with this by
 719          subtracting 1 before the conversion and adding 1 to the result.  */
 720       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 721       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 722                              niters, build_minus_one_cst (ni_actual_type));
 723       niters = gimple_convert (&preheader_seq, compare_type, niters);
 724       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 725                              niters, build_one_cst (compare_type));
 726     }
 727   else
 728     niters = gimple_convert (&preheader_seq, compare_type, niters);
 729
 730   /* Iterate over all the rgroups and fill in their controls.  We could use
 731      the first control from any rgroup for the loop condition; here we
 732      arbitrarily pick the last.  */
 733   tree test_ctrl = NULL_TREE;
 734   rgroup_controls *rgc;
 735   unsigned int i;
 736   auto_vec<rgroup_controls> *controls = use_masks_p
 737                                           ? &LOOP_VINFO_MASKS (loop_vinfo)
 738                                           : &LOOP_VINFO_LENS (loop_vinfo);
 739   FOR_EACH_VEC_ELT (*controls, i, rgc)
 740     if (!rgc->controls.is_empty ())
 741       {
 742         /* First try using permutes.  This adds a single vector
 743            instruction to the loop for each mask, but needs no extra
 744            loop invariants or IVs.  */
 745         unsigned int nmasks = i + 1;
 746         if (use_masks_p && (nmasks & 1) == 0)
 747           {
 748             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 749             if (!half_rgc->controls.is_empty ()
 750                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 751               continue;
 752           }
 753
 754         /* See whether zero-based IV would ever generate all-false masks
 755            or zero length before wrapping around.  */
 756         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 757
 758         /* Set up all controls for this group.  */
 759         test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 760                                                      &preheader_seq,
 761                                                      &header_seq,
 762                                                      loop_cond_gsi, rgc,
 763                                                      niters, niters_skip,
 764                                                      might_wrap_p);
 765       }
 766
 767   /* Emit all accumulated statements.  */
 768   add_preheader_seq (loop, preheader_seq);
 769   add_header_seq (loop, header_seq);
 770
 771   /* Get a boolean result that tells us whether to iterate.  */
 772   edge exit_edge = single_exit (loop);
 773   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 774   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 775   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
 776                                         NULL_TREE, NULL_TREE);
 777   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 778
 779   /* The loop iterates (NITERS - 1) / VF + 1 times.
 780      Subtract one from this to get the latch count.  */
 781   tree step = build_int_cst (compare_type,
 782                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 783   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 784                                        build_minus_one_cst (compare_type));
 785   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 786                                      niters_minus_one, step);
 787
 788   if (final_iv)
 789     {
 790       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 791       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 792     }
 793
 794   return cond_stmt;
 795 }
 796
 797 /* Like vect_set_loop_condition, but handle the case in which the vector
 798    loop handles exactly VF scalars per iteration.  */
 799
 800 static gcond *
 801 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
 802                                 tree final_iv, bool niters_maybe_zero,
 803                                 gimple_stmt_iterator loop_cond_gsi)
 804 {
 805   tree indx_before_incr, indx_after_incr;
 806   gcond *cond_stmt;
 807   gcond *orig_cond;
 808   edge pe = loop_preheader_edge (loop);
 809   edge exit_edge = single_exit (loop);
 810   gimple_stmt_iterator incr_gsi;
 811   bool insert_after;
 812   enum tree_code code;
 813   tree niters_type = TREE_TYPE (niters);
 814
 815   orig_cond = get_loop_exit_condition (loop);
 816   gcc_assert (orig_cond);
 817   loop_cond_gsi = gsi_for_stmt (orig_cond);
 818
 819   tree init, limit;
 820   if (!niters_maybe_zero && integer_onep (step))
 821     {
 822       /* In this case we can use a simple 0-based IV:
 823
 824          A:
 825            x = 0;
 826            do
 827              {
 828                ...
 829                x += 1;
 830              }
 831            while (x < NITERS);  */
 832       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 833       init = build_zero_cst (niters_type);
 834       limit = niters;
 835     }
 836   else
 837     {
 838       /* The following works for all values of NITERS except 0:
 839
 840          B:
 841            x = 0;
 842            do
 843              {
 844                ...
 845                x += STEP;
 846              }
 847            while (x <= NITERS - STEP);
 848
 849          so that the loop continues to iterate if x + STEP - 1 < NITERS
 850          but stops if x + STEP - 1 >= NITERS.
 851
 852          However, if NITERS is zero, x never hits a value above NITERS - STEP
 853          before wrapping around.  There are two obvious ways of dealing with
 854          this:
 855
 856          - start at STEP - 1 and compare x before incrementing it
 857          - start at -1 and compare x after incrementing it
 858
 859          The latter is simpler and is what we use.  The loop in this case
 860          looks like:
 861
 862          C:
 863            x = -1;
 864            do
 865              {
 866                ...
 867                x += STEP;
 868              }
 869            while (x < NITERS - STEP);
 870
 871          In both cases the loop limit is NITERS - STEP.  */
 872       gimple_seq seq = NULL;
 873       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 874       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 875       if (seq)
 876         {
 877           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 878           gcc_assert (!new_bb);
 879         }
 880       if (niters_maybe_zero)
 881         {
 882           /* Case C.  */
 883           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 884           init = build_all_ones_cst (niters_type);
 885         }
 886       else
 887         {
 888           /* Case B.  */
 889           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 890           init = build_zero_cst (niters_type);
 891         }
 892     }
 893
 894   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 895   create_iv (init, step, NULL_TREE, loop,
 896              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 897   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 898                                               true, NULL_TREE, true,
 899                                               GSI_SAME_STMT);
 900   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 901                                      true, GSI_SAME_STMT);
 902
 903   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 904                                  NULL_TREE);
 905
 906   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 907
 908   /* Record the number of latch iterations.  */
 909   if (limit == niters)
 910     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 911        latch count.  */
 912     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 913                                        build_int_cst (niters_type, 1));
 914   else
 915     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 916        Subtract one from this to get the latch count.  */
 917     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 918                                        limit, step);
 919
 920   if (final_iv)
 921     {
 922       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 923                                              indx_after_incr, init);
 924       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 925     }
 926
 927   return cond_stmt;
 928 }
 929
 930 /* If we're using fully-masked loops, make LOOP iterate:
 931
 932       N == (NITERS - 1) / STEP + 1
 933
 934    times.  When NITERS is zero, this is equivalent to making the loop
 935    execute (1 << M) / STEP times, where M is the precision of NITERS.
 936    NITERS_MAYBE_ZERO is true if this last case might occur.
 937
 938    If we're not using fully-masked loops, make LOOP iterate:
 939
 940       N == (NITERS - STEP) / STEP + 1
 941
 942    times, where NITERS is known to be outside the range [1, STEP - 1].
 943    This is equivalent to making the loop execute NITERS / STEP times
 944    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 945    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 946
 947    If FINAL_IV is nonnull, it is an SSA name that should be set to
 948    N * STEP on exit from the loop.
 949
 950    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 951
 952 void
 953 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
 954                          tree niters, tree step, tree final_iv,
 955                          bool niters_maybe_zero)
 956 {
 957   gcond *cond_stmt;
 958   gcond *orig_cond = get_loop_exit_condition (loop);
 959   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 960
 961   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
 962     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
 963                                                          niters, final_iv,
 964                                                          niters_maybe_zero,
 965                                                          loop_cond_gsi);
 966   else
 967     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
 968                                                 niters_maybe_zero,
 969                                                 loop_cond_gsi);
 970
 971   /* Remove old loop exit test.  */
 972   stmt_vec_info orig_cond_info;
 973   if (loop_vinfo
 974       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 975     loop_vinfo->remove_stmt (orig_cond_info);
 976   else
 977     gsi_remove (&loop_cond_gsi, true);
 978
 979   if (dump_enabled_p ())
 980     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
 981                      cond_stmt);
 982 }
 983
 984 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 985    For all PHI arguments in FROM->dest and TO->dest from those
 986    edges ensure that TO->dest PHI arguments have current_def
 987    to that in from.  */
 988
 989 static void
 990 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 991 {
 992   gimple_stmt_iterator gsi_from, gsi_to;
 993
 994   for (gsi_from = gsi_start_phis (from->dest),
 995        gsi_to = gsi_start_phis (to->dest);
 996        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 997     {
 998       gimple *from_phi = gsi_stmt (gsi_from);
 999       gimple *to_phi = gsi_stmt (gsi_to);
1000       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1001       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1002       if (virtual_operand_p (from_arg))
1003         {
1004           gsi_next (&gsi_from);
1005           continue;
1006         }
1007       if (virtual_operand_p (to_arg))
1008         {
1009           gsi_next (&gsi_to);
1010           continue;
1011         }
1012       if (TREE_CODE (from_arg) != SSA_NAME)
1013         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1014       else if (TREE_CODE (to_arg) == SSA_NAME
1015                && from_arg != to_arg)
1016         {
1017           if (get_current_def (to_arg) == NULL_TREE)
1018             {
1019               gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1020                                               TREE_TYPE (get_current_def
1021                                                            (from_arg))));
1022               set_current_def (to_arg, get_current_def (from_arg));
1023             }
1024         }
1025       gsi_next (&gsi_from);
1026       gsi_next (&gsi_to);
1027     }
1028
1029   gphi *from_phi = get_virtual_phi (from->dest);
1030   gphi *to_phi = get_virtual_phi (to->dest);
1031   if (from_phi)
1032     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1033                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1034 }
1035
1036
1037 /* Given LOOP this function generates a new copy of it and puts it
1038    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1039    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1040    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1041    entry or exit of LOOP.  */
1042
1043 class loop *
1044 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1045                                         class loop *scalar_loop, edge e)
1046 {
1047   class loop *new_loop;
1048   basic_block *new_bbs, *bbs, *pbbs;
1049   bool at_exit;
1050   bool was_imm_dom;
1051   basic_block exit_dest;
1052   edge exit, new_exit;
1053   bool duplicate_outer_loop = false;
1054
1055   exit = single_exit (loop);
1056   at_exit = (e == exit);
1057   if (!at_exit && e != loop_preheader_edge (loop))
1058     return NULL;
1059
1060   if (scalar_loop == NULL)
1061     scalar_loop = loop;
1062
1063   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1064   pbbs = bbs + 1;
1065   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1066   /* Allow duplication of outer loops.  */
1067   if (scalar_loop->inner)
1068     duplicate_outer_loop = true;
1069   /* Check whether duplication is possible.  */
1070   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1071     {
1072       free (bbs);
1073       return NULL;
1074     }
1075
1076   /* Generate new loop structure.  */
1077   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1078   duplicate_subloops (scalar_loop, new_loop);
1079
1080   exit_dest = exit->dest;
1081   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1082                                           exit_dest) == loop->header ?
1083                  true : false);
1084
1085   /* Also copy the pre-header, this avoids jumping through hoops to
1086      duplicate the loop entry PHI arguments.  Create an empty
1087      pre-header unconditionally for this.  */
1088   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1089   edge entry_e = single_pred_edge (preheader);
1090   bbs[0] = preheader;
1091   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1092
1093   exit = single_exit (scalar_loop);
1094   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1095             &exit, 1, &new_exit, NULL,
1096             at_exit ? loop->latch : e->src, true);
1097   exit = single_exit (loop);
1098   basic_block new_preheader = new_bbs[0];
1099
1100   /* Before installing PHI arguments make sure that the edges
1101      into them match that of the scalar loop we analyzed.  This
1102      makes sure the SLP tree matches up between the main vectorized
1103      loop and the epilogue vectorized copies.  */
1104   if (single_succ_edge (preheader)->dest_idx
1105       != single_succ_edge (new_bbs[0])->dest_idx)
1106     {
1107       basic_block swap_bb = new_bbs[1];
1108       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1109       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1110       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1111       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1112     }
1113   if (duplicate_outer_loop)
1114     {
1115       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1116       if (loop_preheader_edge (scalar_loop)->dest_idx
1117           != loop_preheader_edge (new_inner_loop)->dest_idx)
1118         {
1119           basic_block swap_bb = new_inner_loop->header;
1120           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1121           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1122           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1123           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1124         }
1125     }
1126
1127   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1128
1129   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1130   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1131     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1132
1133   if (scalar_loop != loop)
1134     {
1135       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1136          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1137          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1138          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1139          header) to have current_def set, so copy them over.  */
1140       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1141                                                 exit);
1142       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1143                                                            0),
1144                                                 EDGE_SUCC (loop->latch, 0));
1145     }
1146
1147   if (at_exit) /* Add the loop copy at exit.  */
1148     {
1149       if (scalar_loop != loop)
1150         {
1151           gphi_iterator gsi;
1152           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1153
1154           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1155                gsi_next (&gsi))
1156             {
1157               gphi *phi = gsi.phi ();
1158               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1159               location_t orig_locus
1160                 = gimple_phi_arg_location_from_edge (phi, e);
1161
1162               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1163             }
1164         }
1165       redirect_edge_and_branch_force (e, new_preheader);
1166       flush_pending_stmts (e);
1167       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1168       if (was_imm_dom || duplicate_outer_loop)
1169         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1170
1171       /* And remove the non-necessary forwarder again.  Keep the other
1172          one so we have a proper pre-header for the loop at the exit edge.  */
1173       redirect_edge_pred (single_succ_edge (preheader),
1174                           single_pred (preheader));
1175       delete_basic_block (preheader);
1176       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1177                                loop_preheader_edge (scalar_loop)->src);
1178     }
1179   else /* Add the copy at entry.  */
1180     {
1181       if (scalar_loop != loop)
1182         {
1183           /* Remove the non-necessary forwarder of scalar_loop again.  */
1184           redirect_edge_pred (single_succ_edge (preheader),
1185                               single_pred (preheader));
1186           delete_basic_block (preheader);
1187           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1188                                    loop_preheader_edge (scalar_loop)->src);
1189           preheader = split_edge (loop_preheader_edge (loop));
1190           entry_e = single_pred_edge (preheader);
1191         }
1192
1193       redirect_edge_and_branch_force (entry_e, new_preheader);
1194       flush_pending_stmts (entry_e);
1195       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1196
1197       redirect_edge_and_branch_force (new_exit, preheader);
1198       flush_pending_stmts (new_exit);
1199       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1200
1201       /* And remove the non-necessary forwarder again.  Keep the other
1202          one so we have a proper pre-header for the loop at the exit edge.  */
1203       redirect_edge_pred (single_succ_edge (new_preheader),
1204                           single_pred (new_preheader));
1205       delete_basic_block (new_preheader);
1206       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1207                                loop_preheader_edge (new_loop)->src);
1208     }
1209
1210   if (scalar_loop != loop)
1211     {
1212       /* Update new_loop->header PHIs, so that on the preheader
1213          edge they are the ones from loop rather than scalar_loop.  */
1214       gphi_iterator gsi_orig, gsi_new;
1215       edge orig_e = loop_preheader_edge (loop);
1216       edge new_e = loop_preheader_edge (new_loop);
1217
1218       for (gsi_orig = gsi_start_phis (loop->header),
1219            gsi_new = gsi_start_phis (new_loop->header);
1220            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1221            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1222         {
1223           gphi *orig_phi = gsi_orig.phi ();
1224           gphi *new_phi = gsi_new.phi ();
1225           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1226           location_t orig_locus
1227             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1228
1229           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1230         }
1231     }
1232
1233   free (new_bbs);
1234   free (bbs);
1235
1236   checking_verify_dominators (CDI_DOMINATORS);
1237
1238   return new_loop;
1239 }
1240
1241
1242 /* Given the condition expression COND, put it as the last statement of
1243    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1244    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1245    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1246    new edge as irreducible if IRREDUCIBLE_P is true.  */
1247
1248 static edge
1249 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1250                        basic_block guard_to, basic_block dom_bb,
1251                        profile_probability probability, bool irreducible_p)
1252 {
1253   gimple_stmt_iterator gsi;
1254   edge new_e, enter_e;
1255   gcond *cond_stmt;
1256   gimple_seq gimplify_stmt_list = NULL;
1257
1258   enter_e = EDGE_SUCC (guard_bb, 0);
1259   enter_e->flags &= ~EDGE_FALLTHRU;
1260   enter_e->flags |= EDGE_FALSE_VALUE;
1261   gsi = gsi_last_bb (guard_bb);
1262
1263   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1264                                  is_gimple_condexpr_for_cond, NULL_TREE);
1265   if (gimplify_stmt_list)
1266     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1267
1268   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1269   gsi = gsi_last_bb (guard_bb);
1270   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1271
1272   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1273   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1274
1275   new_e->probability = probability;
1276   if (irreducible_p)
1277     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1278
1279   enter_e->probability = probability.invert ();
1280   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1281
1282   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1283   if (enter_e->dest->loop_father->header == enter_e->dest)
1284     split_edge (enter_e);
1285
1286   return new_e;
1287 }
1288
1289
1290 /* This function verifies that the following restrictions apply to LOOP:
1291    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1292        for innermost loop and 5 basic blocks for outer-loop.
1293    (2) it is single entry, single exit
1294    (3) its exit condition is the last stmt in the header
1295    (4) E is the entry/exit edge of LOOP.
1296  */
1297
1298 bool
1299 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1300 {
1301   edge exit_e = single_exit (loop);
1302   edge entry_e = loop_preheader_edge (loop);
1303   gcond *orig_cond = get_loop_exit_condition (loop);
1304   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1305   unsigned int num_bb = loop->inner? 5 : 2;
1306
1307   /* All loops have an outer scope; the only case loop->outer is NULL is for
1308      the function itself.  */
1309   if (!loop_outer (loop)
1310       || loop->num_nodes != num_bb
1311       || !empty_block_p (loop->latch)
1312       || !single_exit (loop)
1313       /* Verify that new loop exit condition can be trivially modified.  */
1314       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1315       || (e != exit_e && e != entry_e))
1316     return false;
1317
1318   return true;
1319 }
1320
1321 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1322    in the exit bb and rename all the uses after the loop.  This simplifies
1323    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1324    (but normally loop closed SSA form doesn't require virtual PHIs to be
1325    in the same form).  Doing this early simplifies the checking what
1326    uses should be renamed.
1327
1328    If we create a new phi after the loop, return the definition that
1329    applies on entry to the loop, otherwise return null.  */
1330
1331 static tree
1332 create_lcssa_for_virtual_phi (class loop *loop)
1333 {
1334   gphi_iterator gsi;
1335   edge exit_e = single_exit (loop);
1336
1337   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1338     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1339       {
1340         gphi *phi = gsi.phi ();
1341         for (gsi = gsi_start_phis (exit_e->dest);
1342              !gsi_end_p (gsi); gsi_next (&gsi))
1343           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1344             break;
1345         if (gsi_end_p (gsi))
1346           {
1347             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1348             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1349             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1350             imm_use_iterator imm_iter;
1351             gimple *stmt;
1352             use_operand_p use_p;
1353
1354             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1355               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1356             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1357             gimple_phi_set_result (new_phi, new_vop);
1358             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1359               if (stmt != new_phi
1360                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1361                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1362                   SET_USE (use_p, new_vop);
1363
1364             return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1365           }
1366         break;
1367       }
1368   return NULL_TREE;
1369 }
1370
1371 /* Function vect_get_loop_location.
1372
1373    Extract the location of the loop in the source code.
1374    If the loop is not well formed for vectorization, an estimated
1375    location is calculated.
1376    Return the loop location if succeed and NULL if not.  */
1377
1378 dump_user_location_t
1379 find_loop_location (class loop *loop)
1380 {
1381   gimple *stmt = NULL;
1382   basic_block bb;
1383   gimple_stmt_iterator si;
1384
1385   if (!loop)
1386     return dump_user_location_t ();
1387
1388   stmt = get_loop_exit_condition (loop);
1389
1390   if (stmt
1391       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1392     return stmt;
1393
1394   /* If we got here the loop is probably not "well formed",
1395      try to estimate the loop location */
1396
1397   if (!loop->header)
1398     return dump_user_location_t ();
1399
1400   bb = loop->header;
1401
1402   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1403     {
1404       stmt = gsi_stmt (si);
1405       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1406         return stmt;
1407     }
1408
1409   return dump_user_location_t ();
1410 }
1411
1412 /* Return true if the phi described by STMT_INFO defines an IV of the
1413    loop to be vectorized.  */
1414
1415 static bool
1416 iv_phi_p (stmt_vec_info stmt_info)
1417 {
1418   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1419   if (virtual_operand_p (PHI_RESULT (phi)))
1420     return false;
1421
1422   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1423       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1424     return false;
1425
1426   return true;
1427 }
1428
1429 /* Function vect_can_advance_ivs_p
1430
1431    In case the number of iterations that LOOP iterates is unknown at compile
1432    time, an epilog loop will be generated, and the loop induction variables
1433    (IVs) will be "advanced" to the value they are supposed to take just before
1434    the epilog loop.  Here we check that the access function of the loop IVs
1435    and the expression that represents the loop bound are simple enough.
1436    These restrictions will be relaxed in the future.  */
1437
1438 bool
1439 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1440 {
1441   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1442   basic_block bb = loop->header;
1443   gphi_iterator gsi;
1444
1445   /* Analyze phi functions of the loop header.  */
1446
1447   if (dump_enabled_p ())
1448     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1449   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1450     {
1451       tree evolution_part;
1452
1453       gphi *phi = gsi.phi ();
1454       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1455       if (dump_enabled_p ())
1456         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1457                          phi_info->stmt);
1458
1459       /* Skip virtual phi's. The data dependences that are associated with
1460          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1461
1462          Skip reduction phis.  */
1463       if (!iv_phi_p (phi_info))
1464         {
1465           if (dump_enabled_p ())
1466             dump_printf_loc (MSG_NOTE, vect_location,
1467                              "reduc or virtual phi. skip.\n");
1468           continue;
1469         }
1470
1471       /* Analyze the evolution function.  */
1472
1473       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1474       if (evolution_part == NULL_TREE)
1475         {
1476           if (dump_enabled_p ())
1477             dump_printf (MSG_MISSED_OPTIMIZATION,
1478                          "No access function or evolution.\n");
1479           return false;
1480         }
1481
1482       /* FORNOW: We do not transform initial conditions of IVs
1483          which evolution functions are not invariants in the loop.  */
1484
1485       if (!expr_invariant_in_loop_p (loop, evolution_part))
1486         {
1487           if (dump_enabled_p ())
1488             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                              "evolution not invariant in loop.\n");
1490           return false;
1491         }
1492
1493       /* FORNOW: We do not transform initial conditions of IVs
1494          which evolution functions are a polynomial of degree >= 2.  */
1495
1496       if (tree_is_chrec (evolution_part))
1497         {
1498           if (dump_enabled_p ())
1499             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1500                              "evolution is chrec.\n");
1501           return false;
1502         }
1503     }
1504
1505   return true;
1506 }
1507
1508
1509 /*   Function vect_update_ivs_after_vectorizer.
1510
1511      "Advance" the induction variables of LOOP to the value they should take
1512      after the execution of LOOP.  This is currently necessary because the
1513      vectorizer does not handle induction variables that are used after the
1514      loop.  Such a situation occurs when the last iterations of LOOP are
1515      peeled, because:
1516      1. We introduced new uses after LOOP for IVs that were not originally used
1517         after LOOP: the IVs of LOOP are now used by an epilog loop.
1518      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1519         times, whereas the loop IVs should be bumped N times.
1520
1521      Input:
1522      - LOOP - a loop that is going to be vectorized. The last few iterations
1523               of LOOP were peeled.
1524      - NITERS - the number of iterations that LOOP executes (before it is
1525                 vectorized). i.e, the number of times the ivs should be bumped.
1526      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1527                   coming out from LOOP on which there are uses of the LOOP ivs
1528                   (this is the path from LOOP->exit to epilog_loop->preheader).
1529
1530                   The new definitions of the ivs are placed in LOOP->exit.
1531                   The phi args associated with the edge UPDATE_E in the bb
1532                   UPDATE_E->dest are updated accordingly.
1533
1534      Assumption 1: Like the rest of the vectorizer, this function assumes
1535      a single loop exit that has a single predecessor.
1536
1537      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1538      organized in the same order.
1539
1540      Assumption 3: The access function of the ivs is simple enough (see
1541      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1542
1543      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1544      coming out of LOOP on which the ivs of LOOP are used (this is the path
1545      that leads to the epilog loop; other paths skip the epilog loop).  This
1546      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1547      needs to have its phis updated.
1548  */
1549
1550 static void
1551 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1552                                   tree niters, edge update_e)
1553 {
1554   gphi_iterator gsi, gsi1;
1555   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1556   basic_block update_bb = update_e->dest;
1557   basic_block exit_bb = single_exit (loop)->dest;
1558
1559   /* Make sure there exists a single-predecessor exit bb:  */
1560   gcc_assert (single_pred_p (exit_bb));
1561   gcc_assert (single_succ_edge (exit_bb) == update_e);
1562
1563   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1564        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1565        gsi_next (&gsi), gsi_next (&gsi1))
1566     {
1567       tree init_expr;
1568       tree step_expr, off;
1569       tree type;
1570       tree var, ni, ni_name;
1571       gimple_stmt_iterator last_gsi;
1572
1573       gphi *phi = gsi.phi ();
1574       gphi *phi1 = gsi1.phi ();
1575       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1576       if (dump_enabled_p ())
1577         dump_printf_loc (MSG_NOTE, vect_location,
1578                          "vect_update_ivs_after_vectorizer: phi: %G", phi);
1579
1580       /* Skip reduction and virtual phis.  */
1581       if (!iv_phi_p (phi_info))
1582         {
1583           if (dump_enabled_p ())
1584             dump_printf_loc (MSG_NOTE, vect_location,
1585                              "reduc or virtual phi. skip.\n");
1586           continue;
1587         }
1588
1589       type = TREE_TYPE (gimple_phi_result (phi));
1590       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1591       step_expr = unshare_expr (step_expr);
1592
1593       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1594          of degree >= 2 or exponential.  */
1595       gcc_assert (!tree_is_chrec (step_expr));
1596
1597       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1598
1599       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1600                          fold_convert (TREE_TYPE (step_expr), niters),
1601                          step_expr);
1602       if (POINTER_TYPE_P (type))
1603         ni = fold_build_pointer_plus (init_expr, off);
1604       else
1605         ni = fold_build2 (PLUS_EXPR, type,
1606                           init_expr, fold_convert (type, off));
1607
1608       var = create_tmp_var (type, "tmp");
1609
1610       last_gsi = gsi_last_bb (exit_bb);
1611       gimple_seq new_stmts = NULL;
1612       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1613       /* Exit_bb shouldn't be empty.  */
1614       if (!gsi_end_p (last_gsi))
1615         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1616       else
1617         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1618
1619       /* Fix phi expressions in the successor bb.  */
1620       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1621     }
1622 }
1623
1624 /* Return a gimple value containing the misalignment (measured in vector
1625    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1626    it is away from a perfectly aligned address.  Add any new statements
1627    to SEQ.  */
1628
1629 static tree
1630 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1631 {
1632   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1633   stmt_vec_info stmt_info = dr_info->stmt;
1634   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1635
1636   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1637   unsigned HOST_WIDE_INT target_align_c;
1638   tree target_align_minus_1;
1639
1640   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1641                                         size_zero_node) < 0;
1642   tree offset = (negative
1643                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1644                              * TREE_INT_CST_LOW
1645                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1646                  : size_zero_node);
1647   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1648                                                           stmt_info, seq,
1649                                                           offset);
1650   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1651   if (target_align.is_constant (&target_align_c))
1652     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1653   else
1654     {
1655       tree vla = build_int_cst (type, target_align);
1656       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1657                                     fold_build2 (MINUS_EXPR, type,
1658                                                  build_int_cst (type, 0), vla));
1659       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1660                                           build_int_cst (type, 1));
1661     }
1662
1663   HOST_WIDE_INT elem_size
1664     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1665   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1666
1667   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1668   tree int_start_addr = fold_convert (type, start_addr);
1669   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1670                                         target_align_minus_1);
1671
1672   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1673   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1674                                         elem_size_log);
1675
1676   return misalign_in_elems;
1677 }
1678
1679 /* Function vect_gen_prolog_loop_niters
1680
1681    Generate the number of iterations which should be peeled as prolog for the
1682    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1683    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1684    As a result, after the execution of this loop, the data reference DR will
1685    refer to an aligned location.  The following computation is generated:
1686
1687    If the misalignment of DR is known at compile time:
1688      addr_mis = int mis = DR_MISALIGNMENT (dr);
1689    Else, compute address misalignment in bytes:
1690      addr_mis = addr & (target_align - 1)
1691
1692    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1693
1694    (elem_size = element type size; an element is the scalar element whose type
1695    is the inner type of the vectype)
1696
1697    The computations will be emitted at the end of BB.  We also compute and
1698    store upper bound (included) of the result in BOUND.
1699
1700    When the step of the data-ref in the loop is not 1 (as in interleaved data
1701    and SLP), the number of iterations of the prolog must be divided by the step
1702    (which is equal to the size of interleaved group).
1703
1704    The above formulas assume that VF == number of elements in the vector. This
1705    may not hold when there are multiple-types in the loop.
1706    In this case, for some data-references in the loop the VF does not represent
1707    the number of elements that fit in the vector.  Therefore, instead of VF we
1708    use TYPE_VECTOR_SUBPARTS.  */
1709
1710 static tree
1711 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1712                              basic_block bb, int *bound)
1713 {
1714   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1715   tree var;
1716   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1717   gimple_seq stmts = NULL, new_stmts = NULL;
1718   tree iters, iters_name;
1719   stmt_vec_info stmt_info = dr_info->stmt;
1720   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1721   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1722
1723   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1724     {
1725       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1726
1727       if (dump_enabled_p ())
1728         dump_printf_loc (MSG_NOTE, vect_location,
1729                          "known peeling = %d.\n", npeel);
1730
1731       iters = build_int_cst (niters_type, npeel);
1732       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1733     }
1734   else
1735     {
1736       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1737       tree type = TREE_TYPE (misalign_in_elems);
1738       HOST_WIDE_INT elem_size
1739         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1740       /* We only do prolog peeling if the target alignment is known at compile
1741          time.  */
1742       poly_uint64 align_in_elems =
1743         exact_div (target_align, elem_size);
1744       tree align_in_elems_minus_1 =
1745         build_int_cst (type, align_in_elems - 1);
1746       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1747
1748       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1749                                  & (align_in_elems - 1)).  */
1750       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1751                                             size_zero_node) < 0;
1752       if (negative)
1753         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1754                              align_in_elems_tree);
1755       else
1756         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1757                              misalign_in_elems);
1758       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1759       iters = fold_convert (niters_type, iters);
1760       unsigned HOST_WIDE_INT align_in_elems_c;
1761       if (align_in_elems.is_constant (&align_in_elems_c))
1762         *bound = align_in_elems_c - 1;
1763       else
1764         *bound = -1;
1765     }
1766
1767   if (dump_enabled_p ())
1768     dump_printf_loc (MSG_NOTE, vect_location,
1769                      "niters for prolog loop: %T\n", iters);
1770
1771   var = create_tmp_var (niters_type, "prolog_loop_niters");
1772   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1773
1774   if (new_stmts)
1775     gimple_seq_add_seq (&stmts, new_stmts);
1776   if (stmts)
1777     {
1778       gcc_assert (single_succ_p (bb));
1779       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1780       if (gsi_end_p (gsi))
1781         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1782       else
1783         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1784     }
1785   return iters_name;
1786 }
1787
1788
1789 /* Function vect_update_init_of_dr
1790
1791    If CODE is PLUS, the vector loop starts NITERS iterations after the
1792    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1793    iterations before the scalar one (using masking to skip inactive
1794    elements).  This function updates the information recorded in DR to
1795    account for the difference.  Specifically, it updates the OFFSET
1796    field of DR_INFO.  */
1797
1798 static void
1799 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1800 {
1801   struct data_reference *dr = dr_info->dr;
1802   tree offset = dr_info->offset;
1803   if (!offset)
1804     offset = build_zero_cst (sizetype);
1805
1806   niters = fold_build2 (MULT_EXPR, sizetype,
1807                         fold_convert (sizetype, niters),
1808                         fold_convert (sizetype, DR_STEP (dr)));
1809   offset = fold_build2 (code, sizetype,
1810                         fold_convert (sizetype, offset), niters);
1811   dr_info->offset = offset;
1812 }
1813
1814
1815 /* Function vect_update_inits_of_drs
1816
1817    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1818    CODE and NITERS are as for vect_update_inits_of_dr.  */
1819
1820 void
1821 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1822                           tree_code code)
1823 {
1824   unsigned int i;
1825   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1826   struct data_reference *dr;
1827
1828   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1829
1830   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1831      here, but since we might use these niters to update the epilogues niters
1832      and data references we can't insert them here as this definition might not
1833      always dominate its uses.  */
1834   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1835     niters = fold_convert (sizetype, niters);
1836
1837   FOR_EACH_VEC_ELT (datarefs, i, dr)
1838     {
1839       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1840       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1841           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1842         vect_update_init_of_dr (dr_info, niters, code);
1843     }
1844 }
1845
1846 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1847    by masking.  This involves calculating the number of iterations to
1848    be peeled and then aligning all memory references appropriately.  */
1849
1850 void
1851 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1852 {
1853   tree misalign_in_elems;
1854   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1855
1856   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1857
1858   /* From the information recorded in LOOP_VINFO get the number of iterations
1859      that need to be skipped via masking.  */
1860   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1861     {
1862       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1863                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1864       misalign_in_elems = build_int_cst (type, misalign);
1865     }
1866   else
1867     {
1868       gimple_seq seq1 = NULL, seq2 = NULL;
1869       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1870       misalign_in_elems = fold_convert (type, misalign_in_elems);
1871       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1872                                                 &seq2, true, NULL_TREE);
1873       gimple_seq_add_seq (&seq1, seq2);
1874       if (seq1)
1875         {
1876           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1877           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1878           gcc_assert (!new_bb);
1879         }
1880     }
1881
1882   if (dump_enabled_p ())
1883     dump_printf_loc (MSG_NOTE, vect_location,
1884                      "misalignment for fully-masked loop: %T\n",
1885                      misalign_in_elems);
1886
1887   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1888
1889   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1890 }
1891
1892 /* This function builds ni_name = number of iterations.  Statements
1893    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1894    it to TRUE if new ssa_var is generated.  */
1895
1896 tree
1897 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1898 {
1899   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1900   if (TREE_CODE (ni) == INTEGER_CST)
1901     return ni;
1902   else
1903     {
1904       tree ni_name, var;
1905       gimple_seq stmts = NULL;
1906       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1907
1908       var = create_tmp_var (TREE_TYPE (ni), "niters");
1909       ni_name = force_gimple_operand (ni, &stmts, false, var);
1910       if (stmts)
1911         {
1912           gsi_insert_seq_on_edge_immediate (pe, stmts);
1913           if (new_var_p != NULL)
1914             *new_var_p = true;
1915         }
1916
1917       return ni_name;
1918     }
1919 }
1920
1921 /* Calculate the number of iterations above which vectorized loop will be
1922    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1923    of prolog loop.  If it's integer const, the integer number is also passed
1924    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1925    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1926    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1927    threshold below which the scalar (rather than vectorized) loop will be
1928    executed.  This function stores the upper bound (inclusive) of the result
1929    in BOUND_SCALAR.  */
1930
1931 static tree
1932 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1933                              int bound_prolog, poly_int64 bound_epilog, int th,
1934                              poly_uint64 *bound_scalar,
1935                              bool check_profitability)
1936 {
1937   tree type = TREE_TYPE (niters_prolog);
1938   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1939                              build_int_cst (type, bound_epilog));
1940
1941   *bound_scalar = bound_prolog + bound_epilog;
1942   if (check_profitability)
1943     {
1944       /* TH indicates the minimum niters of vectorized loop, while we
1945          compute the maximum niters of scalar loop.  */
1946       th--;
1947       /* Peeling for constant times.  */
1948       if (int_niters_prolog >= 0)
1949         {
1950           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1951           return build_int_cst (type, *bound_scalar);
1952         }
1953       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1954          and BOUND_EPILOG are inclusive upper bounds.  */
1955       if (known_ge (th, bound_prolog + bound_epilog))
1956         {
1957           *bound_scalar = th;
1958           return build_int_cst (type, th);
1959         }
1960       /* Need to do runtime comparison.  */
1961       else if (maybe_gt (th, bound_epilog))
1962         {
1963           *bound_scalar = upper_bound (*bound_scalar, th);
1964           return fold_build2 (MAX_EXPR, type,
1965                               build_int_cst (type, th), niters);
1966         }
1967     }
1968   return niters;
1969 }
1970
1971 /* NITERS is the number of times that the original scalar loop executes
1972    after peeling.  Work out the maximum number of iterations N that can
1973    be handled by the vectorized form of the loop and then either:
1974
1975    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1976
1977         niters_vector = N
1978
1979    b) set *STEP_VECTOR_PTR to one and generate:
1980
1981         niters_vector = N / vf
1982
1983    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1984    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1985    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1986
1987 void
1988 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1989                              tree *niters_vector_ptr, tree *step_vector_ptr,
1990                              bool niters_no_overflow)
1991 {
1992   tree ni_minus_gap, var;
1993   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1994   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1995   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1996   tree log_vf = NULL_TREE;
1997
1998   /* If epilogue loop is required because of data accesses with gaps, we
1999      subtract one iteration from the total number of iterations here for
2000      correct calculation of RATIO.  */
2001   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2002     {
2003       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2004                                   build_one_cst (type));
2005       if (!is_gimple_val (ni_minus_gap))
2006         {
2007           var = create_tmp_var (type, "ni_gap");
2008           gimple *stmts = NULL;
2009           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2010                                                true, var);
2011           gsi_insert_seq_on_edge_immediate (pe, stmts);
2012         }
2013     }
2014   else
2015     ni_minus_gap = niters;
2016
2017   unsigned HOST_WIDE_INT const_vf;
2018   if (vf.is_constant (&const_vf)
2019       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2020     {
2021       /* Create: niters >> log2(vf) */
2022       /* If it's known that niters == number of latch executions + 1 doesn't
2023          overflow, we can generate niters >> log2(vf); otherwise we generate
2024          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2025          will be at least one.  */
2026       log_vf = build_int_cst (type, exact_log2 (const_vf));
2027       if (niters_no_overflow)
2028         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2029       else
2030         niters_vector
2031           = fold_build2 (PLUS_EXPR, type,
2032                          fold_build2 (RSHIFT_EXPR, type,
2033                                       fold_build2 (MINUS_EXPR, type,
2034                                                    ni_minus_gap,
2035                                                    build_int_cst (type, vf)),
2036                                       log_vf),
2037                          build_int_cst (type, 1));
2038       step_vector = build_one_cst (type);
2039     }
2040   else
2041     {
2042       niters_vector = ni_minus_gap;
2043       step_vector = build_int_cst (type, vf);
2044     }
2045
2046   if (!is_gimple_val (niters_vector))
2047     {
2048       var = create_tmp_var (type, "bnd");
2049       gimple_seq stmts = NULL;
2050       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2051       gsi_insert_seq_on_edge_immediate (pe, stmts);
2052       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2053          we set range information to make niters analyzer's life easier.
2054          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2055          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2056       if (stmts != NULL && log_vf)
2057         {
2058           if (niters_no_overflow)
2059             {
2060               value_range vr (type,
2061                               wi::one (TYPE_PRECISION (type)),
2062                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2063                                                          TYPE_SIGN (type)),
2064                                           exact_log2 (const_vf),
2065                                           TYPE_SIGN (type)));
2066               set_range_info (niters_vector, vr);
2067             }
2068           /* For VF == 1 the vector IV might also overflow so we cannot
2069              assert a minimum value of 1.  */
2070           else if (const_vf > 1)
2071             {
2072               value_range vr (type,
2073                               wi::one (TYPE_PRECISION (type)),
2074                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2075                                                          TYPE_SIGN (type))
2076                                           - (const_vf - 1),
2077                                           exact_log2 (const_vf), TYPE_SIGN (type))
2078                               + 1);
2079               set_range_info (niters_vector, vr);
2080             }
2081         }
2082     }
2083   *niters_vector_ptr = niters_vector;
2084   *step_vector_ptr = step_vector;
2085
2086   return;
2087 }
2088
2089 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2090    loop specified by LOOP_VINFO after vectorization, compute the number
2091    of iterations before vectorization (niters_vector * vf) and store it
2092    to NITERS_VECTOR_MULT_VF_PTR.  */
2093
2094 static void
2095 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2096                                      tree niters_vector,
2097                                      tree *niters_vector_mult_vf_ptr)
2098 {
2099   /* We should be using a step_vector of VF if VF is variable.  */
2100   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2101   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2102   tree type = TREE_TYPE (niters_vector);
2103   tree log_vf = build_int_cst (type, exact_log2 (vf));
2104   basic_block exit_bb = single_exit (loop)->dest;
2105
2106   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2107   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2108                                             niters_vector, log_vf);
2109   if (!is_gimple_val (niters_vector_mult_vf))
2110     {
2111       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2112       gimple_seq stmts = NULL;
2113       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2114                                                     &stmts, true, var);
2115       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2116       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2117     }
2118   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2119 }
2120
2121 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2122    this function searches for the corresponding lcssa phi node in exit
2123    bb of LOOP.  If it is found, return the phi result; otherwise return
2124    NULL.  */
2125
2126 static tree
2127 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2128                 gphi *lcssa_phi)
2129 {
2130   gphi_iterator gsi;
2131   edge e = single_exit (loop);
2132
2133   gcc_assert (single_pred_p (e->dest));
2134   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2135     {
2136       gphi *phi = gsi.phi ();
2137       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2138                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2139         return PHI_RESULT (phi);
2140     }
2141   return NULL_TREE;
2142 }
2143
2144 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2145    from SECOND/FIRST and puts it at the original loop's preheader/exit
2146    edge, the two loops are arranged as below:
2147
2148        preheader_a:
2149      first_loop:
2150        header_a:
2151          i_1 = PHI<i_0, i_2>;
2152          ...
2153          i_2 = i_1 + 1;
2154          if (cond_a)
2155            goto latch_a;
2156          else
2157            goto between_bb;
2158        latch_a:
2159          goto header_a;
2160
2161        between_bb:
2162          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2163
2164      second_loop:
2165        header_b:
2166          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2167                                  or with i_2 if no LCSSA phi is created
2168                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2169          ...
2170          i_4 = i_3 + 1;
2171          if (cond_b)
2172            goto latch_b;
2173          else
2174            goto exit_bb;
2175        latch_b:
2176          goto header_b;
2177
2178        exit_bb:
2179
2180    This function creates loop closed SSA for the first loop; update the
2181    second loop's PHI nodes by replacing argument on incoming edge with the
2182    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2183    is false, Loop closed ssa phis will only be created for non-iv phis for
2184    the first loop.
2185
2186    This function assumes exit bb of the first loop is preheader bb of the
2187    second loop, i.e, between_bb in the example code.  With PHIs updated,
2188    the second loop will execute rest iterations of the first.  */
2189
2190 static void
2191 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2192                                    class loop *first, class loop *second,
2193                                    bool create_lcssa_for_iv_phis)
2194 {
2195   gphi_iterator gsi_update, gsi_orig;
2196   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2197
2198   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2199   edge second_preheader_e = loop_preheader_edge (second);
2200   basic_block between_bb = single_exit (first)->dest;
2201
2202   gcc_assert (between_bb == second_preheader_e->src);
2203   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2204   /* Either the first loop or the second is the loop to be vectorized.  */
2205   gcc_assert (loop == first || loop == second);
2206
2207   for (gsi_orig = gsi_start_phis (first->header),
2208        gsi_update = gsi_start_phis (second->header);
2209        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2210        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2211     {
2212       gphi *orig_phi = gsi_orig.phi ();
2213       gphi *update_phi = gsi_update.phi ();
2214
2215       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2216       /* Generate lcssa PHI node for the first loop.  */
2217       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2218       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2219       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2220         {
2221           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2222           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2223           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2224           arg = new_res;
2225         }
2226
2227       /* Update PHI node in the second loop by replacing arg on the loop's
2228          incoming edge.  */
2229       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2230     }
2231
2232   /* For epilogue peeling we have to make sure to copy all LC PHIs
2233      for correct vectorization of live stmts.  */
2234   if (loop == first)
2235     {
2236       basic_block orig_exit = single_exit (second)->dest;
2237       for (gsi_orig = gsi_start_phis (orig_exit);
2238            !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2239         {
2240           gphi *orig_phi = gsi_orig.phi ();
2241           tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2242           if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2243             continue;
2244
2245           /* Already created in the above loop.   */
2246           if (find_guard_arg (first, second, orig_phi))
2247             continue;
2248
2249           tree new_res = copy_ssa_name (orig_arg);
2250           gphi *lcphi = create_phi_node (new_res, between_bb);
2251           add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2252         }
2253     }
2254 }
2255
2256 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2257    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2258    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2259    appear like below:
2260
2261        guard_bb:
2262          if (cond)
2263            goto merge_bb;
2264          else
2265            goto skip_loop;
2266
2267      skip_loop:
2268        header_a:
2269          i_1 = PHI<i_0, i_2>;
2270          ...
2271          i_2 = i_1 + 1;
2272          if (cond_a)
2273            goto latch_a;
2274          else
2275            goto exit_a;
2276        latch_a:
2277          goto header_a;
2278
2279        exit_a:
2280          i_5 = PHI<i_2>;
2281
2282        merge_bb:
2283          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2284
2285      update_loop:
2286        header_b:
2287          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2288          ...
2289          i_4 = i_3 + 1;
2290          if (cond_b)
2291            goto latch_b;
2292          else
2293            goto exit_bb;
2294        latch_b:
2295          goto header_b;
2296
2297        exit_bb:
2298
2299    This function creates PHI nodes at merge_bb and replaces the use of i_5
2300    in the update_loop's PHI node with the result of new PHI result.  */
2301
2302 static void
2303 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2304                                     class loop *update_loop,
2305                                     edge guard_edge, edge merge_edge)
2306 {
2307   location_t merge_loc, guard_loc;
2308   edge orig_e = loop_preheader_edge (skip_loop);
2309   edge update_e = loop_preheader_edge (update_loop);
2310   gphi_iterator gsi_orig, gsi_update;
2311
2312   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2313         gsi_update = gsi_start_phis (update_loop->header));
2314        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2315        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2316     {
2317       gphi *orig_phi = gsi_orig.phi ();
2318       gphi *update_phi = gsi_update.phi ();
2319
2320       /* Generate new phi node at merge bb of the guard.  */
2321       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2322       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2323
2324       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2325          args in NEW_PHI for these edges.  */
2326       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2327       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2328       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2329       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2330       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2331       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2332
2333       /* Update phi in UPDATE_PHI.  */
2334       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2335     }
2336 }
2337
2338 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2339    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2340    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2341    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2342    The CFG looks like:
2343
2344      loop:
2345        header_a:
2346          i_1 = PHI<i_0, i_2>;
2347          ...
2348          i_2 = i_1 + 1;
2349          if (cond_a)
2350            goto latch_a;
2351          else
2352            goto exit_a;
2353        latch_a:
2354          goto header_a;
2355
2356        exit_a:
2357
2358        guard_bb:
2359          if (cond)
2360            goto merge_bb;
2361          else
2362            goto epilog_loop;
2363
2364        ;; fall_through_bb
2365
2366      epilog_loop:
2367        header_b:
2368          i_3 = PHI<i_2, i_4>;
2369          ...
2370          i_4 = i_3 + 1;
2371          if (cond_b)
2372            goto latch_b;
2373          else
2374            goto merge_bb;
2375        latch_b:
2376          goto header_b;
2377
2378        merge_bb:
2379          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2380
2381        exit_bb:
2382          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2383
2384    For each name used out side EPILOG (i.e - for each name that has a lcssa
2385    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2386    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2387    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2388    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2389    in exit_bb will also be updated.  */
2390
2391 static void
2392 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2393                                     edge guard_edge, edge merge_edge)
2394 {
2395   gphi_iterator gsi;
2396   basic_block merge_bb = guard_edge->dest;
2397
2398   gcc_assert (single_succ_p (merge_bb));
2399   edge e = single_succ_edge (merge_bb);
2400   basic_block exit_bb = e->dest;
2401   gcc_assert (single_pred_p (exit_bb));
2402   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2403
2404   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2405     {
2406       gphi *update_phi = gsi.phi ();
2407       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2408
2409       tree merge_arg = NULL_TREE;
2410
2411       /* If the old argument is a SSA_NAME use its current_def.  */
2412       if (TREE_CODE (old_arg) == SSA_NAME)
2413         merge_arg = get_current_def (old_arg);
2414       /* If it's a constant or doesn't have a current_def, just use the old
2415          argument.  */
2416       if (!merge_arg)
2417         merge_arg = old_arg;
2418
2419       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2420       /* If the var is live after loop but not a reduction, we simply
2421          use the old arg.  */
2422       if (!guard_arg)
2423         guard_arg = old_arg;
2424
2425       /* Create new phi node in MERGE_BB:  */
2426       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2427       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2428
2429       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2430          the two PHI args in merge_phi for these edges.  */
2431       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2432       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2433
2434       /* Update the original phi in exit_bb.  */
2435       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2436     }
2437 }
2438
2439 /* EPILOG loop is duplicated from the original loop for vectorizing,
2440    the arg of its loop closed ssa PHI needs to be updated.  */
2441
2442 static void
2443 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2444 {
2445   gphi_iterator gsi;
2446   basic_block exit_bb = single_exit (epilog)->dest;
2447
2448   gcc_assert (single_pred_p (exit_bb));
2449   edge e = EDGE_PRED (exit_bb, 0);
2450   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2451     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2452 }
2453
2454 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2455    iterate exactly CONST_NITERS times.  Make a final decision about
2456    whether the epilogue loop should be used, returning true if so.  */
2457
2458 static bool
2459 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2460                              unsigned HOST_WIDE_INT const_niters)
2461 {
2462   /* Avoid wrap-around when computing const_niters - 1.  Also reject
2463      using an epilogue loop for a single scalar iteration, even if
2464      we could in principle implement that using partial vectors.  */
2465   unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2466   if (const_niters <= gap_niters + 1)
2467     return false;
2468
2469   /* Install the number of iterations.  */
2470   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2471   tree niters_tree = build_int_cst (niters_type, const_niters);
2472   tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2473
2474   LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2475   LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2476
2477   /* Decide what to do if the number of epilogue iterations is not
2478      a multiple of the epilogue loop's vectorization factor.  */
2479   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2480 }
2481
2482 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2483    Return a value that equals:
2484
2485    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2486    - SKIP_VALUE when the main loop is skipped.  */
2487
2488 tree
2489 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2490                            tree skip_value)
2491 {
2492   gcc_assert (loop_vinfo->main_loop_edge);
2493
2494   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2495   basic_block bb = loop_vinfo->main_loop_edge->dest;
2496   gphi *new_phi = create_phi_node (phi_result, bb);
2497   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2498                UNKNOWN_LOCATION);
2499   add_phi_arg (new_phi, skip_value,
2500                loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2501   return phi_result;
2502 }
2503
2504 /* Function vect_do_peeling.
2505
2506    Input:
2507    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2508
2509        preheader:
2510      LOOP:
2511        header_bb:
2512          loop_body
2513          if (exit_loop_cond) goto exit_bb
2514          else                goto header_bb
2515        exit_bb:
2516
2517    - NITERS: The number of iterations of the loop.
2518    - NITERSM1: The number of iterations of the loop's latch.
2519    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2520    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2521                               CHECK_PROFITABILITY is true.
2522    Output:
2523    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2524      iterate after vectorization; see vect_set_loop_condition for details.
2525    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2526      should be set to the number of scalar iterations handled by the
2527      vector loop.  The SSA name is only used on exit from the loop.
2528
2529    This function peels prolog and epilog from the loop, adds guards skipping
2530    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2531    would look like:
2532
2533        guard_bb_1:
2534          if (prefer_scalar_loop) goto merge_bb_1
2535          else                    goto guard_bb_2
2536
2537        guard_bb_2:
2538          if (skip_prolog) goto merge_bb_2
2539          else             goto prolog_preheader
2540
2541        prolog_preheader:
2542      PROLOG:
2543        prolog_header_bb:
2544          prolog_body
2545          if (exit_prolog_cond) goto prolog_exit_bb
2546          else                  goto prolog_header_bb
2547        prolog_exit_bb:
2548
2549        merge_bb_2:
2550
2551        vector_preheader:
2552      VECTOR LOOP:
2553        vector_header_bb:
2554          vector_body
2555          if (exit_vector_cond) goto vector_exit_bb
2556          else                  goto vector_header_bb
2557        vector_exit_bb:
2558
2559        guard_bb_3:
2560          if (skip_epilog) goto merge_bb_3
2561          else             goto epilog_preheader
2562
2563        merge_bb_1:
2564
2565        epilog_preheader:
2566      EPILOG:
2567        epilog_header_bb:
2568          epilog_body
2569          if (exit_epilog_cond) goto merge_bb_3
2570          else                  goto epilog_header_bb
2571
2572        merge_bb_3:
2573
2574    Note this function peels prolog and epilog only if it's necessary,
2575    as well as guards.
2576    This function returns the epilogue loop if a decision was made to vectorize
2577    it, otherwise NULL.
2578
2579    The analysis resulting in this epilogue loop's loop_vec_info was performed
2580    in the same vect_analyze_loop call as the main loop's.  At that time
2581    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2582    vectorization factors than the main loop.  This list is stored in the main
2583    loop's loop_vec_info in the 'epilogue_vinfos' member.  Everytime we decide to
2584    vectorize the epilogue loop for a lower vectorization factor,  the
2585    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2586    updated and linked to the epilogue loop.  This is later used to vectorize
2587    the epilogue.  The reason the loop_vec_info needs updating is that it was
2588    constructed based on the original main loop, and the epilogue loop is a
2589    copy of this loop, so all links pointing to statements in the original loop
2590    need updating.  Furthermore, these loop_vec_infos share the
2591    data_reference's records, which will also need to be updated.
2592
2593    TODO: Guard for prefer_scalar_loop should be emitted along with
2594    versioning conditions if loop versioning is needed.  */
2595
2596
2597 class loop *
2598 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2599                  tree *niters_vector, tree *step_vector,
2600                  tree *niters_vector_mult_vf_var, int th,
2601                  bool check_profitability, bool niters_no_overflow,
2602                  tree *advance)
2603 {
2604   edge e, guard_e;
2605   tree type = TREE_TYPE (niters), guard_cond;
2606   basic_block guard_bb, guard_to;
2607   profile_probability prob_prolog, prob_vector, prob_epilog;
2608   int estimated_vf;
2609   int prolog_peeling = 0;
2610   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2611   bool vect_epilogues_updated_niters = false;
2612   /* We currently do not support prolog peeling if the target alignment is not
2613      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2614      target alignment being constant.  */
2615   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2616   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2617     return NULL;
2618
2619   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2620     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2621
2622   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2623   poly_uint64 bound_epilog = 0;
2624   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2625       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2626     bound_epilog += vf - 1;
2627   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2628     bound_epilog += 1;
2629   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2630   poly_uint64 bound_scalar = bound_epilog;
2631
2632   if (!prolog_peeling && !epilog_peeling)
2633     return NULL;
2634
2635   /* Before doing any peeling make sure to reset debug binds outside of
2636      the loop refering to defs not in LC SSA.  */
2637   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2638   for (unsigned i = 0; i < loop->num_nodes; ++i)
2639     {
2640       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2641       imm_use_iterator ui;
2642       gimple *use_stmt;
2643       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2644            gsi_next (&gsi))
2645         {
2646           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2647             if (gimple_debug_bind_p (use_stmt)
2648                 && loop != gimple_bb (use_stmt)->loop_father
2649                 && !flow_loop_nested_p (loop,
2650                                         gimple_bb (use_stmt)->loop_father))
2651               {
2652                 gimple_debug_bind_reset_value (use_stmt);
2653                 update_stmt (use_stmt);
2654               }
2655         }
2656       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2657            gsi_next (&gsi))
2658         {
2659           ssa_op_iter op_iter;
2660           def_operand_p def_p;
2661           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2662             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2663               if (gimple_debug_bind_p (use_stmt)
2664                   && loop != gimple_bb (use_stmt)->loop_father
2665                   && !flow_loop_nested_p (loop,
2666                                           gimple_bb (use_stmt)->loop_father))
2667                 {
2668                   gimple_debug_bind_reset_value (use_stmt);
2669                   update_stmt (use_stmt);
2670                 }
2671         }
2672     }
2673
2674   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2675   estimated_vf = vect_vf_for_cost (loop_vinfo);
2676   if (estimated_vf == 2)
2677     estimated_vf = 3;
2678   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2679                         .apply_scale (estimated_vf - 1, estimated_vf);
2680
2681   class loop *prolog, *epilog = NULL;
2682   class loop *first_loop = loop;
2683   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2684
2685   /* We might have a queued need to update virtual SSA form.  As we
2686      delete the update SSA machinery below after doing a regular
2687      incremental SSA update during loop copying make sure we don't
2688      lose that fact.
2689      ???  Needing to update virtual SSA form by renaming is unfortunate
2690      but not all of the vectorizer code inserting new loads / stores
2691      properly assigns virtual operands to those statements.  */
2692   update_ssa (TODO_update_ssa_only_virtuals);
2693
2694   create_lcssa_for_virtual_phi (loop);
2695
2696   /* If we're vectorizing an epilogue loop, the update_ssa above will
2697      have ensured that the virtual operand is in SSA form throughout the
2698      vectorized main loop.  Normally it is possible to trace the updated
2699      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2700      back to scalar-stmt vuses, meaning that the effect of the SSA update
2701      remains local to the main loop.  However, there are rare cases in
2702      which the vectorized loop has vdefs even when the original scalar
2703      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2704      introduces clobbers of the temporary vector array, which in turn
2705      needs new vdefs.  If the scalar loop doesn't write to memory, these
2706      new vdefs will be the only ones in the vector loop.
2707
2708      In that case, update_ssa will have added a new virtual phi to the
2709      main loop, which previously didn't need one.  Ensure that we (locally)
2710      maintain LCSSA form for the virtual operand, just as we would have
2711      done if the virtual phi had existed from the outset.  This makes it
2712      easier to duplicate the scalar epilogue loop below.  */
2713   tree vop_to_rename = NULL_TREE;
2714   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2715     {
2716       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2717       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2718     }
2719
2720   if (MAY_HAVE_DEBUG_BIND_STMTS)
2721     {
2722       gcc_assert (!adjust_vec.exists ());
2723       adjust_vec.create (32);
2724     }
2725   initialize_original_copy_tables ();
2726
2727   /* Record the anchor bb at which the guard should be placed if the scalar
2728      loop might be preferred.  */
2729   basic_block anchor = loop_preheader_edge (loop)->src;
2730
2731   /* Generate the number of iterations for the prolog loop.  We do this here
2732      so that we can also get the upper bound on the number of iterations.  */
2733   tree niters_prolog;
2734   int bound_prolog = 0;
2735   if (prolog_peeling)
2736     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2737                                                   &bound_prolog);
2738   else
2739     niters_prolog = build_int_cst (type, 0);
2740
2741   loop_vec_info epilogue_vinfo = NULL;
2742   if (vect_epilogues)
2743     {
2744       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2745       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2746     }
2747
2748   tree niters_vector_mult_vf = NULL_TREE;
2749   /* Saving NITERs before the loop, as this may be changed by prologue.  */
2750   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2751   edge update_e = NULL, skip_e = NULL;
2752   unsigned int lowest_vf = constant_lower_bound (vf);
2753   /* If we know the number of scalar iterations for the main loop we should
2754      check whether after the main loop there are enough iterations left over
2755      for the epilogue.  */
2756   if (vect_epilogues
2757       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2758       && prolog_peeling >= 0
2759       && known_eq (vf, lowest_vf))
2760     {
2761       unsigned HOST_WIDE_INT eiters
2762         = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2763            - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2764
2765       eiters -= prolog_peeling;
2766       eiters
2767         = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2768
2769       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2770         {
2771           delete epilogue_vinfo;
2772           epilogue_vinfo = NULL;
2773           if (loop_vinfo->epilogue_vinfos.length () == 0)
2774             {
2775               vect_epilogues = false;
2776               break;
2777             }
2778           epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2779           loop_vinfo->epilogue_vinfos.ordered_remove (0);
2780         }
2781       vect_epilogues_updated_niters = true;
2782     }
2783   /* Prolog loop may be skipped.  */
2784   bool skip_prolog = (prolog_peeling != 0);
2785   /* Skip this loop to epilog when there are not enough iterations to enter this
2786      vectorized loop.  If true we should perform runtime checks on the NITERS
2787      to check whether we should skip the current vectorized loop.  If we know
2788      the number of scalar iterations we may choose to add a runtime check if
2789      this number "maybe" smaller than the number of iterations required
2790      when we know the number of scalar iterations may potentially
2791      be smaller than the number of iterations required to enter this loop, for
2792      this we use the upper bounds on the prolog and epilog peeling.  When we
2793      don't know the number of iterations and don't require versioning it is
2794      because we have asserted that there are enough scalar iterations to enter
2795      the main loop, so this skip is not necessary.  When we are versioning then
2796      we only add such a skip if we have chosen to vectorize the epilogue.  */
2797   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2798                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2799                                   bound_prolog + bound_epilog)
2800                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2801                          || vect_epilogues));
2802   /* Epilog loop must be executed if the number of iterations for epilog
2803      loop is known at compile time, otherwise we need to add a check at
2804      the end of vector loop and skip to the end of epilog loop.  */
2805   bool skip_epilog = (prolog_peeling < 0
2806                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2807                       || !vf.is_constant ());
2808   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2809   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2810     skip_epilog = false;
2811
2812   if (skip_vector)
2813     {
2814       split_edge (loop_preheader_edge (loop));
2815
2816       /* Due to the order in which we peel prolog and epilog, we first
2817          propagate probability to the whole loop.  The purpose is to
2818          avoid adjusting probabilities of both prolog and vector loops
2819          separately.  Note in this case, the probability of epilog loop
2820          needs to be scaled back later.  */
2821       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2822       if (prob_vector.initialized_p ())
2823         {
2824           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2825           scale_loop_profile (loop, prob_vector, 0);
2826         }
2827     }
2828
2829   dump_user_location_t loop_loc = find_loop_location (loop);
2830   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2831   if (vect_epilogues)
2832     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2833        use the original scalar loop as remaining epilogue if necessary.  */
2834     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2835       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2836
2837   if (prolog_peeling)
2838     {
2839       e = loop_preheader_edge (loop);
2840       if (!slpeel_can_duplicate_loop_p (loop, e))
2841         {
2842           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2843                            "loop can't be duplicated to preheader edge.\n");
2844           gcc_unreachable ();
2845         }
2846       /* Peel prolog and put it on preheader edge of loop.  */
2847       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2848       if (!prolog)
2849         {
2850           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2851                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2852           gcc_unreachable ();
2853         }
2854       prolog->force_vectorize = false;
2855       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2856       first_loop = prolog;
2857       reset_original_copy_tables ();
2858
2859       /* Update the number of iterations for prolog loop.  */
2860       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2861       vect_set_loop_condition (prolog, NULL, niters_prolog,
2862                                step_prolog, NULL_TREE, false);
2863
2864       /* Skip the prolog loop.  */
2865       if (skip_prolog)
2866         {
2867           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2868                                     niters_prolog, build_int_cst (type, 0));
2869           guard_bb = loop_preheader_edge (prolog)->src;
2870           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2871           guard_to = split_edge (loop_preheader_edge (loop));
2872           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2873                                            guard_to, guard_bb,
2874                                            prob_prolog.invert (),
2875                                            irred_flag);
2876           e = EDGE_PRED (guard_to, 0);
2877           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2878           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2879
2880           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2881           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2882         }
2883
2884       /* Update init address of DRs.  */
2885       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2886       /* Update niters for vector loop.  */
2887       LOOP_VINFO_NITERS (loop_vinfo)
2888         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2889       LOOP_VINFO_NITERSM1 (loop_vinfo)
2890         = fold_build2 (MINUS_EXPR, type,
2891                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2892       bool new_var_p = false;
2893       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2894       /* It's guaranteed that vector loop bound before vectorization is at
2895          least VF, so set range information for newly generated var.  */
2896       if (new_var_p)
2897         {
2898           value_range vr (type,
2899                           wi::to_wide (build_int_cst (type, vf)),
2900                           wi::to_wide (TYPE_MAX_VALUE (type)));
2901           set_range_info (niters, vr);
2902         }
2903
2904       /* Prolog iterates at most bound_prolog times, latch iterates at
2905          most bound_prolog - 1 times.  */
2906       record_niter_bound (prolog, bound_prolog - 1, false, true);
2907       delete_update_ssa ();
2908       adjust_vec_debug_stmts ();
2909       scev_reset ();
2910     }
2911
2912   if (epilog_peeling)
2913     {
2914       e = single_exit (loop);
2915       if (!slpeel_can_duplicate_loop_p (loop, e))
2916         {
2917           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2918                            "loop can't be duplicated to exit edge.\n");
2919           gcc_unreachable ();
2920         }
2921       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2922          said epilog then we should use a copy of the main loop as a starting
2923          point.  This loop may have already had some preliminary transformations
2924          to allow for more optimal vectorization, for example if-conversion.
2925          If we are not vectorizing the epilog then we should use the scalar loop
2926          as the transformations mentioned above make less or no sense when not
2927          vectorizing.  */
2928       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2929       if (vop_to_rename)
2930         {
2931           /* Vectorizing the main loop can sometimes introduce a vdef to
2932              a loop that previously didn't have one; see the comment above
2933              the definition of VOP_TO_RENAME for details.  The definition
2934              D that holds on E will then be different from the definition
2935              VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2936              rename VOP_TO_RENAME to D when copying the loop.
2937
2938              The virtual operand is in LCSSA form for the main loop,
2939              and no stmt between the main loop and E needs a vdef,
2940              so we know that D is provided by a phi rather than by a
2941              vdef on a normal gimple stmt.  */
2942           basic_block vdef_bb = e->src;
2943           gphi *vphi;
2944           while (!(vphi = get_virtual_phi (vdef_bb)))
2945             vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2946           gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2947           set_current_def (vop_to_rename, gimple_phi_result (vphi));
2948         }
2949       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2950       if (!epilog)
2951         {
2952           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2953                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2954           gcc_unreachable ();
2955         }
2956       epilog->force_vectorize = false;
2957       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2958
2959       /* Scalar version loop may be preferred.  In this case, add guard
2960          and skip to epilog.  Note this only happens when the number of
2961          iterations of loop is unknown at compile time, otherwise this
2962          won't be vectorized.  */
2963       if (skip_vector)
2964         {
2965           /* Additional epilogue iteration is peeled if gap exists.  */
2966           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2967                                                 bound_prolog, bound_epilog,
2968                                                 th, &bound_scalar,
2969                                                 check_profitability);
2970           /* Build guard against NITERSM1 since NITERS may overflow.  */
2971           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2972           guard_bb = anchor;
2973           guard_to = split_edge (loop_preheader_edge (epilog));
2974           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2975                                            guard_to, guard_bb,
2976                                            prob_vector.invert (),
2977                                            irred_flag);
2978           skip_e = guard_e;
2979           e = EDGE_PRED (guard_to, 0);
2980           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2981           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2982
2983           /* Simply propagate profile info from guard_bb to guard_to which is
2984              a merge point of control flow.  */
2985           guard_to->count = guard_bb->count;
2986
2987           /* Scale probability of epilog loop back.
2988              FIXME: We should avoid scaling down and back up.  Profile may
2989              get lost if we scale down to 0.  */
2990           basic_block *bbs = get_loop_body (epilog);
2991           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2992             bbs[i]->count = bbs[i]->count.apply_scale
2993                                  (bbs[i]->count,
2994                                   bbs[i]->count.apply_probability
2995                                     (prob_vector));
2996           free (bbs);
2997         }
2998
2999       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
3000       /* If loop is peeled for non-zero constant times, now niters refers to
3001          orig_niters - prolog_peeling, it won't overflow even the orig_niters
3002          overflows.  */
3003       niters_no_overflow |= (prolog_peeling > 0);
3004       vect_gen_vector_loop_niters (loop_vinfo, niters,
3005                                    niters_vector, step_vector,
3006                                    niters_no_overflow);
3007       if (!integer_onep (*step_vector))
3008         {
3009           /* On exit from the loop we will have an easy way of calcalating
3010              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
3011              until then.  */
3012           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3013           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3014           *niters_vector_mult_vf_var = niters_vector_mult_vf;
3015         }
3016       else
3017         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3018                                              &niters_vector_mult_vf);
3019       /* Update IVs of original loop as if they were advanced by
3020          niters_vector_mult_vf steps.  */
3021       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3022       update_e = skip_vector ? e : loop_preheader_edge (epilog);
3023       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3024                                         update_e);
3025
3026       if (skip_epilog)
3027         {
3028           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3029                                     niters, niters_vector_mult_vf);
3030           guard_bb = single_exit (loop)->dest;
3031           guard_to = split_edge (single_exit (epilog));
3032           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3033                                            skip_vector ? anchor : guard_bb,
3034                                            prob_epilog.invert (),
3035                                            irred_flag);
3036           if (vect_epilogues)
3037             epilogue_vinfo->skip_this_loop_edge = guard_e;
3038           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3039                                               single_exit (epilog));
3040           /* Only need to handle basic block before epilog loop if it's not
3041              the guard_bb, which is the case when skip_vector is true.  */
3042           if (guard_bb != bb_before_epilog)
3043             {
3044               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3045
3046               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3047             }
3048           scale_loop_profile (epilog, prob_epilog, 0);
3049         }
3050       else
3051         slpeel_update_phi_nodes_for_lcssa (epilog);
3052
3053       unsigned HOST_WIDE_INT bound;
3054       if (bound_scalar.is_constant (&bound))
3055         {
3056           gcc_assert (bound != 0);
3057           /* -1 to convert loop iterations to latch iterations.  */
3058           record_niter_bound (epilog, bound - 1, false, true);
3059         }
3060
3061       delete_update_ssa ();
3062       adjust_vec_debug_stmts ();
3063       scev_reset ();
3064     }
3065
3066   if (vect_epilogues)
3067     {
3068       epilog->aux = epilogue_vinfo;
3069       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3070
3071       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3072
3073       /* We now must calculate the number of NITERS performed by the previous
3074          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3075       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3076                                  niters_prolog, niters_vector_mult_vf);
3077
3078       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3079          determine whether we are coming from the previous vectorized loop
3080          using the update_e edge or the skip_vector basic block using the
3081          skip_e edge.  */
3082       if (skip_vector)
3083         {
3084           gcc_assert (update_e != NULL
3085                       && skip_e != NULL
3086                       && !vect_epilogues_updated_niters);
3087           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3088                                            update_e->dest);
3089           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3090           gimple *stmt = gimple_build_assign (new_ssa, niters);
3091           gimple_stmt_iterator gsi;
3092           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3093               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3094             {
3095               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3096               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3097             }
3098           else
3099             {
3100               gsi = gsi_last_bb (update_e->src);
3101               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3102             }
3103
3104           niters = new_ssa;
3105           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3106           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3107                        UNKNOWN_LOCATION);
3108           niters = PHI_RESULT (new_phi);
3109           epilogue_vinfo->main_loop_edge = update_e;
3110           epilogue_vinfo->skip_main_loop_edge = skip_e;
3111         }
3112
3113       /* Set ADVANCE to the number of iterations performed by the previous
3114          loop and its prologue.  */
3115       *advance = niters;
3116
3117       if (!vect_epilogues_updated_niters)
3118         {
3119           /* Subtract the number of iterations performed by the vectorized loop
3120              from the number of total iterations.  */
3121           tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3122                                               before_loop_niters,
3123                                               niters);
3124
3125           LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3126           LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3127             = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3128                            epilogue_niters,
3129                            build_one_cst (TREE_TYPE (epilogue_niters)));
3130
3131           /* Decide what to do if the number of epilogue iterations is not
3132              a multiple of the epilogue loop's vectorization factor.
3133              We should have rejected the loop during the analysis phase
3134              if this fails.  */
3135           if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3136                                                            true))
3137             gcc_unreachable ();
3138         }
3139     }
3140
3141   adjust_vec.release ();
3142   free_original_copy_tables ();
3143
3144   return vect_epilogues ? epilog : NULL;
3145 }
3146
3147 /* Function vect_create_cond_for_niters_checks.
3148
3149    Create a conditional expression that represents the run-time checks for
3150    loop's niter.  The loop is guaranteed to terminate if the run-time
3151    checks hold.
3152
3153    Input:
3154    COND_EXPR  - input conditional expression.  New conditions will be chained
3155                 with logical AND operation.  If it is NULL, then the function
3156                 is used to return the number of alias checks.
3157    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3158                 to be checked.
3159
3160    Output:
3161    COND_EXPR - conditional expression.
3162
3163    The returned COND_EXPR is the conditional expression to be used in the
3164    if statement that controls which version of the loop gets executed at
3165    runtime.  */
3166
3167 static void
3168 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3169 {
3170   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3171
3172   if (*cond_expr)
3173     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3174                               *cond_expr, part_cond_expr);
3175   else
3176     *cond_expr = part_cond_expr;
3177 }
3178
3179 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3180    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3181
3182 static void
3183 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3184 {
3185   if (*cond_expr)
3186     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3187                               *cond_expr, part_cond_expr);
3188   else
3189     *cond_expr = part_cond_expr;
3190 }
3191
3192 /* Function vect_create_cond_for_align_checks.
3193
3194    Create a conditional expression that represents the alignment checks for
3195    all of data references (array element references) whose alignment must be
3196    checked at runtime.
3197
3198    Input:
3199    COND_EXPR  - input conditional expression.  New conditions will be chained
3200                 with logical AND operation.
3201    LOOP_VINFO - two fields of the loop information are used.
3202                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3203                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3204
3205    Output:
3206    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3207                          expression.
3208    The returned value is the conditional expression to be used in the if
3209    statement that controls which version of the loop gets executed at runtime.
3210
3211    The algorithm makes two assumptions:
3212      1) The number of bytes "n" in a vector is a power of 2.
3213      2) An address "a" is aligned if a%n is zero and that this
3214         test can be done as a&(n-1) == 0.  For example, for 16
3215         byte vectors the test is a&0xf == 0.  */
3216
3217 static void
3218 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3219                                    tree *cond_expr,
3220                                    gimple_seq *cond_expr_stmt_list)
3221 {
3222   const vec<stmt_vec_info> &may_misalign_stmts
3223     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3224   stmt_vec_info stmt_info;
3225   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3226   tree mask_cst;
3227   unsigned int i;
3228   tree int_ptrsize_type;
3229   char tmp_name[20];
3230   tree or_tmp_name = NULL_TREE;
3231   tree and_tmp_name;
3232   gimple *and_stmt;
3233   tree ptrsize_zero;
3234   tree part_cond_expr;
3235
3236   /* Check that mask is one less than a power of 2, i.e., mask is
3237      all zeros followed by all ones.  */
3238   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3239
3240   int_ptrsize_type = signed_type_for (ptr_type_node);
3241
3242   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3243      of the first vector of the i'th data reference. */
3244
3245   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3246     {
3247       gimple_seq new_stmt_list = NULL;
3248       tree addr_base;
3249       tree addr_tmp_name;
3250       tree new_or_tmp_name;
3251       gimple *addr_stmt, *or_stmt;
3252       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3253       bool negative = tree_int_cst_compare
3254         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3255       tree offset = negative
3256         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3257                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3258         : size_zero_node;
3259
3260       /* create: addr_tmp = (int)(address_of_first_vector) */
3261       addr_base =
3262         vect_create_addr_base_for_vector_ref (loop_vinfo,
3263                                               stmt_info, &new_stmt_list,
3264                                               offset);
3265       if (new_stmt_list != NULL)
3266         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3267
3268       sprintf (tmp_name, "addr2int%d", i);
3269       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3270       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3271       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3272
3273       /* The addresses are OR together.  */
3274
3275       if (or_tmp_name != NULL_TREE)
3276         {
3277           /* create: or_tmp = or_tmp | addr_tmp */
3278           sprintf (tmp_name, "orptrs%d", i);
3279           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3280           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3281                                          or_tmp_name, addr_tmp_name);
3282           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3283           or_tmp_name = new_or_tmp_name;
3284         }
3285       else
3286         or_tmp_name = addr_tmp_name;
3287
3288     } /* end for i */
3289
3290   mask_cst = build_int_cst (int_ptrsize_type, mask);
3291
3292   /* create: and_tmp = or_tmp & mask  */
3293   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3294
3295   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3296                                   or_tmp_name, mask_cst);
3297   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3298
3299   /* Make and_tmp the left operand of the conditional test against zero.
3300      if and_tmp has a nonzero bit then some address is unaligned.  */
3301   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3302   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3303                                 and_tmp_name, ptrsize_zero);
3304   chain_cond_expr (cond_expr, part_cond_expr);
3305 }
3306
3307 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3308    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3309    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3310    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3311
3312 static void
3313 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3314 {
3315   const vec<vec_object_pair> &pairs
3316     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3317   unsigned int i;
3318   vec_object_pair *pair;
3319   FOR_EACH_VEC_ELT (pairs, i, pair)
3320     {
3321       tree addr1 = build_fold_addr_expr (pair->first);
3322       tree addr2 = build_fold_addr_expr (pair->second);
3323       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3324                                          addr1, addr2);
3325       chain_cond_expr (cond_expr, part_cond_expr);
3326     }
3327 }
3328
3329 /* Create an expression that is true when all lower-bound conditions for
3330    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3331
3332 static void
3333 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3334 {
3335   const vec<vec_lower_bound> &lower_bounds
3336     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3337   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3338     {
3339       tree expr = lower_bounds[i].expr;
3340       tree type = unsigned_type_for (TREE_TYPE (expr));
3341       expr = fold_convert (type, expr);
3342       poly_uint64 bound = lower_bounds[i].min_value;
3343       if (!lower_bounds[i].unsigned_p)
3344         {
3345           expr = fold_build2 (PLUS_EXPR, type, expr,
3346                               build_int_cstu (type, bound - 1));
3347           bound += bound - 1;
3348         }
3349       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3350                                          build_int_cstu (type, bound));
3351       chain_cond_expr (cond_expr, part_cond_expr);
3352     }
3353 }
3354
3355 /* Function vect_create_cond_for_alias_checks.
3356
3357    Create a conditional expression that represents the run-time checks for
3358    overlapping of address ranges represented by a list of data references
3359    relations passed as input.
3360
3361    Input:
3362    COND_EXPR  - input conditional expression.  New conditions will be chained
3363                 with logical AND operation.  If it is NULL, then the function
3364                 is used to return the number of alias checks.
3365    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3366                 to be checked.
3367
3368    Output:
3369    COND_EXPR - conditional expression.
3370
3371    The returned COND_EXPR is the conditional expression to be used in the if
3372    statement that controls which version of the loop gets executed at runtime.
3373 */
3374
3375 void
3376 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3377 {
3378   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3379     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3380
3381   if (comp_alias_ddrs.is_empty ())
3382     return;
3383
3384   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3385                                &comp_alias_ddrs, cond_expr);
3386   if (dump_enabled_p ())
3387     dump_printf_loc (MSG_NOTE, vect_location,
3388                      "created %u versioning for alias checks.\n",
3389                      comp_alias_ddrs.length ());
3390 }
3391
3392
3393 /* Function vect_loop_versioning.
3394
3395    If the loop has data references that may or may not be aligned or/and
3396    has data reference relations whose independence was not proven then
3397    two versions of the loop need to be generated, one which is vectorized
3398    and one which isn't.  A test is then generated to control which of the
3399    loops is executed.  The test checks for the alignment of all of the
3400    data references that may or may not be aligned.  An additional
3401    sequence of runtime tests is generated for each pairs of DDRs whose
3402    independence was not proven.  The vectorized version of loop is
3403    executed only if both alias and alignment tests are passed.
3404
3405    The test generated to check which version of loop is executed
3406    is modified to also check for profitability as indicated by the
3407    cost model threshold TH.
3408
3409    The versioning precondition(s) are placed in *COND_EXPR and
3410    *COND_EXPR_STMT_LIST.  */
3411
3412 class loop *
3413 vect_loop_versioning (loop_vec_info loop_vinfo,
3414                       gimple *loop_vectorized_call)
3415 {
3416   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3417   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3418   basic_block condition_bb;
3419   gphi_iterator gsi;
3420   gimple_stmt_iterator cond_exp_gsi;
3421   basic_block merge_bb;
3422   basic_block new_exit_bb;
3423   edge new_exit_e, e;
3424   gphi *orig_phi, *new_phi;
3425   tree cond_expr = NULL_TREE;
3426   gimple_seq cond_expr_stmt_list = NULL;
3427   tree arg;
3428   profile_probability prob = profile_probability::likely ();
3429   gimple_seq gimplify_stmt_list = NULL;
3430   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3431   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3432   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3433   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3434   poly_uint64 versioning_threshold
3435     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3436   tree version_simd_if_cond
3437     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3438   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3439
3440   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3441       && !ordered_p (th, versioning_threshold))
3442     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3443                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3444                                             th - 1));
3445   if (maybe_ne (versioning_threshold, 0U))
3446     {
3447       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3448                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3449                                               versioning_threshold - 1));
3450       if (cond_expr)
3451         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3452                                  expr, cond_expr);
3453       else
3454         cond_expr = expr;
3455     }
3456
3457   tree cost_name = NULL_TREE;
3458   profile_probability prob2 = profile_probability::uninitialized ();
3459   if (cond_expr
3460       && !integer_truep (cond_expr)
3461       && (version_niter
3462           || version_align
3463           || version_alias
3464           || version_simd_if_cond))
3465     {
3466       cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3467                                                       &cond_expr_stmt_list,
3468                                                       is_gimple_val, NULL_TREE);
3469       /* Split prob () into two so that the overall probability of passing
3470          both the cost-model and versioning checks is the orig prob.  */
3471       prob2 = prob.split (prob);
3472     }
3473
3474   if (version_niter)
3475     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3476
3477   if (cond_expr)
3478     {
3479       gimple_seq tem = NULL;
3480       cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3481                                           &tem, is_gimple_condexpr_for_cond,
3482                                           NULL_TREE);
3483       gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3484     }
3485
3486   if (version_align)
3487     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3488                                        &cond_expr_stmt_list);
3489
3490   if (version_alias)
3491     {
3492       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3493       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3494       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3495     }
3496
3497   if (version_simd_if_cond)
3498     {
3499       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3500       if (flag_checking)
3501         if (basic_block bb
3502             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3503           gcc_assert (bb != loop->header
3504                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3505                       && (scalar_loop == NULL
3506                           || (bb != scalar_loop->header
3507                               && dominated_by_p (CDI_DOMINATORS,
3508                                                  scalar_loop->header, bb))));
3509       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3510       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3511                             version_simd_if_cond, zero);
3512       if (cond_expr)
3513         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3514                                  c, cond_expr);
3515       else
3516         cond_expr = c;
3517       if (dump_enabled_p ())
3518         dump_printf_loc (MSG_NOTE, vect_location,
3519                          "created versioning for simd if condition check.\n");
3520     }
3521
3522   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3523                                       &gimplify_stmt_list,
3524                                       is_gimple_condexpr_for_cond, NULL_TREE);
3525   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3526
3527   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3528      invariant in.  */
3529   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3530   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3531        !gsi_end_p (gsi); gsi_next (&gsi))
3532     {
3533       gimple *stmt = gsi_stmt (gsi);
3534       update_stmt (stmt);
3535       ssa_op_iter iter;
3536       use_operand_p use_p;
3537       basic_block def_bb;
3538       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3539         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3540             && flow_bb_inside_loop_p (outermost, def_bb))
3541           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3542     }
3543
3544   /* Search for the outermost loop we can version.  Avoid versioning of
3545      non-perfect nests but allow if-conversion versioned loops inside.  */
3546   class loop *loop_to_version = loop;
3547   if (flow_loop_nested_p (outermost, loop))
3548     {
3549       if (dump_enabled_p ())
3550         dump_printf_loc (MSG_NOTE, vect_location,
3551                          "trying to apply versioning to outer loop %d\n",
3552                          outermost->num);
3553       if (outermost->num == 0)
3554         outermost = superloop_at_depth (loop, 1);
3555       /* And avoid applying versioning on non-perfect nests.  */
3556       while (loop_to_version != outermost
3557              && (e = single_exit (loop_outer (loop_to_version)))
3558              && !(e->flags & EDGE_COMPLEX)
3559              && (!loop_outer (loop_to_version)->inner->next
3560                  || vect_loop_vectorized_call (loop_to_version))
3561              && (!loop_outer (loop_to_version)->inner->next
3562                  || !loop_outer (loop_to_version)->inner->next->next))
3563         loop_to_version = loop_outer (loop_to_version);
3564     }
3565
3566   /* Apply versioning.  If there is already a scalar version created by
3567      if-conversion re-use that.  Note we cannot re-use the copy of
3568      an if-converted outer-loop when vectorizing the inner loop only.  */
3569   gcond *cond;
3570   if ((!loop_to_version->inner || loop == loop_to_version)
3571       && loop_vectorized_call)
3572     {
3573       gcc_assert (scalar_loop);
3574       condition_bb = gimple_bb (loop_vectorized_call);
3575       cond = as_a <gcond *> (last_stmt (condition_bb));
3576       gimple_cond_set_condition_from_tree (cond, cond_expr);
3577       update_stmt (cond);
3578
3579       if (cond_expr_stmt_list)
3580         {
3581           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3582           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3583                                  GSI_SAME_STMT);
3584         }
3585
3586       /* if-conversion uses profile_probability::always () for both paths,
3587          reset the paths probabilities appropriately.  */
3588       edge te, fe;
3589       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3590       te->probability = prob;
3591       fe->probability = prob.invert ();
3592       /* We can scale loops counts immediately but have to postpone
3593          scaling the scalar loop because we re-use it during peeling.  */
3594       scale_loop_frequencies (loop_to_version, te->probability);
3595       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3596
3597       nloop = scalar_loop;
3598       if (dump_enabled_p ())
3599         dump_printf_loc (MSG_NOTE, vect_location,
3600                          "reusing %sloop version created by if conversion\n",
3601                          loop_to_version != loop ? "outer " : "");
3602     }
3603   else
3604     {
3605       if (loop_to_version != loop
3606           && dump_enabled_p ())
3607         dump_printf_loc (MSG_NOTE, vect_location,
3608                          "applying loop versioning to outer loop %d\n",
3609                          loop_to_version->num);
3610
3611       unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3612
3613       initialize_original_copy_tables ();
3614       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3615                             prob, prob.invert (), prob, prob.invert (), true);
3616       gcc_assert (nloop);
3617       nloop = get_loop_copy (loop);
3618
3619       /* For cycle vectorization with SLP we rely on the PHI arguments
3620          appearing in the same order as the SLP node operands which for the
3621          loop PHI nodes means the preheader edge dest index needs to remain
3622          the same for the analyzed loop which also becomes the vectorized one.
3623          Make it so in case the state after versioning differs by redirecting
3624          the first edge into the header to the same destination which moves
3625          it last.  */
3626       if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3627         {
3628           edge e = EDGE_PRED (loop->header, 0);
3629           ssa_redirect_edge (e, e->dest);
3630           flush_pending_stmts (e);
3631         }
3632       gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3633
3634       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3635          reap those otherwise;  they also refer to the original
3636          loops.  */
3637       class loop *l = loop;
3638       while (gimple *call = vect_loop_vectorized_call (l))
3639         {
3640           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3641           fold_loop_internal_call (call, boolean_false_node);
3642           l = loop_outer (l);
3643         }
3644       free_original_copy_tables ();
3645
3646       if (cond_expr_stmt_list)
3647         {
3648           cond_exp_gsi = gsi_last_bb (condition_bb);
3649           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3650                                  GSI_SAME_STMT);
3651         }
3652
3653       /* Loop versioning violates an assumption we try to maintain during
3654          vectorization - that the loop exit block has a single predecessor.
3655          After versioning, the exit block of both loop versions is the same
3656          basic block (i.e. it has two predecessors). Just in order to simplify
3657          following transformations in the vectorizer, we fix this situation
3658          here by adding a new (empty) block on the exit-edge of the loop,
3659          with the proper loop-exit phis to maintain loop-closed-form.
3660          If loop versioning wasn't done from loop, but scalar_loop instead,
3661          merge_bb will have already just a single successor.  */
3662
3663       merge_bb = single_exit (loop_to_version)->dest;
3664       if (EDGE_COUNT (merge_bb->preds) >= 2)
3665         {
3666           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3667           new_exit_bb = split_edge (single_exit (loop_to_version));
3668           new_exit_e = single_exit (loop_to_version);
3669           e = EDGE_SUCC (new_exit_bb, 0);
3670
3671           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3672                gsi_next (&gsi))
3673             {
3674               tree new_res;
3675               orig_phi = gsi.phi ();
3676               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3677               new_phi = create_phi_node (new_res, new_exit_bb);
3678               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3679               add_phi_arg (new_phi, arg, new_exit_e,
3680                            gimple_phi_arg_location_from_edge (orig_phi, e));
3681               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3682             }
3683         }
3684
3685       update_ssa (TODO_update_ssa);
3686     }
3687
3688   /* Split the cost model check off to a separate BB.  Costing assumes
3689      this is the only thing we perform when we enter the scalar loop
3690      from a failed cost decision.  */
3691   if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3692     {
3693       gimple *def = SSA_NAME_DEF_STMT (cost_name);
3694       /* All uses of the cost check are 'true' after the check we
3695          are going to insert.  */
3696       replace_uses_by (cost_name, boolean_true_node);
3697       /* And we're going to build the new single use of it.  */
3698       gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3699                                        NULL_TREE, NULL_TREE);
3700       edge e = split_block (gimple_bb (def), def);
3701       gimple_stmt_iterator gsi = gsi_for_stmt (def);
3702       gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3703       edge true_e, false_e;
3704       extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3705       e->flags &= ~EDGE_FALLTHRU;
3706       e->flags |= EDGE_TRUE_VALUE;
3707       edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3708       e->probability = prob2;
3709       e2->probability = prob2.invert ();
3710       set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3711       auto_vec<basic_block, 3> adj;
3712       for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3713            son;
3714            son = next_dom_son (CDI_DOMINATORS, son))
3715         if (EDGE_COUNT (son->preds) > 1)
3716           adj.safe_push (son);
3717       for (auto son : adj)
3718         set_immediate_dominator (CDI_DOMINATORS, son, e->src);
3719     }
3720
3721   if (version_niter)
3722     {
3723       /* The versioned loop could be infinite, we need to clear existing
3724          niter information which is copied from the original loop.  */
3725       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3726       vect_free_loop_info_assumptions (nloop);
3727     }
3728
3729   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3730       && dump_enabled_p ())
3731     {
3732       if (version_alias)
3733         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3734                          vect_location,
3735                          "loop versioned for vectorization because of "
3736                          "possible aliasing\n");
3737       if (version_align)
3738         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3739                          vect_location,
3740                          "loop versioned for vectorization to enhance "
3741                          "alignment\n");
3742
3743     }
3744
3745   return nloop;
3746 }