gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
54 /*************************************************************************
55 Simple Loop Peeling Utilities
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
61 /* Renames the use *OP_P. */
63 static void
64 rename_use_op (use_operand_p op_p)
66 tree new_name;
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
71 new_name = get_current_def (USE_FROM_PTR (op_p));
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
77 /* An ordinary ssa name defined in the loop. */
79 SET_USE (op_p, new_name);
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
98 if (rename_from_outer_loop)
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
112 FOR_EACH_EDGE (e, ei, bb->preds)
114 if (!flow_bb_inside_loop_p (loop, e->src))
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
120 if (outer_loop->inner->next)
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
138 struct adjust_info
140 tree from, to;
141 basic_block bb;
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
172 use_operand_p use_p;
173 basic_block bbuse;
175 if (!is_gimple_debug (stmt))
176 continue;
178 gcc_assert (gimple_debug_bind_p (stmt));
180 bbuse = gimple_bb (stmt);
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
199 /* Adjust debug stmts as scheduled before. */
201 static void
202 adjust_vec_debug_stmts (void)
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
207 gcc_assert (adjust_vec.exists ());
209 while (!adjust_vec.is_empty ())
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
224 adjust_info ai;
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
272 /* Add SEQ to the end of LOOP's preheader block. */
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
277 if (seq)
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
285 /* Add SEQ to the beginning of LOOP's header block. */
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
290 if (seq)
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
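/* As an illustration (hypothetical values, not tied to a particular
   target): for an 8-element vector type, OFFSET == 0 produces the selector
   { 0, 8, 1, 9, 2, 10, 3, 11 }, i.e. the low halves of the two inputs
   interleaved, while OFFSET == 1 produces { 4, 12, 5, 13, 6, 14, 7, 15 }
   for the high halves.  */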
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
316 *indices);
319 /* Try to use permutes to define the masks in DEST_RGM using the masks
320 in SRC_RGM, given that the former has twice as many masks as the
321 latter. Return true on success, adding any new statements to SEQ. */
323 static bool
324 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
325 rgroup_controls *src_rgm)
327 tree src_masktype = src_rgm->type;
328 tree dest_masktype = dest_rgm->type;
329 machine_mode src_mode = TYPE_MODE (src_masktype);
330 insn_code icode1, icode2;
331 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
332 && (icode1 = optab_handler (vec_unpacku_hi_optab,
333 src_mode)) != CODE_FOR_nothing
334 && (icode2 = optab_handler (vec_unpacku_lo_optab,
335 src_mode)) != CODE_FOR_nothing)
337 /* Unpacking the source masks gives at least as many mask bits as
338 we need. We can then VIEW_CONVERT any excess bits away. */
339 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
340 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
341 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
342 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
344 tree src = src_rgm->controls[i / 2];
345 tree dest = dest_rgm->controls[i];
346 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
347 ? VEC_UNPACK_HI_EXPR
348 : VEC_UNPACK_LO_EXPR);
349 gassign *stmt;
350 if (dest_masktype == unpack_masktype)
351 stmt = gimple_build_assign (dest, code, src);
352 else
354 tree temp = make_ssa_name (unpack_masktype);
355 stmt = gimple_build_assign (temp, code, src);
356 gimple_seq_add_stmt (seq, stmt);
357 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
358 build1 (VIEW_CONVERT_EXPR,
359 dest_masktype, temp));
361 gimple_seq_add_stmt (seq, stmt);
363 return true;
365 vec_perm_indices indices[2];
366 if (dest_masktype == src_masktype
367 && interleave_supported_p (&indices[0], src_masktype, 0)
368 && interleave_supported_p (&indices[1], src_masktype, 1))
370 /* The destination requires twice as many mask bits as the source, so
371 we can use interleaving permutes to double up the number of bits. */
372 tree masks[2];
373 for (unsigned int i = 0; i < 2; ++i)
374 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
375 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
377 tree src = src_rgm->controls[i / 2];
378 tree dest = dest_rgm->controls[i];
379 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
380 src, src, masks[i & 1]);
381 gimple_seq_add_stmt (seq, stmt);
383 return true;
385 return false;
388 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
389 for all the rgroup controls in RGC and return a control that is nonzero
390 when the loop needs to iterate. Add any new preheader statements to
391 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
393 RGC belongs to loop LOOP. The loop originally iterated NITERS
394 times and has been vectorized according to LOOP_VINFO.
396 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
397 starts with NITERS_SKIP dummy iterations of the scalar loop before
398 the real work starts. The mask elements for these dummy iterations
399 must be 0, to ensure that the extra iterations do not have an effect.
401 It is known that:
403 NITERS * RGC->max_nscalars_per_iter * RGC->factor
405 does not overflow. However, MIGHT_WRAP_P says whether an induction
406 variable that starts at 0 and has step:
408 VF * RGC->max_nscalars_per_iter * RGC->factor
410 might overflow before hitting a value above:
412 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
414 This means that we cannot guarantee that such an induction variable
415 would ever hit a value that produces a set of all-false masks or zero
416 lengths for RGC.
418 Note: the cost of the code generated by this function is modeled
419 by vect_estimate_min_profitable_iters, so changes here may need
420 corresponding changes there. */
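/* For a concrete but hypothetical example: an rgroup whose controls are
   8-element masks, with RGC->max_nscalars_per_iter == 2 and
   RGC->factor == 1, accounts for 2 items per scalar iteration; the IV
   created below then steps by 2 * VF items per vector iteration and each
   control covers 8 items.  */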
422 static tree
423 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
424 gimple_seq *preheader_seq,
425 gimple_seq *header_seq,
426 gimple_stmt_iterator loop_cond_gsi,
427 rgroup_controls *rgc, tree niters,
428 tree niters_skip, bool might_wrap_p)
430 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
431 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
432 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
434 tree ctrl_type = rgc->type;
435 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
436 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
437 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
438 tree length_limit = NULL_TREE;
439 /* For length-based controls, we need length_limit to ensure the length stays in range. */
440 if (!use_masks_p)
441 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
443 /* Calculate the maximum number of item values that the rgroup
444 handles in total, the number that it handles for each iteration
445 of the vector loop, and the number that it should skip during the
446 first iteration of the vector loop. */
447 tree nitems_total = niters;
448 tree nitems_step = build_int_cst (iv_type, vf);
449 tree nitems_skip = niters_skip;
450 if (nitems_per_iter != 1)
452 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
453 these multiplications don't overflow. */
454 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
455 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
456 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
457 nitems_total, compare_factor);
458 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
459 nitems_step, iv_factor);
460 if (nitems_skip)
461 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
462 nitems_skip, compare_factor);
465 /* Create an induction variable that counts the number of items
466 processed. */
467 tree index_before_incr, index_after_incr;
468 gimple_stmt_iterator incr_gsi;
469 bool insert_after;
470 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
471 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
472 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
474 tree zero_index = build_int_cst (compare_type, 0);
475 tree test_index, test_limit, first_limit;
476 gimple_stmt_iterator *test_gsi;
477 if (might_wrap_p)
479 /* In principle the loop should stop iterating once the incremented
480 IV reaches a value greater than or equal to:
482 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
484 However, there's no guarantee that this addition doesn't overflow
485 the comparison type, or that the IV hits a value above it before
486 wrapping around. We therefore adjust the limit down by one
487 IV step:
489 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
490 -[infinite-prec] NITEMS_STEP
492 and compare the IV against this limit _before_ incrementing it.
493 Since the comparison type is unsigned, we actually want the
494 subtraction to saturate at zero:
496 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
497 -[sat] NITEMS_STEP
499 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
501 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
503 where the rightmost subtraction can be done directly in
504 COMPARE_TYPE. */
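/* For example, with hypothetical values NITEMS_TOTAL == 3, NITEMS_STEP == 8
   and NITEMS_SKIP == 2, the adjustment is 8 - 2 == 6 and the limit becomes
   MAX (3, 6) - 6 == 0: the not-yet-incremented IV (still 0) is already at
   the limit, so the mask computed for the second iteration is all-false and
   the loop stops after a single vector iteration, which is correct since
   all 3 + 2 == 5 items fit in one step of 8.  */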
505 test_index = index_before_incr;
506 tree adjust = gimple_convert (preheader_seq, compare_type,
507 nitems_step);
508 if (nitems_skip)
509 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
510 adjust, nitems_skip);
511 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
512 nitems_total, adjust);
513 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
514 test_limit, adjust);
515 test_gsi = &incr_gsi;
517 /* Get a safe limit for the first iteration. */
518 if (nitems_skip)
520 /* The first vector iteration can handle at most NITEMS_STEP
521 items. NITEMS_STEP <= CONST_LIMIT, and adding
522 NITEMS_SKIP to that cannot overflow. */
523 tree const_limit = build_int_cst (compare_type,
524 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
525 * nitems_per_iter);
526 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
527 nitems_total, const_limit);
528 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
529 first_limit, nitems_skip);
531 else
532 /* For the first iteration it doesn't matter whether the IV hits
533 a value above NITEMS_TOTAL. That only matters for the latch
534 condition. */
535 first_limit = nitems_total;
537 else
539 /* Test the incremented IV, which will always hit a value above
540 the bound before wrapping. */
541 test_index = index_after_incr;
542 test_limit = nitems_total;
543 if (nitems_skip)
544 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
545 test_limit, nitems_skip);
546 test_gsi = &loop_cond_gsi;
548 first_limit = test_limit;
551 /* Convert the IV value to the comparison type (either a no-op or
552 a demotion). */
553 gimple_seq test_seq = NULL;
554 test_index = gimple_convert (&test_seq, compare_type, test_index);
555 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
557 /* Provide a definition of each control in the group. */
558 tree next_ctrl = NULL_TREE;
559 tree ctrl;
560 unsigned int i;
561 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
563 /* Previous controls will cover BIAS items. This control covers the
564 next batch. */
565 poly_uint64 bias = nitems_per_ctrl * i;
566 tree bias_tree = build_int_cst (compare_type, bias);
568 /* See whether the first iteration of the vector loop is known
569 to have a full control. */
570 poly_uint64 const_limit;
571 bool first_iteration_full
572 = (poly_int_tree_p (first_limit, &const_limit)
573 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
575 /* Rather than have a new IV that starts at BIAS and goes up to
576 TEST_LIMIT, prefer to use the same 0-based IV for each control
577 and adjust the bound down by BIAS. */
578 tree this_test_limit = test_limit;
579 if (i != 0)
581 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
582 compare_type, this_test_limit,
583 bias_tree);
584 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
585 compare_type, this_test_limit,
586 bias_tree);
589 /* Create the initial control. First include all items that
590 are within the loop limit. */
591 tree init_ctrl = NULL_TREE;
592 if (!first_iteration_full)
594 tree start, end;
595 if (first_limit == test_limit)
597 /* Use a natural test between zero (the initial IV value)
598 and the loop limit. The "else" block would be valid too,
599 but this choice can avoid the need to load BIAS_TREE into
600 a register. */
601 start = zero_index;
602 end = this_test_limit;
604 else
606 /* FIRST_LIMIT is the maximum number of items handled by the
607 first iteration of the vector loop. Test the portion
608 associated with this control. */
609 start = bias_tree;
610 end = first_limit;
613 if (use_masks_p)
614 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
615 start, end, "max_mask");
616 else
618 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
619 gimple_seq seq = vect_gen_len (init_ctrl, start,
620 end, length_limit);
621 gimple_seq_add_seq (preheader_seq, seq);
625 /* Now AND out the bits that are within the number of skipped
626 items. */
627 poly_uint64 const_skip;
628 if (nitems_skip
629 && !(poly_int_tree_p (nitems_skip, &const_skip)
630 && known_le (const_skip, bias)))
632 gcc_assert (use_masks_p);
633 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
634 bias_tree, nitems_skip);
635 if (init_ctrl)
636 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
637 init_ctrl, unskipped_mask);
638 else
639 init_ctrl = unskipped_mask;
642 if (!init_ctrl)
644 /* First iteration is full. */
645 if (use_masks_p)
646 init_ctrl = build_minus_one_cst (ctrl_type);
647 else
648 init_ctrl = length_limit;
651 /* Get the control value for the next iteration of the loop. */
652 if (use_masks_p)
654 gimple_seq stmts = NULL;
655 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
656 this_test_limit, "next_mask");
657 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
659 else
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
670 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
671 if (partial_load_bias != 0)
673 tree adjusted_len = rgc->bias_adjusted_ctrl;
674 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
675 rgc->controls[0],
676 build_int_cst
677 (TREE_TYPE (rgc->controls[0]),
678 partial_load_bias));
679 gimple_seq_add_stmt (header_seq, minus);
682 return next_ctrl;
685 /* Set up the iteration condition and rgroup controls for LOOP, given
686 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
687 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
688 the number of iterations of the original scalar loop that should be
689 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
690 for vect_set_loop_condition.
692 Insert the branch-back condition before LOOP_COND_GSI and return the
693 final gcond. */
695 static gcond *
696 vect_set_loop_condition_partial_vectors (class loop *loop,
697 loop_vec_info loop_vinfo, tree niters,
698 tree final_iv, bool niters_maybe_zero,
699 gimple_stmt_iterator loop_cond_gsi)
701 gimple_seq preheader_seq = NULL;
702 gimple_seq header_seq = NULL;
704 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
705 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
706 unsigned int compare_precision = TYPE_PRECISION (compare_type);
707 tree orig_niters = niters;
709 /* Type of the initial value of NITERS. */
710 tree ni_actual_type = TREE_TYPE (niters);
711 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
712 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
714 /* Convert NITERS to the same size as the compare. */
715 if (compare_precision > ni_actual_precision
716 && niters_maybe_zero)
718 /* We know that there is always at least one iteration, so if the
719 count is zero then it must have wrapped. Cope with this by
720 subtracting 1 before the conversion and adding 1 to the result. */
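/* For example, if NITERS has a 32-bit unsigned type, a value of 0 really
   means 1 << 32 iterations; with a 64-bit COMPARE_TYPE the code below
   computes (0 + 0xffffffff) == 0xffffffff in 32 bits, converts it to
   64 bits and adds 1, giving 0x100000000 as required.  */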
721 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
722 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
723 niters, build_minus_one_cst (ni_actual_type));
724 niters = gimple_convert (&preheader_seq, compare_type, niters);
725 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
726 niters, build_one_cst (compare_type));
728 else
729 niters = gimple_convert (&preheader_seq, compare_type, niters);
731 /* Iterate over all the rgroups and fill in their controls. We could use
732 the first control from any rgroup for the loop condition; here we
733 arbitrarily pick the last. */
734 tree test_ctrl = NULL_TREE;
735 rgroup_controls *rgc;
736 unsigned int i;
737 auto_vec<rgroup_controls> *controls = use_masks_p
738 ? &LOOP_VINFO_MASKS (loop_vinfo)
739 : &LOOP_VINFO_LENS (loop_vinfo);
740 FOR_EACH_VEC_ELT (*controls, i, rgc)
741 if (!rgc->controls.is_empty ())
743 /* First try using permutes. This adds a single vector
744 instruction to the loop for each mask, but needs no extra
745 loop invariants or IVs. */
746 unsigned int nmasks = i + 1;
747 if (use_masks_p && (nmasks & 1) == 0)
749 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
750 if (!half_rgc->controls.is_empty ()
751 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
752 continue;
755 /* See whether zero-based IV would ever generate all-false masks
756 or zero length before wrapping around. */
757 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
759 /* Set up all controls for this group. */
760 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
761 &preheader_seq,
762 &header_seq,
763 loop_cond_gsi, rgc,
764 niters, niters_skip,
765 might_wrap_p);
768 /* Emit all accumulated statements. */
769 add_preheader_seq (loop, preheader_seq);
770 add_header_seq (loop, header_seq);
772 /* Get a boolean result that tells us whether to iterate. */
773 edge exit_edge = single_exit (loop);
774 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
775 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
776 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
777 NULL_TREE, NULL_TREE);
778 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
780 /* The loop iterates (NITERS - 1) / VF + 1 times.
781 Subtract one from this to get the latch count. */
782 tree step = build_int_cst (compare_type,
783 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
784 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
785 build_minus_one_cst (compare_type));
786 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
787 niters_minus_one, step);
789 if (final_iv)
791 gassign *assign = gimple_build_assign (final_iv, orig_niters);
792 gsi_insert_on_edge_immediate (single_exit (loop), assign);
795 return cond_stmt;
798 /* Like vect_set_loop_condition, but handle the case in which the vector
799 loop handles exactly VF scalars per iteration. */
801 static gcond *
802 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
803 tree final_iv, bool niters_maybe_zero,
804 gimple_stmt_iterator loop_cond_gsi)
806 tree indx_before_incr, indx_after_incr;
807 gcond *cond_stmt;
808 gcond *orig_cond;
809 edge pe = loop_preheader_edge (loop);
810 edge exit_edge = single_exit (loop);
811 gimple_stmt_iterator incr_gsi;
812 bool insert_after;
813 enum tree_code code;
814 tree niters_type = TREE_TYPE (niters);
816 orig_cond = get_loop_exit_condition (loop);
817 gcc_assert (orig_cond);
818 loop_cond_gsi = gsi_for_stmt (orig_cond);
820 tree init, limit;
821 if (!niters_maybe_zero && integer_onep (step))
823 /* In this case we can use a simple 0-based IV (case A):
826 x = 0;
830 x += 1;
832 while (x < NITERS); */
833 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
834 init = build_zero_cst (niters_type);
835 limit = niters;
837 else
839 /* The following (case B) works for all values of NITERS except 0:
842 x = 0;
846 x += STEP;
848 while (x <= NITERS - STEP);
850 so that the loop continues to iterate if x + STEP - 1 < NITERS
851 but stops if x + STEP - 1 >= NITERS.
853 However, if NITERS is zero, x never hits a value above NITERS - STEP
854 before wrapping around. There are two obvious ways of dealing with
855 this:
857 - start at STEP - 1 and compare x before incrementing it
858 - start at -1 and compare x after incrementing it
860 The latter is simpler and is what we use (case C). The loop in this
861 case looks like:
864 x = -1;
868 x += STEP;
870 while (x < NITERS - STEP);
872 In both cases the loop limit is NITERS - STEP. */
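/* As a worked example for case B, take NITERS == 10 and STEP == 4: the
   limit is 6 and x takes the values 4 and 8, so the loop executes
   (10 - 4) / 4 + 1 == 2 times.  For case C with a 32-bit NITERS of 0
   (really 1 << 32 iterations), the limit wraps to 0xfffffffc and x takes
   the values 3, 7, ..., 0xffffffff, giving (1 << 32) / 4 executions.  */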
873 gimple_seq seq = NULL;
874 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
875 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
876 if (seq)
878 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
879 gcc_assert (!new_bb);
881 if (niters_maybe_zero)
883 /* Case C. */
884 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
885 init = build_all_ones_cst (niters_type);
887 else
889 /* Case B. */
890 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
891 init = build_zero_cst (niters_type);
895 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
896 create_iv (init, step, NULL_TREE, loop,
897 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
898 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
899 true, NULL_TREE, true,
900 GSI_SAME_STMT);
901 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
902 true, GSI_SAME_STMT);
904 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
905 NULL_TREE);
907 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
909 /* Record the number of latch iterations. */
910 if (limit == niters)
911 /* Case A: the loop iterates NITERS times. Subtract one to get the
912 latch count. */
913 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
914 build_int_cst (niters_type, 1));
915 else
916 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
917 Subtract one from this to get the latch count. */
918 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
919 limit, step);
921 if (final_iv)
923 gassign *assign;
924 edge exit = single_exit (loop);
925 gcc_assert (single_pred_p (exit->dest));
926 tree phi_dest
927 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
928 /* Make sure to maintain LC SSA form here and elide the subtraction
929 if the value is zero. */
930 gphi *phi = create_phi_node (phi_dest, exit->dest);
931 add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
932 if (!integer_zerop (init))
934 assign = gimple_build_assign (final_iv, MINUS_EXPR,
935 phi_dest, init);
936 gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
937 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
941 return cond_stmt;
944 /* If we're using fully-masked loops, make LOOP iterate:
946 N == (NITERS - 1) / STEP + 1
948 times. When NITERS is zero, this is equivalent to making the loop
949 execute (1 << M) / STEP times, where M is the precision of NITERS.
950 NITERS_MAYBE_ZERO is true if this last case might occur.
952 If we're not using fully-masked loops, make LOOP iterate:
954 N == (NITERS - STEP) / STEP + 1
956 times, where NITERS is known to be outside the range [1, STEP - 1].
957 This is equivalent to making the loop execute NITERS / STEP times
958 when NITERS is nonzero and (1 << M) / STEP times otherwise.
959 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
961 If FINAL_IV is nonnull, it is an SSA name that should be set to
962 N * STEP on exit from the loop.
964 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
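/* As an illustration, take NITERS == 10 and STEP == 4: a fully-masked loop
   iterates (10 - 1) / 4 + 1 == 3 times, the last time under a partial mask
   or length, whereas the unmasked form iterates (10 - 4) / 4 + 1 == 2
   times, leaving the remaining scalar iterations to be handled outside
   this loop.  */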
966 void
967 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
968 tree niters, tree step, tree final_iv,
969 bool niters_maybe_zero)
971 gcond *cond_stmt;
972 gcond *orig_cond = get_loop_exit_condition (loop);
973 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
975 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
976 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
977 niters, final_iv,
978 niters_maybe_zero,
979 loop_cond_gsi);
980 else
981 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
982 niters_maybe_zero,
983 loop_cond_gsi);
985 /* Remove old loop exit test. */
986 stmt_vec_info orig_cond_info;
987 if (loop_vinfo
988 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
989 loop_vinfo->remove_stmt (orig_cond_info);
990 else
991 gsi_remove (&loop_cond_gsi, true);
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
995 (gimple *) cond_stmt);
998 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
999 For all PHI arguments in FROM->dest and TO->dest from those
1000 edges, ensure that the TO->dest PHI arguments have their current_def
1001 set to that of the corresponding argument in FROM->dest. */
1003 static void
1004 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1006 gimple_stmt_iterator gsi_from, gsi_to;
1008 for (gsi_from = gsi_start_phis (from->dest),
1009 gsi_to = gsi_start_phis (to->dest);
1010 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1012 gimple *from_phi = gsi_stmt (gsi_from);
1013 gimple *to_phi = gsi_stmt (gsi_to);
1014 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1015 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1016 if (virtual_operand_p (from_arg))
1018 gsi_next (&gsi_from);
1019 continue;
1021 if (virtual_operand_p (to_arg))
1023 gsi_next (&gsi_to);
1024 continue;
1026 if (TREE_CODE (from_arg) != SSA_NAME)
1027 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1028 else if (TREE_CODE (to_arg) == SSA_NAME
1029 && from_arg != to_arg)
1031 if (get_current_def (to_arg) == NULL_TREE)
1033 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1034 TREE_TYPE (get_current_def
1035 (from_arg))));
1036 set_current_def (to_arg, get_current_def (from_arg));
1039 gsi_next (&gsi_from);
1040 gsi_next (&gsi_to);
1043 gphi *from_phi = get_virtual_phi (from->dest);
1044 gphi *to_phi = get_virtual_phi (to->dest);
1045 if (from_phi)
1046 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1047 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1051 /* Given LOOP this function generates a new copy of it and puts it
1052 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1053 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1054 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1055 entry or exit of LOOP. */
1057 class loop *
1058 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1059 class loop *scalar_loop, edge e)
1061 class loop *new_loop;
1062 basic_block *new_bbs, *bbs, *pbbs;
1063 bool at_exit;
1064 bool was_imm_dom;
1065 basic_block exit_dest;
1066 edge exit, new_exit;
1067 bool duplicate_outer_loop = false;
1069 exit = single_exit (loop);
1070 at_exit = (e == exit);
1071 if (!at_exit && e != loop_preheader_edge (loop))
1072 return NULL;
1074 if (scalar_loop == NULL)
1075 scalar_loop = loop;
1077 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1078 pbbs = bbs + 1;
1079 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1080 /* Allow duplication of outer loops. */
1081 if (scalar_loop->inner)
1082 duplicate_outer_loop = true;
1084 /* Generate new loop structure. */
1085 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1086 duplicate_subloops (scalar_loop, new_loop);
1088 exit_dest = exit->dest;
1089 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1090 exit_dest) == loop->header ?
1091 true : false);
1093 /* Also copy the pre-header, this avoids jumping through hoops to
1094 duplicate the loop entry PHI arguments. Create an empty
1095 pre-header unconditionally for this. */
1096 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1097 edge entry_e = single_pred_edge (preheader);
1098 bbs[0] = preheader;
1099 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1101 exit = single_exit (scalar_loop);
1102 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1103 &exit, 1, &new_exit, NULL,
1104 at_exit ? loop->latch : e->src, true);
1105 exit = single_exit (loop);
1106 basic_block new_preheader = new_bbs[0];
1108 /* Before installing PHI arguments make sure that the edges
1109 into them match that of the scalar loop we analyzed. This
1110 makes sure the SLP tree matches up between the main vectorized
1111 loop and the epilogue vectorized copies. */
1112 if (single_succ_edge (preheader)->dest_idx
1113 != single_succ_edge (new_bbs[0])->dest_idx)
1115 basic_block swap_bb = new_bbs[1];
1116 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1117 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1118 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1119 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1121 if (duplicate_outer_loop)
1123 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1124 if (loop_preheader_edge (scalar_loop)->dest_idx
1125 != loop_preheader_edge (new_inner_loop)->dest_idx)
1127 basic_block swap_bb = new_inner_loop->header;
1128 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1129 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1130 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1131 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1135 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1137 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1138 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1139 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1141 if (scalar_loop != loop)
1143 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1144 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1145 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1146 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1147 header) to have current_def set, so copy them over. */
1148 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1149 exit);
1150 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1152 EDGE_SUCC (loop->latch, 0));
1155 if (at_exit) /* Add the loop copy at exit. */
1157 if (scalar_loop != loop)
1159 gphi_iterator gsi;
1160 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1162 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1163 gsi_next (&gsi))
1165 gphi *phi = gsi.phi ();
1166 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1167 location_t orig_locus
1168 = gimple_phi_arg_location_from_edge (phi, e);
1170 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1173 redirect_edge_and_branch_force (e, new_preheader);
1174 flush_pending_stmts (e);
1175 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1176 if (was_imm_dom || duplicate_outer_loop)
1177 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1179 /* And remove the unnecessary forwarder again. Keep the other
1180 one so we have a proper pre-header for the loop at the exit edge. */
1181 redirect_edge_pred (single_succ_edge (preheader),
1182 single_pred (preheader));
1183 delete_basic_block (preheader);
1184 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1185 loop_preheader_edge (scalar_loop)->src);
1187 else /* Add the copy at entry. */
1189 if (scalar_loop != loop)
1191 /* Remove the unnecessary forwarder of scalar_loop again. */
1192 redirect_edge_pred (single_succ_edge (preheader),
1193 single_pred (preheader));
1194 delete_basic_block (preheader);
1195 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1196 loop_preheader_edge (scalar_loop)->src);
1197 preheader = split_edge (loop_preheader_edge (loop));
1198 entry_e = single_pred_edge (preheader);
1201 redirect_edge_and_branch_force (entry_e, new_preheader);
1202 flush_pending_stmts (entry_e);
1203 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1205 redirect_edge_and_branch_force (new_exit, preheader);
1206 flush_pending_stmts (new_exit);
1207 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1209 /* And remove the unnecessary forwarder again. Keep the other
1210 one so we have a proper pre-header for the loop at the exit edge. */
1211 redirect_edge_pred (single_succ_edge (new_preheader),
1212 single_pred (new_preheader));
1213 delete_basic_block (new_preheader);
1214 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1215 loop_preheader_edge (new_loop)->src);
1218 if (scalar_loop != loop)
1220 /* Update new_loop->header PHIs, so that on the preheader
1221 edge they are the ones from loop rather than scalar_loop. */
1222 gphi_iterator gsi_orig, gsi_new;
1223 edge orig_e = loop_preheader_edge (loop);
1224 edge new_e = loop_preheader_edge (new_loop);
1226 for (gsi_orig = gsi_start_phis (loop->header),
1227 gsi_new = gsi_start_phis (new_loop->header);
1228 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1229 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1231 gphi *orig_phi = gsi_orig.phi ();
1232 gphi *new_phi = gsi_new.phi ();
1233 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1234 location_t orig_locus
1235 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1237 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1241 free (new_bbs);
1242 free (bbs);
1244 checking_verify_dominators (CDI_DOMINATORS);
1246 return new_loop;
1250 /* Given the condition expression COND, put it as the last statement of
1251 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1252 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1253 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1254 new edge as irreducible if IRREDUCIBLE_P is true. */
1256 static edge
1257 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1258 basic_block guard_to, basic_block dom_bb,
1259 profile_probability probability, bool irreducible_p)
1261 gimple_stmt_iterator gsi;
1262 edge new_e, enter_e;
1263 gcond *cond_stmt;
1264 gimple_seq gimplify_stmt_list = NULL;
1266 enter_e = EDGE_SUCC (guard_bb, 0);
1267 enter_e->flags &= ~EDGE_FALLTHRU;
1268 enter_e->flags |= EDGE_FALSE_VALUE;
1269 gsi = gsi_last_bb (guard_bb);
1271 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1272 is_gimple_condexpr_for_cond, NULL_TREE);
1273 if (gimplify_stmt_list)
1274 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1276 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1277 gsi = gsi_last_bb (guard_bb);
1278 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1280 /* Add new edge to connect guard block to the merge/loop-exit block. */
1281 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1283 new_e->probability = probability;
1284 if (irreducible_p)
1285 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1287 enter_e->probability = probability.invert ();
1288 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1290 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1291 if (enter_e->dest->loop_father->header == enter_e->dest)
1292 split_edge (enter_e);
1294 return new_e;
1298 /* This function verifies that the following restrictions apply to LOOP:
1299 (1) it consists of exactly 2 basic blocks (header and an empty latch)
1300 for an innermost loop, and of 5 basic blocks for an outer loop.
1301 (2) it is single entry, single exit
1302 (3) its exit condition is the last stmt in the header
1303 (4) E is the entry/exit edge of LOOP.
1306 bool
1307 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1309 edge exit_e = single_exit (loop);
1310 edge entry_e = loop_preheader_edge (loop);
1311 gcond *orig_cond = get_loop_exit_condition (loop);
1312 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1313 unsigned int num_bb = loop->inner? 5 : 2;
1315 /* All loops have an outer scope; the only case in which loop->outer is
1316 NULL is the function itself. */
1317 if (!loop_outer (loop)
1318 || loop->num_nodes != num_bb
1319 || !empty_block_p (loop->latch)
1320 || !single_exit (loop)
1321 /* Verify that new loop exit condition can be trivially modified. */
1322 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1323 || (e != exit_e && e != entry_e))
1324 return false;
1326 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1327 get_loop_body_with_size (loop, bbs, loop->num_nodes);
1328 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1329 free (bbs);
1330 return ret;
1333 /* Function vect_get_loop_location.
1335 Extract the location of the loop in the source code.
1336 If the loop is not well formed for vectorization, an estimated
1337 location is calculated.
1338 Return the loop location on success and NULL if not. */
1340 dump_user_location_t
1341 find_loop_location (class loop *loop)
1343 gimple *stmt = NULL;
1344 basic_block bb;
1345 gimple_stmt_iterator si;
1347 if (!loop)
1348 return dump_user_location_t ();
1350 stmt = get_loop_exit_condition (loop);
1352 if (stmt
1353 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1354 return stmt;
1356 /* If we got here the loop is probably not "well formed",
1357 try to estimate the loop location. */
1359 if (!loop->header)
1360 return dump_user_location_t ();
1362 bb = loop->header;
1364 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1366 stmt = gsi_stmt (si);
1367 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1368 return stmt;
1371 return dump_user_location_t ();
1374 /* Return true if the phi described by STMT_INFO defines an IV of the
1375 loop to be vectorized. */
1377 static bool
1378 iv_phi_p (stmt_vec_info stmt_info)
1380 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1381 if (virtual_operand_p (PHI_RESULT (phi)))
1382 return false;
1384 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1385 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1386 return false;
1388 return true;
1391 /* Return true if the vectorizer can peel for a nonlinear IV. */
1392 static bool
1393 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
1394 enum vect_induction_op_type induction_type)
1396 tree niters_skip;
1397 /* Init_expr will be updated by vect_update_ivs_after_vectorizer,
1398 if niters or vf is unknown:
1399 For shift, when the shift amount >= precision, the behavior is undefined.
1400 For mult, we don't know how to generate
1401 init_expr * pow (step, niters) for variable niters.
1402 For neg, it should be OK, since niters of the vectorized main loop
1403 will always be a multiple of 2. */
1404 if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1405 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
1406 && induction_type != vect_step_op_neg)
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "Peeling for epilogue is not supported"
1411 " for nonlinear induction except neg"
1412 " when iteration count is unknown.\n");
1413 return false;
1416 /* Also we don't support peeling for neg when niter is variable.
1417 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
1418 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
1419 if ((niters_skip != NULL_TREE
1420 && TREE_CODE (niters_skip) != INTEGER_CST)
1421 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
1422 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "Peeling for alignement is not supported"
1427 " for nonlinear induction when niters_skip"
1428 " is not constant.\n");
1429 return false;
1432 return true;
1435 /* Function vect_can_advance_ivs_p
1437 In case the number of iterations that LOOP iterates is unknown at compile
1438 time, an epilog loop will be generated, and the loop induction variables
1439 (IVs) will be "advanced" to the value they are supposed to take just before
1440 the epilog loop. Here we check that the access function of the loop IVs
1441 and the expression that represents the loop bound are simple enough.
1442 These restrictions will be relaxed in the future. */
1444 bool
1445 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1447 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1448 basic_block bb = loop->header;
1449 gphi_iterator gsi;
1451 /* Analyze phi functions of the loop header. */
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1455 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1457 tree evolution_part;
1458 enum vect_induction_op_type induction_type;
1460 gphi *phi = gsi.phi ();
1461 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1464 phi_info->stmt);
1466 /* Skip virtual phi's. The data dependences that are associated with
1467 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1469 Skip reduction phis. */
1470 if (!iv_phi_p (phi_info))
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_NOTE, vect_location,
1474 "reduc or virtual phi. skip.\n");
1475 continue;
1478 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1479 if (induction_type != vect_step_op_add)
1481 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
1482 return false;
1484 continue;
1487 /* Analyze the evolution function. */
1489 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1490 if (evolution_part == NULL_TREE)
1492 if (dump_enabled_p ())
1493 dump_printf (MSG_MISSED_OPTIMIZATION,
1494 "No access function or evolution.\n");
1495 return false;
1498 /* FORNOW: We do not transform initial conditions of IVs
1499 which evolution functions are not invariants in the loop. */
1501 if (!expr_invariant_in_loop_p (loop, evolution_part))
1503 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505 "evolution not invariant in loop.\n");
1506 return false;
1509 /* FORNOW: We do not transform initial conditions of IVs
1510 which evolution functions are a polynomial of degree >= 2. */
1512 if (tree_is_chrec (evolution_part))
1514 if (dump_enabled_p ())
1515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1516 "evolution is chrec.\n");
1517 return false;
1521 return true;
1525 /* Function vect_update_ivs_after_vectorizer.
1527 "Advance" the induction variables of LOOP to the value they should take
1528 after the execution of LOOP. This is currently necessary because the
1529 vectorizer does not handle induction variables that are used after the
1530 loop. Such a situation occurs when the last iterations of LOOP are
1531 peeled, because:
1532 1. We introduced new uses after LOOP for IVs that were not originally used
1533 after LOOP: the IVs of LOOP are now used by an epilog loop.
1534 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1535 times, whereas the loop IVs should be bumped N times.
1537 Input:
1538 - LOOP - a loop that is going to be vectorized. The last few iterations
1539 of LOOP were peeled.
1540 - NITERS - the number of iterations that LOOP executes (before it is
1541 vectorized), i.e., the number of times the IVs should be bumped.
1542 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1543 coming out from LOOP on which there are uses of the LOOP ivs
1544 (this is the path from LOOP->exit to epilog_loop->preheader).
1546 The new definitions of the ivs are placed in LOOP->exit.
1547 The phi args associated with the edge UPDATE_E in the bb
1548 UPDATE_E->dest are updated accordingly.
1550 Assumption 1: Like the rest of the vectorizer, this function assumes
1551 a single loop exit that has a single predecessor.
1553 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1554 organized in the same order.
1556 Assumption 3: The access function of the ivs is simple enough (see
1557 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1559 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1560 coming out of LOOP on which the ivs of LOOP are used (this is the path
1561 that leads to the epilog loop; other paths skip the epilog loop). This
1562 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1563 needs to have its phis updated.
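   As an example, for a simple additive IV with init_expr 0 and step_expr 1
   that counts the scalar iterations, the code below computes
   ni = init_expr + niters * step_expr == niters, and that value becomes
   the PHI argument for edge UPDATE_E.  */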
1566 static void
1567 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1568 tree niters, edge update_e)
1570 gphi_iterator gsi, gsi1;
1571 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1572 basic_block update_bb = update_e->dest;
1573 basic_block exit_bb = single_exit (loop)->dest;
1575 /* Make sure there exists a single-predecessor exit bb: */
1576 gcc_assert (single_pred_p (exit_bb));
1577 gcc_assert (single_succ_edge (exit_bb) == update_e);
1579 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1580 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1581 gsi_next (&gsi), gsi_next (&gsi1))
1583 tree init_expr;
1584 tree step_expr, off;
1585 tree type;
1586 tree var, ni, ni_name;
1587 gimple_stmt_iterator last_gsi;
1589 gphi *phi = gsi.phi ();
1590 gphi *phi1 = gsi1.phi ();
1591 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "vect_update_ivs_after_vectorizer: phi: %G",
1595 (gimple *) phi);
1597 /* Skip reduction and virtual phis. */
1598 if (!iv_phi_p (phi_info))
1600 if (dump_enabled_p ())
1601 dump_printf_loc (MSG_NOTE, vect_location,
1602 "reduc or virtual phi. skip.\n");
1603 continue;
1606 type = TREE_TYPE (gimple_phi_result (phi));
1607 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1608 step_expr = unshare_expr (step_expr);
1610 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1611 of degree >= 2 or exponential. */
1612 gcc_assert (!tree_is_chrec (step_expr));
1614 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1615 gimple_seq stmts = NULL;
1616 enum vect_induction_op_type induction_type
1617 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1619 if (induction_type == vect_step_op_add)
1621 tree stype = TREE_TYPE (step_expr);
1622 off = fold_build2 (MULT_EXPR, stype,
1623 fold_convert (stype, niters), step_expr);
1624 if (POINTER_TYPE_P (type))
1625 ni = fold_build_pointer_plus (init_expr, off);
1626 else
1627 ni = fold_convert (type,
1628 fold_build2 (PLUS_EXPR, stype,
1629 fold_convert (stype, init_expr),
1630 off));
1632 /* Don't bother calling vect_peel_nonlinear_iv_init. */
1633 else if (induction_type == vect_step_op_neg)
1634 ni = init_expr;
1635 else
1636 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
1637 niters, step_expr,
1638 induction_type);
1640 var = create_tmp_var (type, "tmp");
1642 last_gsi = gsi_last_bb (exit_bb);
1643 gimple_seq new_stmts = NULL;
1644 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1645 /* Exit_bb shouldn't be empty. */
1646 if (!gsi_end_p (last_gsi))
1648 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
1649 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1651 else
1653 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
1654 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1657 /* Fix phi expressions in the successor bb. */
1658 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1662 /* Return a gimple value containing the misalignment (measured in vector
1663 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1664 it is away from a perfectly aligned address. Add any new statements
1665 to SEQ. */
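/* For example (hypothetical values): with 4-byte vector elements, a
   16-byte target alignment and a start address of 0x1004, the code below
   computes misalign_in_bytes = 0x1004 & 15 == 4 and
   misalign_in_elems = 4 >> 2 == 1.  */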
1667 static tree
1668 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1670 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1671 stmt_vec_info stmt_info = dr_info->stmt;
1672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1674 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1675 unsigned HOST_WIDE_INT target_align_c;
1676 tree target_align_minus_1;
1678 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1679 size_zero_node) < 0;
1680 tree offset = (negative
1681 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1682 * TREE_INT_CST_LOW
1683 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1684 : size_zero_node);
1685 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1686 stmt_info, seq,
1687 offset);
1688 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1689 if (target_align.is_constant (&target_align_c))
1690 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1691 else
1693 tree vla = build_int_cst (type, target_align);
1694 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1695 fold_build2 (MINUS_EXPR, type,
1696 build_int_cst (type, 0), vla));
1697 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1698 build_int_cst (type, 1));
1701 HOST_WIDE_INT elem_size
1702 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1703 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1705 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1706 tree int_start_addr = fold_convert (type, start_addr);
1707 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1708 target_align_minus_1);
1710 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1711 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1712 elem_size_log);
1714 return misalign_in_elems;
1717 /* Function vect_gen_prolog_loop_niters
1719 Generate the number of iterations which should be peeled as prolog for the
1720 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1721 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1722 As a result, after the execution of this loop, the data reference DR will
1723 refer to an aligned location. The following computation is generated:
1725 If the misalignment of DR is known at compile time:
1726      addr_mis = DR_MISALIGNMENT (dr);
1727 Else, compute address misalignment in bytes:
1728 addr_mis = addr & (target_align - 1)
1730 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1732 (elem_size = element type size; an element is the scalar element whose type
1733 is the inner type of the vectype)
1735 The computations will be emitted at the end of BB. We also compute and
1736    store the upper bound (inclusive) of the result in BOUND.
1738 When the step of the data-ref in the loop is not 1 (as in interleaved data
1739 and SLP), the number of iterations of the prolog must be divided by the step
1740 (which is equal to the size of interleaved group).
1742 The above formulas assume that VF == number of elements in the vector. This
1743 may not hold when there are multiple-types in the loop.
1744 In this case, for some data-references in the loop the VF does not represent
1745 the number of elements that fit in the vector. Therefore, instead of VF we
1746 use TYPE_VECTOR_SUBPARTS. */
1748 static tree
1749 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1750 basic_block bb, int *bound)
1752 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1753 tree var;
1754 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1755 gimple_seq stmts = NULL, new_stmts = NULL;
1756 tree iters, iters_name;
1757 stmt_vec_info stmt_info = dr_info->stmt;
1758 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1759 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1761 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1763 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1765 if (dump_enabled_p ())
1766 dump_printf_loc (MSG_NOTE, vect_location,
1767 "known peeling = %d.\n", npeel);
1769 iters = build_int_cst (niters_type, npeel);
1770 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1772 else
1774 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1775 tree type = TREE_TYPE (misalign_in_elems);
1776 HOST_WIDE_INT elem_size
1777 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1778 /* We only do prolog peeling if the target alignment is known at compile
1779 time. */
1780 poly_uint64 align_in_elems =
1781 exact_div (target_align, elem_size);
1782 tree align_in_elems_minus_1 =
1783 build_int_cst (type, align_in_elems - 1);
1784 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1786 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1787 & (align_in_elems - 1)). */
1788 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1789 size_zero_node) < 0;
1790 if (negative)
1791 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1792 align_in_elems_tree);
1793 else
1794 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1795 misalign_in_elems);
1796 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1797 iters = fold_convert (niters_type, iters);
1798 unsigned HOST_WIDE_INT align_in_elems_c;
1799 if (align_in_elems.is_constant (&align_in_elems_c))
1800 *bound = align_in_elems_c - 1;
1801 else
1802 *bound = -1;
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "niters for prolog loop: %T\n", iters);
1809 var = create_tmp_var (niters_type, "prolog_loop_niters");
1810 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1812 if (new_stmts)
1813 gimple_seq_add_seq (&stmts, new_stmts);
1814 if (stmts)
1816 gcc_assert (single_succ_p (bb));
1817 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1818 if (gsi_end_p (gsi))
1819 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1820 else
1821 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1823 return iters_name;
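/* Illustrative sketch, not part of the vectorizer: the runtime expression
   built above for a forward-stepping access.  With align_in_elems == 8
   (a power of two) a misalignment of 3 elements yields 5 peeled scalar
   iterations, and a misalignment of 0 yields 0.  */

static unsigned
prolog_niters_sketch (unsigned align_in_elems, unsigned misalign_in_elems)
{
  return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
}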
1827 /* Function vect_update_init_of_dr
1829 If CODE is PLUS, the vector loop starts NITERS iterations after the
1830 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1831 iterations before the scalar one (using masking to skip inactive
1832 elements). This function updates the information recorded in DR to
1833 account for the difference. Specifically, it updates the OFFSET
1834 field of DR_INFO. */
1836 static void
1837 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1839 struct data_reference *dr = dr_info->dr;
1840 tree offset = dr_info->offset;
1841 if (!offset)
1842 offset = build_zero_cst (sizetype);
1844 niters = fold_build2 (MULT_EXPR, sizetype,
1845 fold_convert (sizetype, niters),
1846 fold_convert (sizetype, DR_STEP (dr)));
1847 offset = fold_build2 (code, sizetype,
1848 fold_convert (sizetype, offset), niters);
1849 dr_info->offset = offset;
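/* Illustrative sketch, not part of the vectorizer: the offset adjustment
   above in plain integer arithmetic for the PLUS_EXPR case.  A data
   reference with a byte step of 4 that is peeled by 3 iterations has its
   recorded offset advanced by 12 bytes.  */

static long
dr_offset_after_peeling_sketch (long offset, long niters, long step)
{
  return offset + niters * step;
}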
1853 /* Function vect_update_inits_of_drs
1855    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1856    CODE and NITERS are as for vect_update_init_of_dr.  */
1858 void
1859 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1860 tree_code code)
1862 unsigned int i;
1863 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1864 struct data_reference *dr;
1866 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1868 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1869      here, but since we might use these niters to update the epilogue's niters
1870 and data references we can't insert them here as this definition might not
1871 always dominate its uses. */
1872 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1873 niters = fold_convert (sizetype, niters);
1875 FOR_EACH_VEC_ELT (datarefs, i, dr)
1877 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1878 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1879 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1880 vect_update_init_of_dr (dr_info, niters, code);
1884 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1885 by masking. This involves calculating the number of iterations to
1886 be peeled and then aligning all memory references appropriately. */
1888 void
1889 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1891 tree misalign_in_elems;
1892 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1894 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1896 /* From the information recorded in LOOP_VINFO get the number of iterations
1897 that need to be skipped via masking. */
1898 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1900 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1901 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1902 misalign_in_elems = build_int_cst (type, misalign);
1904 else
1906 gimple_seq seq1 = NULL, seq2 = NULL;
1907 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1908 misalign_in_elems = fold_convert (type, misalign_in_elems);
1909 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1910 &seq2, true, NULL_TREE);
1911 gimple_seq_add_seq (&seq1, seq2);
1912 if (seq1)
1914 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1915 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1916 gcc_assert (!new_bb);
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "misalignment for fully-masked loop: %T\n",
1923 misalign_in_elems);
1925 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1927 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
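/* Worked example (illustrative, not from the source): with VF == 8 and a
   known peeling amount of 3, the computation above records 8 - 3 == 5 as the
   number of initial elements to skip via masking, so the first vector
   iteration handles only the last 3 lanes and the access becomes aligned.  */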
1930 /* This function builds ni_name = number of iterations. Statements
1931 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1932    it to TRUE if a new ssa_var is generated.  */
1934 tree
1935 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1937 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1938 if (TREE_CODE (ni) == INTEGER_CST)
1939 return ni;
1940 else
1942 tree ni_name, var;
1943 gimple_seq stmts = NULL;
1944 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1946 var = create_tmp_var (TREE_TYPE (ni), "niters");
1947 ni_name = force_gimple_operand (ni, &stmts, false, var);
1948 if (stmts)
1950 gsi_insert_seq_on_edge_immediate (pe, stmts);
1951 if (new_var_p != NULL)
1952 *new_var_p = true;
1955 return ni_name;
1959 /* Calculate the number of iterations above which the vectorized loop will be
1960    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1961 of prolog loop. If it's integer const, the integer number is also passed
1962 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1963 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1964 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1965 threshold below which the scalar (rather than vectorized) loop will be
1966 executed. This function stores the upper bound (inclusive) of the result
1967 in BOUND_SCALAR. */
1969 static tree
1970 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1971 int bound_prolog, poly_int64 bound_epilog, int th,
1972 poly_uint64 *bound_scalar,
1973 bool check_profitability)
1975 tree type = TREE_TYPE (niters_prolog);
1976 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1977 build_int_cst (type, bound_epilog));
1979 *bound_scalar = bound_prolog + bound_epilog;
1980 if (check_profitability)
1982 /* TH indicates the minimum niters of vectorized loop, while we
1983 compute the maximum niters of scalar loop. */
1984 th--;
1985 /* Peeling for constant times. */
1986 if (int_niters_prolog >= 0)
1988 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1989 return build_int_cst (type, *bound_scalar);
1991 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1992 and BOUND_EPILOG are inclusive upper bounds. */
1993 if (known_ge (th, bound_prolog + bound_epilog))
1995 *bound_scalar = th;
1996 return build_int_cst (type, th);
1998 /* Need to do runtime comparison. */
1999 else if (maybe_gt (th, bound_epilog))
2001 *bound_scalar = upper_bound (*bound_scalar, th);
2002 return fold_build2 (MAX_EXPR, type,
2003 build_int_cst (type, th), niters);
2006 return niters;
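/* Illustrative sketch, not part of the vectorizer: the constant-prolog
   clamping above expressed as a hypothetical helper.  With a profitability
   threshold TH the scalar loop runs at most TH - 1 iterations, so e.g.
   niters_prolog == 2, bound_epilog == 7 and th == 12 give
   max (2 + 7, 11) == 11.  */

static long
scalar_niters_bound_sketch (long int_niters_prolog, long bound_epilog, long th)
{
  long bound = int_niters_prolog + bound_epilog;
  long th_bound = th - 1;
  return bound > th_bound ? bound : th_bound;
}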
2009 /* NITERS is the number of times that the original scalar loop executes
2010 after peeling. Work out the maximum number of iterations N that can
2011 be handled by the vectorized form of the loop and then either:
2013 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2015 niters_vector = N
2017 b) set *STEP_VECTOR_PTR to one and generate:
2019 niters_vector = N / vf
2021 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2022 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2023 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
2025 void
2026 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2027 tree *niters_vector_ptr, tree *step_vector_ptr,
2028 bool niters_no_overflow)
2030 tree ni_minus_gap, var;
2031 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2032 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2033 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2034 tree log_vf = NULL_TREE;
2036 /* If epilogue loop is required because of data accesses with gaps, we
2037 subtract one iteration from the total number of iterations here for
2038 correct calculation of RATIO. */
2039 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2041 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2042 build_one_cst (type));
2043 if (!is_gimple_val (ni_minus_gap))
2045 var = create_tmp_var (type, "ni_gap");
2046 gimple *stmts = NULL;
2047 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2048 true, var);
2049 gsi_insert_seq_on_edge_immediate (pe, stmts);
2052 else
2053 ni_minus_gap = niters;
2055 /* To silence some unexpected warnings, simply initialize to 0. */
2056 unsigned HOST_WIDE_INT const_vf = 0;
2057 if (vf.is_constant (&const_vf)
2058 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2060 /* Create: niters >> log2(vf) */
2061 /* If it's known that niters == number of latch executions + 1 doesn't
2062 overflow, we can generate niters >> log2(vf); otherwise we generate
2063 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2064 will be at least one. */
2065 log_vf = build_int_cst (type, exact_log2 (const_vf));
2066 if (niters_no_overflow)
2067 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2068 else
2069 niters_vector
2070 = fold_build2 (PLUS_EXPR, type,
2071 fold_build2 (RSHIFT_EXPR, type,
2072 fold_build2 (MINUS_EXPR, type,
2073 ni_minus_gap,
2074 build_int_cst (type, vf)),
2075 log_vf),
2076 build_int_cst (type, 1));
2077 step_vector = build_one_cst (type);
2079 else
2081 niters_vector = ni_minus_gap;
2082 step_vector = build_int_cst (type, vf);
2085 if (!is_gimple_val (niters_vector))
2087 var = create_tmp_var (type, "bnd");
2088 gimple_seq stmts = NULL;
2089 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2090 gsi_insert_seq_on_edge_immediate (pe, stmts);
2091       /* The peeling algorithm guarantees that the vector loop bound is at least
2092 	 ONE; we set range information to make the niters analyzer's life easier.
2093 	 Note the number of latch iterations can be TYPE_MAX_VALUE so
2094 	 we have to represent the vector niter (TYPE_MAX_VALUE + 1) >> log_vf.  */
2095 if (stmts != NULL && log_vf)
2097 if (niters_no_overflow)
2099 value_range vr (type,
2100 wi::one (TYPE_PRECISION (type)),
2101 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2102 TYPE_SIGN (type)),
2103 exact_log2 (const_vf),
2104 TYPE_SIGN (type)));
2105 set_range_info (niters_vector, vr);
2107 /* For VF == 1 the vector IV might also overflow so we cannot
2108 assert a minimum value of 1. */
2109 else if (const_vf > 1)
2111 value_range vr (type,
2112 wi::one (TYPE_PRECISION (type)),
2113 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2114 TYPE_SIGN (type))
2115 - (const_vf - 1),
2116 exact_log2 (const_vf), TYPE_SIGN (type))
2117 + 1);
2118 set_range_info (niters_vector, vr);
2122 *niters_vector_ptr = niters_vector;
2123 *step_vector_ptr = step_vector;
2125 return;
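/* Illustrative sketch, not part of the vectorizer: the two shift-based
   forms of the vector iteration count used above, assuming a constant
   power-of-two VF.  Both give 4 for niters == 17 and vf == 4; the second
   form additionally tolerates NITERS wrapping to 0, relying only on the
   guarantee that at least VF iterations remain.  */

static unsigned long
vector_niters_sketch (unsigned long niters, unsigned long vf,
		      int niters_no_overflow)
{
  unsigned long log_vf = __builtin_ctzl (vf);
  if (niters_no_overflow)
    return niters >> log_vf;
  return ((niters - vf) >> log_vf) + 1;
}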
2128 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2129 loop specified by LOOP_VINFO after vectorization, compute the number
2130 of iterations before vectorization (niters_vector * vf) and store it
2131 to NITERS_VECTOR_MULT_VF_PTR. */
2133 static void
2134 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2135 tree niters_vector,
2136 tree *niters_vector_mult_vf_ptr)
2138 /* We should be using a step_vector of VF if VF is variable. */
2139 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2140 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2141 tree type = TREE_TYPE (niters_vector);
2142 tree log_vf = build_int_cst (type, exact_log2 (vf));
2143 basic_block exit_bb = single_exit (loop)->dest;
2145 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2146 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2147 niters_vector, log_vf);
2148 if (!is_gimple_val (niters_vector_mult_vf))
2150 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2151 gimple_seq stmts = NULL;
2152 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2153 &stmts, true, var);
2154 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2155 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2157 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2160 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2161    This function searches for the corresponding lcssa phi node in the exit
2162    bb of LOOP.  If it is found, return the phi result; otherwise return
2163    NULL.  */
2165 static tree
2166 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2167 gphi *lcssa_phi)
2169 gphi_iterator gsi;
2170 edge e = single_exit (loop);
2172 gcc_assert (single_pred_p (e->dest));
2173 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2175 gphi *phi = gsi.phi ();
2176 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2177 PHI_ARG_DEF (lcssa_phi, 0), 0))
2178 return PHI_RESULT (phi);
2180 return NULL_TREE;
2183 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2184    from SECOND/FIRST and puts it at the original loop's preheader/exit
2185    edge; the two loops are arranged as below:
2187 preheader_a:
2188 first_loop:
2189 header_a:
2190 i_1 = PHI<i_0, i_2>;
2192 i_2 = i_1 + 1;
2193 if (cond_a)
2194 goto latch_a;
2195 else
2196 goto between_bb;
2197 latch_a:
2198 goto header_a;
2200 between_bb:
2201 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2203 second_loop:
2204 header_b:
2205 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2206 or with i_2 if no LCSSA phi is created
2207 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2209 i_4 = i_3 + 1;
2210 if (cond_b)
2211 goto latch_b;
2212 else
2213 goto exit_bb;
2214 latch_b:
2215 goto header_b;
2217 exit_bb:
2219    This function creates loop closed SSA for the first loop and updates the
2220    second loop's PHI nodes by replacing the argument on the incoming edge with
2221    the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2222    is false, loop closed ssa phis will only be created for non-iv phis of
2223    the first loop.
2225    This function assumes the exit bb of the first loop is the preheader bb of
2226    the second loop, i.e., between_bb in the example code.  With the PHIs
2227    updated, the second loop will execute the remaining iterations of the first.  */
2229 static void
2230 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2231 class loop *first, class loop *second,
2232 bool create_lcssa_for_iv_phis)
2234 gphi_iterator gsi_update, gsi_orig;
2235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2238 edge second_preheader_e = loop_preheader_edge (second);
2239 basic_block between_bb = single_exit (first)->dest;
2241 gcc_assert (between_bb == second_preheader_e->src);
2242 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2243 /* Either the first loop or the second is the loop to be vectorized. */
2244 gcc_assert (loop == first || loop == second);
2246 for (gsi_orig = gsi_start_phis (first->header),
2247 gsi_update = gsi_start_phis (second->header);
2248 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2249 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2251 gphi *orig_phi = gsi_orig.phi ();
2252 gphi *update_phi = gsi_update.phi ();
2254 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2255 /* Generate lcssa PHI node for the first loop. */
2256 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2257 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2258 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2260 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2261 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2262 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2263 arg = new_res;
2266 /* Update PHI node in the second loop by replacing arg on the loop's
2267 incoming edge. */
2268 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2271 /* For epilogue peeling we have to make sure to copy all LC PHIs
2272 for correct vectorization of live stmts. */
2273 if (loop == first)
2275 basic_block orig_exit = single_exit (second)->dest;
2276 for (gsi_orig = gsi_start_phis (orig_exit);
2277 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2279 gphi *orig_phi = gsi_orig.phi ();
2280 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2281 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2282 continue;
2284 /* Already created in the above loop. */
2285 if (find_guard_arg (first, second, orig_phi))
2286 continue;
2288 tree new_res = copy_ssa_name (orig_arg);
2289 gphi *lcphi = create_phi_node (new_res, between_bb);
2290 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2295 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2296 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2297 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2298 appear like below:
2300 guard_bb:
2301 if (cond)
2302 goto merge_bb;
2303 else
2304 goto skip_loop;
2306 skip_loop:
2307 header_a:
2308 i_1 = PHI<i_0, i_2>;
2310 i_2 = i_1 + 1;
2311 if (cond_a)
2312 goto latch_a;
2313 else
2314 goto exit_a;
2315 latch_a:
2316 goto header_a;
2318 exit_a:
2319 i_5 = PHI<i_2>;
2321 merge_bb:
2322 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2324 update_loop:
2325 header_b:
2326 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2328 i_4 = i_3 + 1;
2329 if (cond_b)
2330 goto latch_b;
2331 else
2332 goto exit_bb;
2333 latch_b:
2334 goto header_b;
2336 exit_bb:
2338 This function creates PHI nodes at merge_bb and replaces the use of i_5
2339    in the update_loop's PHI node with the result of the new PHI.  */
2341 static void
2342 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2343 class loop *update_loop,
2344 edge guard_edge, edge merge_edge)
2346 location_t merge_loc, guard_loc;
2347 edge orig_e = loop_preheader_edge (skip_loop);
2348 edge update_e = loop_preheader_edge (update_loop);
2349 gphi_iterator gsi_orig, gsi_update;
2351 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2352 gsi_update = gsi_start_phis (update_loop->header));
2353 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2354 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2356 gphi *orig_phi = gsi_orig.phi ();
2357 gphi *update_phi = gsi_update.phi ();
2359 /* Generate new phi node at merge bb of the guard. */
2360 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2361 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2363 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2364 args in NEW_PHI for these edges. */
2365 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2366 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2367 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2368 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2369 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2370 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2372 /* Update phi in UPDATE_PHI. */
2373 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2377 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2378 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2379 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2380 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2381 The CFG looks like:
2383 loop:
2384 header_a:
2385 i_1 = PHI<i_0, i_2>;
2387 i_2 = i_1 + 1;
2388 if (cond_a)
2389 goto latch_a;
2390 else
2391 goto exit_a;
2392 latch_a:
2393 goto header_a;
2395 exit_a:
2397 guard_bb:
2398 if (cond)
2399 goto merge_bb;
2400 else
2401 goto epilog_loop;
2403 ;; fall_through_bb
2405 epilog_loop:
2406 header_b:
2407 i_3 = PHI<i_2, i_4>;
2409 i_4 = i_3 + 1;
2410 if (cond_b)
2411 goto latch_b;
2412 else
2413 goto merge_bb;
2414 latch_b:
2415 goto header_b;
2417 merge_bb:
2418 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2420 exit_bb:
2421 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2423    For each name used outside EPILOG (i.e. for each name that has an lcssa
2424 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2425 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2426 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2427 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2428 in exit_bb will also be updated. */
2430 static void
2431 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2432 edge guard_edge, edge merge_edge)
2434 gphi_iterator gsi;
2435 basic_block merge_bb = guard_edge->dest;
2437 gcc_assert (single_succ_p (merge_bb));
2438 edge e = single_succ_edge (merge_bb);
2439 basic_block exit_bb = e->dest;
2440 gcc_assert (single_pred_p (exit_bb));
2441 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2443 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2445 gphi *update_phi = gsi.phi ();
2446 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2448 tree merge_arg = NULL_TREE;
2450 /* If the old argument is a SSA_NAME use its current_def. */
2451 if (TREE_CODE (old_arg) == SSA_NAME)
2452 merge_arg = get_current_def (old_arg);
2453 /* If it's a constant or doesn't have a current_def, just use the old
2454 argument. */
2455 if (!merge_arg)
2456 merge_arg = old_arg;
2458 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2459 /* If the var is live after loop but not a reduction, we simply
2460 use the old arg. */
2461 if (!guard_arg)
2462 guard_arg = old_arg;
2464 /* Create new phi node in MERGE_BB: */
2465 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2466 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2468       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2469 the two PHI args in merge_phi for these edges. */
2470 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2471 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2473 /* Update the original phi in exit_bb. */
2474 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2478 /* EPILOG loop is duplicated from the original loop for vectorizing,
2479 the arg of its loop closed ssa PHI needs to be updated. */
2481 static void
2482 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2484 gphi_iterator gsi;
2485 basic_block exit_bb = single_exit (epilog)->dest;
2487 gcc_assert (single_pred_p (exit_bb));
2488 edge e = EDGE_PRED (exit_bb, 0);
2489 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2490 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2493 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2494 iterate exactly CONST_NITERS times. Make a final decision about
2495 whether the epilogue loop should be used, returning true if so. */
2497 static bool
2498 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2499 unsigned HOST_WIDE_INT const_niters)
2501 /* Avoid wrap-around when computing const_niters - 1. Also reject
2502 using an epilogue loop for a single scalar iteration, even if
2503 we could in principle implement that using partial vectors. */
2504 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2505 if (const_niters <= gap_niters + 1)
2506 return false;
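    /* Worked example (illustrative, not from the source): with 1 gap iteration
       an epilogue of const_niters == 2 is rejected above (2 <= 1 + 1), since
       only one useful scalar iteration would remain; the same test also keeps
       the const_niters - 1 computation below from wrapping around when
       const_niters is 0.  */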
2508 /* Install the number of iterations. */
2509 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2510 tree niters_tree = build_int_cst (niters_type, const_niters);
2511 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2513 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2514 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2516 /* Decide what to do if the number of epilogue iterations is not
2517 a multiple of the epilogue loop's vectorization factor. */
2518 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2521 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2522 Return a value that equals:
2524 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2525 - SKIP_VALUE when the main loop is skipped. */
2527 tree
2528 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2529 tree skip_value)
2531 gcc_assert (loop_vinfo->main_loop_edge);
2533 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2534 basic_block bb = loop_vinfo->main_loop_edge->dest;
2535 gphi *new_phi = create_phi_node (phi_result, bb);
2536 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2537 UNKNOWN_LOCATION);
2538 add_phi_arg (new_phi, skip_value,
2539 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2540 return phi_result;
2543 /* Function vect_do_peeling.
2545 Input:
2546 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2548 preheader:
2549 LOOP:
2550 header_bb:
2551 loop_body
2552 if (exit_loop_cond) goto exit_bb
2553 else goto header_bb
2554 exit_bb:
2556 - NITERS: The number of iterations of the loop.
2557 - NITERSM1: The number of iterations of the loop's latch.
2558 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2559 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2560 CHECK_PROFITABILITY is true.
2561 Output:
2562 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2563 iterate after vectorization; see vect_set_loop_condition for details.
2564 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2565 should be set to the number of scalar iterations handled by the
2566 vector loop. The SSA name is only used on exit from the loop.
2568 This function peels prolog and epilog from the loop, adds guards skipping
2569 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2570 would look like:
2572 guard_bb_1:
2573 if (prefer_scalar_loop) goto merge_bb_1
2574 else goto guard_bb_2
2576 guard_bb_2:
2577 if (skip_prolog) goto merge_bb_2
2578 else goto prolog_preheader
2580 prolog_preheader:
2581 PROLOG:
2582 prolog_header_bb:
2583 prolog_body
2584 if (exit_prolog_cond) goto prolog_exit_bb
2585 else goto prolog_header_bb
2586 prolog_exit_bb:
2588 merge_bb_2:
2590 vector_preheader:
2591 VECTOR LOOP:
2592 vector_header_bb:
2593 vector_body
2594 if (exit_vector_cond) goto vector_exit_bb
2595 else goto vector_header_bb
2596 vector_exit_bb:
2598 guard_bb_3:
2599 if (skip_epilog) goto merge_bb_3
2600 else goto epilog_preheader
2602 merge_bb_1:
2604 epilog_preheader:
2605 EPILOG:
2606 epilog_header_bb:
2607 epilog_body
2608 if (exit_epilog_cond) goto merge_bb_3
2609 else goto epilog_header_bb
2611 merge_bb_3:
2613    Note that this function peels the prolog and epilog, and adds the
2614    corresponding guards, only when necessary.
2615 This function returns the epilogue loop if a decision was made to vectorize
2616 it, otherwise NULL.
2618 The analysis resulting in this epilogue loop's loop_vec_info was performed
2619 in the same vect_analyze_loop call as the main loop's. At that time
2620 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2621 vectorization factors than the main loop. This list is stored in the main
2622    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2623 vectorize the epilogue loop for a lower vectorization factor, the
2624 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2625 updated and linked to the epilogue loop. This is later used to vectorize
2626 the epilogue. The reason the loop_vec_info needs updating is that it was
2627 constructed based on the original main loop, and the epilogue loop is a
2628 copy of this loop, so all links pointing to statements in the original loop
2629 need updating. Furthermore, these loop_vec_infos share the
2630 data_reference's records, which will also need to be updated.
2632 TODO: Guard for prefer_scalar_loop should be emitted along with
2633 versioning conditions if loop versioning is needed. */
2636 class loop *
2637 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2638 tree *niters_vector, tree *step_vector,
2639 tree *niters_vector_mult_vf_var, int th,
2640 bool check_profitability, bool niters_no_overflow,
2641 tree *advance)
2643 edge e, guard_e;
2644 tree type = TREE_TYPE (niters), guard_cond;
2645 basic_block guard_bb, guard_to;
2646 profile_probability prob_prolog, prob_vector, prob_epilog;
2647 int estimated_vf;
2648 int prolog_peeling = 0;
2649 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2650 bool vect_epilogues_updated_niters = false;
2651 /* We currently do not support prolog peeling if the target alignment is not
2652 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2653 target alignment being constant. */
2654 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2655 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2656 return NULL;
2658 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2659 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2661 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2662 poly_uint64 bound_epilog = 0;
2663 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2664 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2665 bound_epilog += vf - 1;
2666 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2667 bound_epilog += 1;
2668 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2669 poly_uint64 bound_scalar = bound_epilog;
2671 if (!prolog_peeling && !epilog_peeling)
2672 return NULL;
2674 /* Before doing any peeling make sure to reset debug binds outside of
2675      the loop referring to defs not in LC SSA.  */
2676 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2677 for (unsigned i = 0; i < loop->num_nodes; ++i)
2679 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2680 imm_use_iterator ui;
2681 gimple *use_stmt;
2682 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2683 gsi_next (&gsi))
2685 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2686 if (gimple_debug_bind_p (use_stmt)
2687 && loop != gimple_bb (use_stmt)->loop_father
2688 && !flow_loop_nested_p (loop,
2689 gimple_bb (use_stmt)->loop_father))
2691 gimple_debug_bind_reset_value (use_stmt);
2692 update_stmt (use_stmt);
2695 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2696 gsi_next (&gsi))
2698 ssa_op_iter op_iter;
2699 def_operand_p def_p;
2700 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2701 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2702 if (gimple_debug_bind_p (use_stmt)
2703 && loop != gimple_bb (use_stmt)->loop_father
2704 && !flow_loop_nested_p (loop,
2705 gimple_bb (use_stmt)->loop_father))
2707 gimple_debug_bind_reset_value (use_stmt);
2708 update_stmt (use_stmt);
2713 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2714 estimated_vf = vect_vf_for_cost (loop_vinfo);
2715 if (estimated_vf == 2)
2716 estimated_vf = 3;
2717 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2718 .apply_scale (estimated_vf - 1, estimated_vf);
2720 class loop *prolog, *epilog = NULL;
2721 class loop *first_loop = loop;
2722 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2724 /* SSA form needs to be up-to-date since we are going to manually
2725 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
2726      SSA update state after that, so we have to make sure not to lose any
2727 pending update needs. */
2728 gcc_assert (!need_ssa_update_p (cfun));
2730 /* If we're vectorizing an epilogue loop, we have ensured that the
2731 virtual operand is in SSA form throughout the vectorized main loop.
2732 Normally it is possible to trace the updated
2733 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2734 back to scalar-stmt vuses, meaning that the effect of the SSA update
2735 remains local to the main loop. However, there are rare cases in
2736 which the vectorized loop should have vdefs even when the original scalar
2737 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2738 introduces clobbers of the temporary vector array, which in turn
2739 needs new vdefs. If the scalar loop doesn't write to memory, these
2740 new vdefs will be the only ones in the vector loop.
2741      We are currently deferring the update of virtual SSA form and the creation
2742      of a virtual PHI for this case, so we do not have to make sure the
2743 newly introduced virtual def is in LCSSA form. */
2745 if (MAY_HAVE_DEBUG_BIND_STMTS)
2747 gcc_assert (!adjust_vec.exists ());
2748 adjust_vec.create (32);
2750 initialize_original_copy_tables ();
2752 /* Record the anchor bb at which the guard should be placed if the scalar
2753 loop might be preferred. */
2754 basic_block anchor = loop_preheader_edge (loop)->src;
2756 /* Generate the number of iterations for the prolog loop. We do this here
2757 so that we can also get the upper bound on the number of iterations. */
2758 tree niters_prolog;
2759 int bound_prolog = 0;
2760 if (prolog_peeling)
2761 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2762 &bound_prolog);
2763 else
2764 niters_prolog = build_int_cst (type, 0);
2766 loop_vec_info epilogue_vinfo = NULL;
2767 if (vect_epilogues)
2769 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2770 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2773 tree niters_vector_mult_vf = NULL_TREE;
2774   /* Save NITERS before the loop, as it may be changed by the prologue.  */
2775 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2776 edge update_e = NULL, skip_e = NULL;
2777 unsigned int lowest_vf = constant_lower_bound (vf);
2778 /* If we know the number of scalar iterations for the main loop we should
2779 check whether after the main loop there are enough iterations left over
2780 for the epilogue. */
2781 if (vect_epilogues
2782 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2783 && prolog_peeling >= 0
2784 && known_eq (vf, lowest_vf))
2786 unsigned HOST_WIDE_INT eiters
2787 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2788 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2790 eiters -= prolog_peeling;
2791 eiters
2792 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
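      /* Worked example (illustrative, not from the source): 100 scalar
	 iterations, 1 gap iteration, prolog_peeling == 3 and lowest_vf == 8
	 leave eiters = (100 - 1 - 3) % 8 + 1 == 1 iteration for the epilogue.  */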
2794 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2796 delete epilogue_vinfo;
2797 epilogue_vinfo = NULL;
2798 if (loop_vinfo->epilogue_vinfos.length () == 0)
2800 vect_epilogues = false;
2801 break;
2803 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2804 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2806 vect_epilogues_updated_niters = true;
2808 /* Prolog loop may be skipped. */
2809 bool skip_prolog = (prolog_peeling != 0);
2810   /* Skip to the epilog when there are not enough iterations to enter the
2811      vectorized loop.  If true we should perform runtime checks on the NITERS
2812      to decide whether the current vectorized loop should be skipped.  If we
2813      know the number of scalar iterations, we choose to add such a runtime
2814      check when this number may potentially be smaller than the number of
2815      iterations required to enter the vectorized loop; for this we use the
2816      upper bounds on the prolog and epilog peeling.  When we don't know the
2817      number of iterations and don't require versioning it is because we have
2818      asserted that there are enough scalar iterations to enter the main loop,
2819      so this skip is not necessary.  When we are versioning then we only add
2820      such a skip if we have chosen to vectorize the epilogue.  */
2822 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2823 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2824 bound_prolog + bound_epilog)
2825 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2826 || vect_epilogues));
2827 /* Epilog loop must be executed if the number of iterations for epilog
2828 loop is known at compile time, otherwise we need to add a check at
2829 the end of vector loop and skip to the end of epilog loop. */
2830 bool skip_epilog = (prolog_peeling < 0
2831 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2832 || !vf.is_constant ());
2833 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2834 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2835 skip_epilog = false;
2837 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2838 auto_vec<profile_count> original_counts;
2839 basic_block *original_bbs = NULL;
2841 if (skip_vector)
2843 split_edge (loop_preheader_edge (loop));
2845 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
2847 original_bbs = get_loop_body (loop);
2848 for (unsigned int i = 0; i < loop->num_nodes; i++)
2849 original_counts.safe_push(original_bbs[i]->count);
2852 /* Due to the order in which we peel prolog and epilog, we first
2853 propagate probability to the whole loop. The purpose is to
2854 avoid adjusting probabilities of both prolog and vector loops
2855 separately. Note in this case, the probability of epilog loop
2856 needs to be scaled back later. */
2857 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2858 if (prob_vector.initialized_p ())
2860 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2861 scale_loop_profile (loop, prob_vector, 0);
2865 if (vect_epilogues)
2866 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2867 use the original scalar loop as remaining epilogue if necessary. */
2868 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2869 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2871 if (prolog_peeling)
2873 e = loop_preheader_edge (loop);
2874 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
2876 /* Peel prolog and put it on preheader edge of loop. */
2877 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2878 gcc_assert (prolog);
2879 prolog->force_vectorize = false;
2880 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2881 first_loop = prolog;
2882 reset_original_copy_tables ();
2884 /* Update the number of iterations for prolog loop. */
2885 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2886 vect_set_loop_condition (prolog, NULL, niters_prolog,
2887 step_prolog, NULL_TREE, false);
2889 /* Skip the prolog loop. */
2890 if (skip_prolog)
2892 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2893 niters_prolog, build_int_cst (type, 0));
2894 guard_bb = loop_preheader_edge (prolog)->src;
2895 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2896 guard_to = split_edge (loop_preheader_edge (loop));
2897 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2898 guard_to, guard_bb,
2899 prob_prolog.invert (),
2900 irred_flag);
2901 e = EDGE_PRED (guard_to, 0);
2902 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2903 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2905 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2906 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2909 /* Update init address of DRs. */
2910 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2911 /* Update niters for vector loop. */
2912 LOOP_VINFO_NITERS (loop_vinfo)
2913 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2914 LOOP_VINFO_NITERSM1 (loop_vinfo)
2915 = fold_build2 (MINUS_EXPR, type,
2916 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2917 bool new_var_p = false;
2918 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2919 /* It's guaranteed that vector loop bound before vectorization is at
2920 least VF, so set range information for newly generated var. */
2921 if (new_var_p)
2923 value_range vr (type,
2924 wi::to_wide (build_int_cst (type, vf)),
2925 wi::to_wide (TYPE_MAX_VALUE (type)));
2926 set_range_info (niters, vr);
2929 /* Prolog iterates at most bound_prolog times, latch iterates at
2930 most bound_prolog - 1 times. */
2931 record_niter_bound (prolog, bound_prolog - 1, false, true);
2932 delete_update_ssa ();
2933 adjust_vec_debug_stmts ();
2934 scev_reset ();
2937 if (epilog_peeling)
2939 e = single_exit (loop);
2940 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
2942 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2943 said epilog then we should use a copy of the main loop as a starting
2944 point. This loop may have already had some preliminary transformations
2945 to allow for more optimal vectorization, for example if-conversion.
2946 If we are not vectorizing the epilog then we should use the scalar loop
2947 as the transformations mentioned above make less or no sense when not
2948 vectorizing. */
2949 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2950 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2951 gcc_assert (epilog);
2953 epilog->force_vectorize = false;
2954 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2956       /* The scalar version of the loop may be preferred.  In this case, add a guard
2957 and skip to epilog. Note this only happens when the number of
2958 iterations of loop is unknown at compile time, otherwise this
2959 won't be vectorized. */
2960 if (skip_vector)
2962 /* Additional epilogue iteration is peeled if gap exists. */
2963 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2964 bound_prolog, bound_epilog,
2965 th, &bound_scalar,
2966 check_profitability);
2967 /* Build guard against NITERSM1 since NITERS may overflow. */
2968 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2969 guard_bb = anchor;
2970 guard_to = split_edge (loop_preheader_edge (epilog));
2971 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2972 guard_to, guard_bb,
2973 prob_vector.invert (),
2974 irred_flag);
2975 skip_e = guard_e;
2976 e = EDGE_PRED (guard_to, 0);
2977 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2978 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2980 /* Simply propagate profile info from guard_bb to guard_to which is
2981 a merge point of control flow. */
2982 guard_to->count = guard_bb->count;
2984 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
2985 if (vect_epilogues || scalar_loop == NULL)
2987 gcc_assert(epilog->num_nodes == loop->num_nodes);
2988 basic_block *bbs = get_loop_body (epilog);
2989 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2991 gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
2992 bbs[i]->count = original_counts[i];
2994 free (bbs);
2995 free (original_bbs);
2999 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
3000       /* If the loop is peeled a non-zero constant number of times, niters now
3001 	 refers to orig_niters - prolog_peeling; it won't overflow even if
3002 	 orig_niters overflows.  */
3003 niters_no_overflow |= (prolog_peeling > 0);
3004 vect_gen_vector_loop_niters (loop_vinfo, niters,
3005 niters_vector, step_vector,
3006 niters_no_overflow);
3007 if (!integer_onep (*step_vector))
3009 	  /* On exit from the loop we will have an easy way of calculating
3010 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3011 until then. */
3012 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3013 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3014 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3016 else
3017 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3018 &niters_vector_mult_vf);
3019 /* Update IVs of original loop as if they were advanced by
3020 niters_vector_mult_vf steps. */
3021 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3022 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3023 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3024 update_e);
3026 if (skip_epilog)
3028 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3029 niters, niters_vector_mult_vf);
3030 guard_bb = single_exit (loop)->dest;
3031 guard_to = split_edge (single_exit (epilog));
3032 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3033 skip_vector ? anchor : guard_bb,
3034 prob_epilog.invert (),
3035 irred_flag);
3036 if (vect_epilogues)
3037 epilogue_vinfo->skip_this_loop_edge = guard_e;
3038 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3039 single_exit (epilog));
3040 /* Only need to handle basic block before epilog loop if it's not
3041 the guard_bb, which is the case when skip_vector is true. */
3042 if (guard_bb != bb_before_epilog)
3044 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3046 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3048 scale_loop_profile (epilog, prob_epilog, 0);
3050 else
3051 slpeel_update_phi_nodes_for_lcssa (epilog);
3053 unsigned HOST_WIDE_INT bound;
3054 if (bound_scalar.is_constant (&bound))
3056 gcc_assert (bound != 0);
3057 /* -1 to convert loop iterations to latch iterations. */
3058 record_niter_bound (epilog, bound - 1, false, true);
3061 delete_update_ssa ();
3062 adjust_vec_debug_stmts ();
3063 scev_reset ();
3066 if (vect_epilogues)
3068 epilog->aux = epilogue_vinfo;
3069 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3071 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3073       /* We must now calculate the number of iterations (NITERS) performed by the
3074 	 previous loop and the number (EPILOGUE_NITERS) to be performed by the epilogue.  */
3075 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3076 niters_prolog, niters_vector_mult_vf);
3078       /* If skip_vector is set we may skip the previous loop; insert a phi node to
3079 	 determine whether we are coming from the previous vectorized loop
3080 	 via the update_e edge or from the skip_vector basic block via the
3081 	 skip_e edge.  */
3082 if (skip_vector)
3084 gcc_assert (update_e != NULL
3085 && skip_e != NULL
3086 && !vect_epilogues_updated_niters);
3087 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3088 update_e->dest);
3089 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3090 gimple *stmt = gimple_build_assign (new_ssa, niters);
3091 gimple_stmt_iterator gsi;
3092 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3093 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3095 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3096 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3098 else
3100 gsi = gsi_last_bb (update_e->src);
3101 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3104 niters = new_ssa;
3105 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3106 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3107 UNKNOWN_LOCATION);
3108 niters = PHI_RESULT (new_phi);
3109 epilogue_vinfo->main_loop_edge = update_e;
3110 epilogue_vinfo->skip_main_loop_edge = skip_e;
3113 /* Set ADVANCE to the number of iterations performed by the previous
3114 loop and its prologue. */
3115 *advance = niters;
3117 if (!vect_epilogues_updated_niters)
3119 /* Subtract the number of iterations performed by the vectorized loop
3120 from the number of total iterations. */
3121 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3122 before_loop_niters,
3123 niters);
3125 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3126 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3127 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3128 epilogue_niters,
3129 build_one_cst (TREE_TYPE (epilogue_niters)));
3131 /* Decide what to do if the number of epilogue iterations is not
3132 a multiple of the epilogue loop's vectorization factor.
3133 We should have rejected the loop during the analysis phase
3134 if this fails. */
3135 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3136 true))
3137 gcc_unreachable ();
3141 adjust_vec.release ();
3142 free_original_copy_tables ();
3144 return vect_epilogues ? epilog : NULL;
3147 /* Function vect_create_cond_for_niters_checks.
3149 Create a conditional expression that represents the run-time checks for
3150    the loop's niter.  The loop is guaranteed to terminate if the run-time
3151 checks hold.
3153 Input:
3154 COND_EXPR - input conditional expression. New conditions will be chained
3155                with logical AND operation.
3157    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3158                assumptions to be checked.
3160 Output:
3161 COND_EXPR - conditional expression.
3163 The returned COND_EXPR is the conditional expression to be used in the
3164 if statement that controls which version of the loop gets executed at
3165 runtime. */
3167 static void
3168 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3170 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3172 if (*cond_expr)
3173 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3174 *cond_expr, part_cond_expr);
3175 else
3176 *cond_expr = part_cond_expr;
3179 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3180 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3182 static void
3183 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3185 if (*cond_expr)
3186 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3187 *cond_expr, part_cond_expr);
3188 else
3189 *cond_expr = part_cond_expr;
3192 /* Function vect_create_cond_for_align_checks.
3194 Create a conditional expression that represents the alignment checks for
3195    all of the data references (array element references) whose alignment must be
3196 checked at runtime.
3198 Input:
3199 COND_EXPR - input conditional expression. New conditions will be chained
3200 with logical AND operation.
3201 LOOP_VINFO - two fields of the loop information are used.
3202 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3203 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3205 Output:
3206 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3207 expression.
3208 The returned value is the conditional expression to be used in the if
3209 statement that controls which version of the loop gets executed at runtime.
3211 The algorithm makes two assumptions:
3212 1) The number of bytes "n" in a vector is a power of 2.
3213 2) An address "a" is aligned if a%n is zero and that this
3214 test can be done as a&(n-1) == 0. For example, for 16
3215 byte vectors the test is a&0xf == 0. */
3217 static void
3218 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3219 tree *cond_expr,
3220 gimple_seq *cond_expr_stmt_list)
3222 const vec<stmt_vec_info> &may_misalign_stmts
3223 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3224 stmt_vec_info stmt_info;
3225 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3226 tree mask_cst;
3227 unsigned int i;
3228 tree int_ptrsize_type;
3229 char tmp_name[20];
3230 tree or_tmp_name = NULL_TREE;
3231 tree and_tmp_name;
3232 gimple *and_stmt;
3233 tree ptrsize_zero;
3234 tree part_cond_expr;
3236 /* Check that mask is one less than a power of 2, i.e., mask is
3237 all zeros followed by all ones. */
3238 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3240 int_ptrsize_type = signed_type_for (ptr_type_node);
3242 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3243 of the first vector of the i'th data reference. */
3245 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3247 gimple_seq new_stmt_list = NULL;
3248 tree addr_base;
3249 tree addr_tmp_name;
3250 tree new_or_tmp_name;
3251 gimple *addr_stmt, *or_stmt;
3252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3253 bool negative = tree_int_cst_compare
3254 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3255 tree offset = negative
3256 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3257 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3258 : size_zero_node;
3260 /* create: addr_tmp = (int)(address_of_first_vector) */
3261 addr_base =
3262 vect_create_addr_base_for_vector_ref (loop_vinfo,
3263 stmt_info, &new_stmt_list,
3264 offset);
3265 if (new_stmt_list != NULL)
3266 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3268 sprintf (tmp_name, "addr2int%d", i);
3269 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3270 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3271 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3273       /* The addresses are ORed together.  */
3275 if (or_tmp_name != NULL_TREE)
3277 /* create: or_tmp = or_tmp | addr_tmp */
3278 sprintf (tmp_name, "orptrs%d", i);
3279 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3280 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3281 or_tmp_name, addr_tmp_name);
3282 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3283 or_tmp_name = new_or_tmp_name;
3285 else
3286 or_tmp_name = addr_tmp_name;
3288 } /* end for i */
3290 mask_cst = build_int_cst (int_ptrsize_type, mask);
3292 /* create: and_tmp = or_tmp & mask */
3293 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3295 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3296 or_tmp_name, mask_cst);
3297 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3299 /* Make and_tmp the left operand of the conditional test against zero.
3300      If and_tmp has a nonzero bit then some address is unaligned.  */
3301 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3302 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3303 and_tmp_name, ptrsize_zero);
3304 chain_cond_expr (cond_expr, part_cond_expr);
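/* Illustrative sketch, not part of the vectorizer: the runtime condition
   built above, for two data references and 16-byte vectors (mask == 0xf).
   The vectorized version is entered only when the OR of all start addresses
   has no bits set under the mask.  */

static int
all_addrs_aligned_sketch (unsigned long addr1, unsigned long addr2,
			  unsigned long mask)
{
  return ((addr1 | addr2) & mask) == 0;
}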
3307 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3308 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3309 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3310 and this new condition are true. Treat a null *COND_EXPR as "true". */
3312 static void
3313 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3315 const vec<vec_object_pair> &pairs
3316 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3317 unsigned int i;
3318 vec_object_pair *pair;
3319 FOR_EACH_VEC_ELT (pairs, i, pair)
3321 tree addr1 = build_fold_addr_expr (pair->first);
3322 tree addr2 = build_fold_addr_expr (pair->second);
3323 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3324 addr1, addr2);
3325 chain_cond_expr (cond_expr, part_cond_expr);
3329 /* Create an expression that is true when all lower-bound conditions for
3330 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3332 static void
3333 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3335 const vec<vec_lower_bound> &lower_bounds
3336 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3337 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3339 tree expr = lower_bounds[i].expr;
3340 tree type = unsigned_type_for (TREE_TYPE (expr));
3341 expr = fold_convert (type, expr);
3342 poly_uint64 bound = lower_bounds[i].min_value;
3343 if (!lower_bounds[i].unsigned_p)
3345 expr = fold_build2 (PLUS_EXPR, type, expr,
3346 build_int_cstu (type, bound - 1));
3347 bound += bound - 1;
3349 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3350 build_int_cstu (type, bound));
3351 chain_cond_expr (cond_expr, part_cond_expr);
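/* Illustrative sketch of the signed (!unsigned_p) case above: with an
   8-bit value X and bound == 4, the code adds bound - 1 in the unsigned
   type and compares against 2 * bound - 1, i.e. at source level

     (unsigned char) (X + 3) >= 7

   which is true exactly when X <= -4 or X >= 4:
     X == 4  -> 7   >= 7 (true),   X == 3  -> 6 >= 7 (false),
     X == -4 -> 255 >= 7 (true),   X == -3 -> 0 >= 7 (false).
   The 8-bit type and the bound of 4 are example values only.  */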
3355 /* Function vect_create_cond_for_alias_checks.
3357 Create a conditional expression that represents the run-time checks for
3358 overlapping of address ranges represented by a list of data reference
3359 relations passed as input.
3361 Input:
3362 COND_EXPR - input conditional expression. New conditions will be chained
3363 with a logical AND operation; a null *COND_EXPR is treated
3364 as "true".
3365 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3366 to be checked.
3368 Output:
3369 COND_EXPR - conditional expression.
3371 The resulting COND_EXPR is the conditional expression to be used in the if
3372 statement that controls which version of the loop gets executed at runtime.
3375 void
3376 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3378 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3379 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3381 if (comp_alias_ddrs.is_empty ())
3382 return;
3384 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3385 &comp_alias_ddrs, cond_expr);
3386 if (dump_enabled_p ())
3387 dump_printf_loc (MSG_NOTE, vect_location,
3388 "created %u versioning for alias checks.\n",
3389 comp_alias_ddrs.length ());
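/* Illustrative sketch: for a single pair of data references a and b with
   segment lengths LEN_A and LEN_B, the conditions produced by
   create_runtime_alias_checks are typically of the form

     (uintptr_t) &a + LEN_A <= (uintptr_t) &b
     || (uintptr_t) &b + LEN_B <= (uintptr_t) &a

   i.e. the accessed segments do not overlap.  The exact shape, and any
   merging of adjacent checks, is decided in create_runtime_alias_checks;
   the names above are purely illustrative.  */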
3393 /* Function vect_loop_versioning.
3395 If the loop has data references that may or may not be aligned and/or
3396 has data reference relations whose independence was not proven, then
3397 two versions of the loop need to be generated, one which is vectorized
3398 and one which isn't. A test is then generated to control which of the
3399 loops is executed. The test checks for the alignment of all of the
3400 data references that may or may not be aligned. An additional
3401 sequence of runtime tests is generated for each pair of DDRs whose
3402 independence was not proven. The vectorized version of the loop is
3403 executed only if both the alias and alignment tests pass.
3405 The test generated to check which version of the loop is executed
3406 is modified to also check for profitability as indicated by the
3407 cost model threshold TH.
3409 The versioning precondition(s) are placed in *COND_EXPR and
3410 *COND_EXPR_STMT_LIST. */
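/* A minimal sketch of the control flow produced, assuming a single
   innermost loop and no if-conversion copy to re-use:

     if (<cost-model check> && <alignment/alias/niters/simd-if checks>)
       <loop to be vectorized>
     else
       <scalar copy of the loop>

   The scalar copy is the NLOOP returned below; the checks are collected
   into COND_EXPR and COND_EXPR_STMT_LIST by the helpers called below.  */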
3412 class loop *
3413 vect_loop_versioning (loop_vec_info loop_vinfo,
3414 gimple *loop_vectorized_call)
3416 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3417 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3418 basic_block condition_bb;
3419 gphi_iterator gsi;
3420 gimple_stmt_iterator cond_exp_gsi;
3421 basic_block merge_bb;
3422 basic_block new_exit_bb;
3423 edge new_exit_e, e;
3424 gphi *orig_phi, *new_phi;
3425 tree cond_expr = NULL_TREE;
3426 gimple_seq cond_expr_stmt_list = NULL;
3427 tree arg;
3428 profile_probability prob = profile_probability::likely ();
3429 gimple_seq gimplify_stmt_list = NULL;
3430 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3431 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3432 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3433 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3434 poly_uint64 versioning_threshold
3435 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3436 tree version_simd_if_cond
3437 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3438 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3440 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3441 && !ordered_p (th, versioning_threshold))
3442 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3443 build_int_cst (TREE_TYPE (scalar_loop_iters),
3444 th - 1));
3445 if (maybe_ne (versioning_threshold, 0U))
3447 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3448 build_int_cst (TREE_TYPE (scalar_loop_iters),
3449 versioning_threshold - 1));
3450 if (cond_expr)
3451 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3452 expr, cond_expr);
3453 else
3454 cond_expr = expr;
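/* Worked example of the checks built above: SCALAR_LOOP_ITERS is
   LOOP_VINFO_NITERSM1, the iteration count minus one, so with a
   cost-model threshold TH of, say, 4 the condition reads

     scalar_loop_iters >= 4 - 1

   which is true exactly when the loop runs at least 4 iterations.  The
   same minus-one adjustment is applied to the versioning threshold
   check when it is emitted.  */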
3457 tree cost_name = NULL_TREE;
3458 profile_probability prob2 = profile_probability::uninitialized ();
3459 if (cond_expr
3460 && EXPR_P (cond_expr)
3461 && (version_niter
3462 || version_align
3463 || version_alias
3464 || version_simd_if_cond))
3466 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3467 &cond_expr_stmt_list,
3468 is_gimple_val, NULL_TREE);
3469 /* Split prob in two so that the overall probability of passing
3470 both the cost-model and versioning checks is the original prob. */
3471 prob2 = prob.split (prob);
3474 if (version_niter)
3475 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3477 if (cond_expr)
3479 gimple_seq tem = NULL;
3480 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3481 &tem, is_gimple_condexpr_for_cond,
3482 NULL_TREE);
3483 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3486 if (version_align)
3487 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3488 &cond_expr_stmt_list);
3490 if (version_alias)
3492 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3493 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3494 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3497 if (version_simd_if_cond)
3499 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3500 if (flag_checking)
3501 if (basic_block bb
3502 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3503 gcc_assert (bb != loop->header
3504 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3505 && (scalar_loop == NULL
3506 || (bb != scalar_loop->header
3507 && dominated_by_p (CDI_DOMINATORS,
3508 scalar_loop->header, bb))));
3509 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3510 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3511 version_simd_if_cond, zero);
3512 if (cond_expr)
3513 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3514 c, cond_expr);
3515 else
3516 cond_expr = c;
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "created versioning for simd if condition check.\n");
3522 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3523 &gimplify_stmt_list,
3524 is_gimple_condexpr_for_cond, NULL_TREE);
3525 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3527 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3528 invariant in. */
3529 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3530 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3531 !gsi_end_p (gsi); gsi_next (&gsi))
3533 gimple *stmt = gsi_stmt (gsi);
3534 update_stmt (stmt);
3535 ssa_op_iter iter;
3536 use_operand_p use_p;
3537 basic_block def_bb;
3538 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3539 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3540 && flow_bb_inside_loop_p (outermost, def_bb))
3541 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3544 /* Search for the outermost loop we can version. Avoid versioning of
3545 non-perfect nests but allow if-conversion versioned loops inside. */
3546 class loop *loop_to_version = loop;
3547 if (flow_loop_nested_p (outermost, loop))
3549 if (dump_enabled_p ())
3550 dump_printf_loc (MSG_NOTE, vect_location,
3551 "trying to apply versioning to outer loop %d\n",
3552 outermost->num);
3553 if (outermost->num == 0)
3554 outermost = superloop_at_depth (loop, 1);
3555 /* And avoid applying versioning on non-perfect nests. */
3556 while (loop_to_version != outermost
3557 && (e = single_exit (loop_outer (loop_to_version)))
3558 && !(e->flags & EDGE_COMPLEX)
3559 && (!loop_outer (loop_to_version)->inner->next
3560 || vect_loop_vectorized_call (loop_to_version))
3561 && (!loop_outer (loop_to_version)->inner->next
3562 || !loop_outer (loop_to_version)->inner->next->next))
3563 loop_to_version = loop_outer (loop_to_version);
3566 /* Apply versioning. If there is already a scalar version created by
3567 if-conversion, re-use that. Note we cannot re-use the copy of
3568 an if-converted outer-loop when vectorizing the inner loop only. */
3569 gcond *cond;
3570 if ((!loop_to_version->inner || loop == loop_to_version)
3571 && loop_vectorized_call)
3573 gcc_assert (scalar_loop);
3574 condition_bb = gimple_bb (loop_vectorized_call);
3575 cond = as_a <gcond *> (last_stmt (condition_bb));
3576 gimple_cond_set_condition_from_tree (cond, cond_expr);
3577 update_stmt (cond);
3579 if (cond_expr_stmt_list)
3581 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3582 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3583 GSI_SAME_STMT);
3586 /* if-conversion uses profile_probability::always () for both paths,
3587 reset the probabilities of both paths appropriately. */
3588 edge te, fe;
3589 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3590 te->probability = prob;
3591 fe->probability = prob.invert ();
3592 /* We can scale loop counts immediately but have to postpone
3593 scaling the scalar loop because we re-use it during peeling. */
3594 scale_loop_frequencies (loop_to_version, te->probability);
3595 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3597 nloop = scalar_loop;
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "reusing %sloop version created by if conversion\n",
3601 loop_to_version != loop ? "outer " : "");
3603 else
3605 if (loop_to_version != loop
3606 && dump_enabled_p ())
3607 dump_printf_loc (MSG_NOTE, vect_location,
3608 "applying loop versioning to outer loop %d\n",
3609 loop_to_version->num);
3611 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3613 initialize_original_copy_tables ();
3614 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3615 prob, prob.invert (), prob, prob.invert (), true);
3616 gcc_assert (nloop);
3617 nloop = get_loop_copy (loop);
3619 /* For cycle vectorization with SLP we rely on the PHI arguments
3620 appearing in the same order as the SLP node operands, which for the
3621 loop PHI nodes means the preheader edge dest index needs to remain
3622 the same for the analyzed loop, which also becomes the vectorized one.
3623 If the state after versioning differs, make it so by redirecting
3624 the first edge into the header to the same destination, which moves
3625 it last. */
3626 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3628 edge e = EDGE_PRED (loop->header, 0);
3629 ssa_redirect_edge (e, e->dest);
3630 flush_pending_stmts (e);
3632 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3634 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3635 reap those otherwise, and they also refer to the original
3636 loops. */
3637 class loop *l = loop;
3638 while (gimple *call = vect_loop_vectorized_call (l))
3640 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3641 fold_loop_internal_call (call, boolean_false_node);
3642 l = loop_outer (l);
3644 free_original_copy_tables ();
3646 if (cond_expr_stmt_list)
3648 cond_exp_gsi = gsi_last_bb (condition_bb);
3649 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3650 GSI_SAME_STMT);
3653 /* Loop versioning violates an assumption we try to maintain during
3654 vectorization - that the loop exit block has a single predecessor.
3655 After versioning, the exit block of both loop versions is the same
3656 basic block (i.e. it has two predecessors). Just in order to simplify
3657 following transformations in the vectorizer, we fix this situation
3658 here by adding a new (empty) block on the exit-edge of the loop,
3659 with the proper loop-exit phis to maintain loop-closed-form.
3660 If loop versioning wasn't done from loop, but scalar_loop instead,
3661 merge_bb will already have just a single predecessor. */
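/* Sketch of the fix-up, with LOOP_TO_VERSION the loop that will be
   vectorized and NLOOP its scalar copy:

     before:  loop_to_version exit --\
                                      +--> merge_bb
              nloop exit ------------/

     after:   loop_to_version exit --> new_exit_bb --\
                                                      +--> merge_bb
              nloop exit ----------------------------/

   new_exit_bb receives the loop-closed PHIs for loop_to_version.  */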
3663 merge_bb = single_exit (loop_to_version)->dest;
3664 if (EDGE_COUNT (merge_bb->preds) >= 2)
3666 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3667 new_exit_bb = split_edge (single_exit (loop_to_version));
3668 new_exit_e = single_exit (loop_to_version);
3669 e = EDGE_SUCC (new_exit_bb, 0);
3671 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3672 gsi_next (&gsi))
3674 tree new_res;
3675 orig_phi = gsi.phi ();
3676 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3677 new_phi = create_phi_node (new_res, new_exit_bb);
3678 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3679 add_phi_arg (new_phi, arg, new_exit_e,
3680 gimple_phi_arg_location_from_edge (orig_phi, e));
3681 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3685 update_ssa (TODO_update_ssa_no_phi);
3688 /* Split the cost model check off to a separate BB. Costing assumes
3689 this is the only thing we perform when we enter the scalar loop
3690 from a failed cost decision. */
3691 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3693 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3694 gcc_assert (gimple_bb (def) == condition_bb);
3695 /* All uses of the cost check are 'true' after the check we
3696 are going to insert. */
3697 replace_uses_by (cost_name, boolean_true_node);
3698 /* And we're going to build the new single use of it. */
3699 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3700 NULL_TREE, NULL_TREE);
3701 edge e = split_block (gimple_bb (def), def);
3702 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3703 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3704 edge true_e, false_e;
3705 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3706 e->flags &= ~EDGE_FALLTHRU;
3707 e->flags |= EDGE_TRUE_VALUE;
3708 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3709 e->probability = prob2;
3710 e2->probability = prob2.invert ();
3711 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3712 auto_vec<basic_block, 3> adj;
3713 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3714 son;
3715 son = next_dom_son (CDI_DOMINATORS, son))
3716 if (EDGE_COUNT (son->preds) > 1)
3717 adj.safe_push (son);
3718 for (auto son : adj)
3719 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
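/* Sketch of the resulting layout (hypothetical block names):

     cost_bb:     if (cost_name != false)   [runtime profitability test]
                    goto version_bb;
                  else
                    goto scalar_path;
     version_bb:  if (<versioning checks>)  [cond_expr, with cost_name
                    goto vector_path;        folded to true]
                  else
                    goto scalar_path;

   so entering the scalar loop after a failed cost decision executes
   nothing but the cost check itself.  */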
3722 if (version_niter)
3724 /* The versioned loop could be infinite; we need to clear existing
3725 niter information, which is copied from the original loop. */
3726 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3727 vect_free_loop_info_assumptions (nloop);
3730 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3731 && dump_enabled_p ())
3733 if (version_alias)
3734 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3735 vect_location,
3736 "loop versioned for vectorization because of "
3737 "possible aliasing\n");
3738 if (version_align)
3739 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3740 vect_location,
3741 "loop versioned for vectorization to enhance "
3742 "alignment\n");
3746 return nloop;