Eliminate source_location in favor of location_t
[official-gcc.git] / gcc / tree-vect-loop-manip.c
blob: efaa22fc2d7a23a4e44d7b2ae24e27683f59ee56
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
51 /*************************************************************************
52 Simple Loop Peeling Utilities
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
58 /* Renames the use *OP_P. */
60 static void
61 rename_use_op (use_operand_p op_p)
63 tree new_name;
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
68 new_name = get_current_def (USE_FROM_PTR (op_p));
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
74 /* An ordinary ssa name defined in the loop. */
76 SET_USE (op_p, new_name);
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
95 if (rename_from_outer_loop)
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
109 FOR_EACH_EDGE (e, ei, bb->preds)
111 if (!flow_bb_inside_loop_p (loop, e->src))
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
117 if (outer_loop->inner->next)
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use using LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
135 struct adjust_info
137 tree from, to;
138 basic_block bb;
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
169 use_operand_p use_p;
170 basic_block bbuse;
172 if (!is_gimple_debug (stmt))
173 continue;
175 gcc_assert (gimple_debug_bind_p (stmt));
177 bbuse = gimple_bb (stmt);
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
196 /* Adjust debug stmts as scheduled before. */
198 static void
199 adjust_vec_debug_stmts (void)
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
204 gcc_assert (adjust_vec.exists ());
206 while (!adjust_vec.is_empty ())
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
221 adjust_info ai;
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
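/* In other words, the phi built above has the form

       MASK = PHI <INIT_MASK (preheader), NEXT_MASK (latch)>

   so MASK starts out as INIT_MASK and takes the value NEXT_MASK on
   each subsequent iteration.  */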
269 /* Add SEQ to the end of LOOP's preheader block. */
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
274 if (seq)
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
282 /* Add SEQ to the beginning of LOOP's header block. */
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
287 if (seq)
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
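/* Worked example (values chosen only for illustration): with 8-element
   vectors, OFFSET == 0 gives BASE == 0 and the encoded selector
   {0, 8, 1, 9, 2, 10, ...}, i.e. the low halves of the two inputs
   interleaved; OFFSET == 1 gives BASE == 4 and
   {4, 12, 5, 13, 6, 14, ...}, the high halves.  */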
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 ? VEC_UNPACK_HI_EXPR
339 : VEC_UNPACK_LO_EXPR);
340 gassign *stmt;
341 if (dest_masktype == unpack_masktype)
342 stmt = gimple_build_assign (dest, code, src);
343 else
345 tree temp = make_ssa_name (unpack_masktype);
346 stmt = gimple_build_assign (temp, code, src);
347 gimple_seq_add_stmt (seq, stmt);
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 build1 (VIEW_CONVERT_EXPR,
350 dest_masktype, temp));
352 gimple_seq_add_stmt (seq, stmt);
354 return true;
356 vec_perm_indices indices[2];
357 if (dest_masktype == src_masktype
358 && interleave_supported_p (&indices[0], src_masktype, 0)
359 && interleave_supported_p (&indices[1], src_masktype, 1))
361 /* The destination requires twice as many mask bits as the source, so
362 we can use interleaving permutes to double up the number of bits. */
363 tree masks[2];
364 for (unsigned int i = 0; i < 2; ++i)
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
368 tree src = src_rgm->masks[i / 2];
369 tree dest = dest_rgm->masks[i];
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 src, src, masks[i & 1]);
372 gimple_seq_add_stmt (seq, stmt);
374 return true;
376 return false;
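/* Sketch of the two strategies (illustrative): if SRC_RGM holds masks
   {s0, s1}, DEST_RGM's four masks become roughly

       { lo (s0), hi (s0), lo (s1), hi (s1) }

   either via VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR or via the two
   interleaving VEC_PERM_EXPRs (the lo/hi pairing flips on big-endian
   targets).  */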
379 /* Helper for vect_set_loop_condition_masked. Generate definitions for
380 all the masks in RGM and return a mask that is nonzero when the loop
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
382 Use LOOP_COND_GSI to insert code before the exit gcond.
384 RGM belongs to loop LOOP. The loop originally iterated NITERS
385 times and has been vectorized according to LOOP_VINFO. Each iteration
386 of the vectorized loop handles VF iterations of the scalar loop.
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
393 It is known that:
395 NITERS * RGM->max_nscalars_per_iter
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
400 VF * RGM->max_nscalars_per_iter
402 might overflow before hitting a value above:
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree vf,
414 tree niters, tree niters_skip,
415 bool might_wrap_p)
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418 tree mask_type = rgm->mask_type;
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
422 /* Calculate the maximum number of scalar values that the rgroup
423 handles in total, the number that it handles for each iteration
424 of the vector loop, and the number that it should skip during the
425 first iteration of the vector loop. */
426 tree nscalars_total = niters;
427 tree nscalars_step = vf;
428 tree nscalars_skip = niters_skip;
429 if (nscalars_per_iter != 1)
431 /* We checked before choosing to use a fully-masked loop that these
432 multiplications don't overflow. */
433 tree factor = build_int_cst (compare_type, nscalars_per_iter);
434 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435 nscalars_total, factor);
436 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 nscalars_step, factor);
438 if (nscalars_skip)
439 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440 nscalars_skip, factor);
443 /* Create an induction variable that counts the number of scalars
444 processed. */
445 tree index_before_incr, index_after_incr;
446 gimple_stmt_iterator incr_gsi;
447 bool insert_after;
448 tree zero_index = build_int_cst (compare_type, 0);
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451 insert_after, &index_before_incr, &index_after_incr);
453 tree test_index, test_limit, first_limit;
454 gimple_stmt_iterator *test_gsi;
455 if (might_wrap_p)
457 /* In principle the loop should stop iterating once the incremented
458 IV reaches a value greater than or equal to:
460 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
462 However, there's no guarantee that this addition doesn't overflow
463 the comparison type, or that the IV hits a value above it before
464 wrapping around. We therefore adjust the limit down by one
465 IV step:
467 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468 -[infinite-prec] NSCALARS_STEP
470 and compare the IV against this limit _before_ incrementing it.
471 Since the comparison type is unsigned, we actually want the
472 subtraction to saturate at zero:
474 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475 -[sat] NSCALARS_STEP
477 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
479 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
481 where the rightmost subtraction can be done directly in
482 COMPARE_TYPE. */
483 test_index = index_before_incr;
484 tree adjust = nscalars_step;
485 if (nscalars_skip)
486 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487 adjust, nscalars_skip);
488 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
489 nscalars_total, adjust);
490 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
491 test_limit, adjust);
492 test_gsi = &incr_gsi;
494 /* Get a safe limit for the first iteration. */
495 if (nscalars_skip)
497 /* The first vector iteration can handle at most NSCALARS_STEP
498 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
499 NSCALARS_SKIP to that cannot overflow. */
500 tree const_limit = build_int_cst (compare_type,
501 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502 * nscalars_per_iter);
503 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504 nscalars_total, const_limit);
505 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506 first_limit, nscalars_skip);
508 else
509 /* For the first iteration it doesn't matter whether the IV hits
510 a value above NSCALARS_TOTAL. That only matters for the latch
511 condition. */
512 first_limit = nscalars_total;
514 else
516 /* Test the incremented IV, which will always hit a value above
517 the bound before wrapping. */
518 test_index = index_after_incr;
519 test_limit = nscalars_total;
520 if (nscalars_skip)
521 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522 test_limit, nscalars_skip);
523 test_gsi = &loop_cond_gsi;
525 first_limit = test_limit;
528 /* Provide a definition of each mask in the group. */
529 tree next_mask = NULL_TREE;
530 tree mask;
531 unsigned int i;
532 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
534 /* Previous masks will cover BIAS scalars. This mask covers the
535 next batch. */
536 poly_uint64 bias = nscalars_per_mask * i;
537 tree bias_tree = build_int_cst (compare_type, bias);
538 gimple *tmp_stmt;
540 /* See whether the first iteration of the vector loop is known
541 to have a full mask. */
542 poly_uint64 const_limit;
543 bool first_iteration_full
544 = (poly_int_tree_p (first_limit, &const_limit)
545 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
547 /* Rather than have a new IV that starts at BIAS and goes up to
548 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549 and adjust the bound down by BIAS. */
550 tree this_test_limit = test_limit;
551 if (i != 0)
553 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554 compare_type, this_test_limit,
555 bias_tree);
556 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557 compare_type, this_test_limit,
558 bias_tree);
561 /* Create the initial mask. First include all scalars that
562 are within the loop limit. */
563 tree init_mask = NULL_TREE;
564 if (!first_iteration_full)
566 tree start, end;
567 if (first_limit == test_limit)
569 /* Use a natural test between zero (the initial IV value)
570 and the loop limit. The "else" block would be valid too,
571 but this choice can avoid the need to load BIAS_TREE into
572 a register. */
573 start = zero_index;
574 end = this_test_limit;
576 else
578 /* FIRST_LIMIT is the maximum number of scalars handled by the
579 first iteration of the vector loop. Test the portion
580 associated with this mask. */
581 start = bias_tree;
582 end = first_limit;
585 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586 tmp_stmt = vect_gen_while (init_mask, start, end);
587 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
590 /* Now AND out the bits that are within the number of skipped
591 scalars. */
592 poly_uint64 const_skip;
593 if (nscalars_skip
594 && !(poly_int_tree_p (nscalars_skip, &const_skip)
595 && known_le (const_skip, bias)))
597 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598 bias_tree, nscalars_skip);
599 if (init_mask)
600 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601 init_mask, unskipped_mask);
602 else
603 init_mask = unskipped_mask;
606 if (!init_mask)
607 /* First iteration is full. */
608 init_mask = build_minus_one_cst (mask_type);
610 /* Get the mask value for the next iteration of the loop. */
611 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
615 vect_set_loop_mask (loop, mask, init_mask, next_mask);
617 return next_mask;
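/* Worked example (values chosen only for illustration): take VF == 8,
   a single 8-element mask, NITERS == 14, no NITERS_SKIP and
   MIGHT_WRAP_P false.  The IV steps 0 -> 8 -> 16.  The first iteration
   is known to be full, so INIT_MASK is all-ones (scalars 0-7); the
   latch computes NEXT_MASK = WHILE_ULT (8, 14), enabling the 6
   remaining lanes (scalars 8-13); the loop then exits because
   WHILE_ULT (16, 14) is all-false.  */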
620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
622 number of iterations of the original scalar loop that should be
623 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
624 as for vect_set_loop_condition.
626 Insert the branch-back condition before LOOP_COND_GSI and return the
627 final gcond. */
629 static gcond *
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631 tree niters, tree final_iv,
632 bool niters_maybe_zero,
633 gimple_stmt_iterator loop_cond_gsi)
635 gimple_seq preheader_seq = NULL;
636 gimple_seq header_seq = NULL;
638 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639 unsigned int compare_precision = TYPE_PRECISION (compare_type);
640 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641 tree orig_niters = niters;
643 /* Type of the initial value of NITERS. */
644 tree ni_actual_type = TREE_TYPE (niters);
645 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
647 /* Convert NITERS to the same size as the compare. */
648 if (compare_precision > ni_actual_precision
649 && niters_maybe_zero)
651 /* We know that there is always at least one iteration, so if the
652 count is zero then it must have wrapped. Cope with this by
653 subtracting 1 before the conversion and adding 1 to the result. */
654 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656 niters, build_minus_one_cst (ni_actual_type));
657 niters = gimple_convert (&preheader_seq, compare_type, niters);
658 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659 niters, build_one_cst (compare_type));
661 else
662 niters = gimple_convert (&preheader_seq, compare_type, niters);
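  /* Worked example for the widening above (illustrative): if NITERS is
     an 8-bit 0, it stands for 256 iterations.  Computing 0 - 1 == 255
     in 8 bits, widening to the compare type and adding 1 back yields
     the intended 256, which would not fit in the original type.  */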
664 /* Convert skip_niters to the right type. */
665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
667 /* Now calculate the value that the induction variable must be able
668 to hit in order to ensure that we end the loop with an all-false mask.
669 This involves adding the maximum number of inactive trailing scalar
670 iterations. */
671 widest_int iv_limit;
672 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673 if (known_max_iters)
675 if (niters_skip)
677 /* Add the maximum number of skipped iterations to the
678 maximum iteration count. */
679 if (TREE_CODE (niters_skip) == INTEGER_CST)
680 iv_limit += wi::to_widest (niters_skip);
681 else
682 iv_limit += max_vf - 1;
684 /* IV_LIMIT is the maximum number of latch iterations, which is also
685 the maximum in-range IV value. Round this value down to the previous
686 vector alignment boundary and then add an extra full iteration. */
687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
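      /* Worked example (illustrative): with a constant VF of 16 and a
	 maximum latch count of 1000, IV_LIMIT becomes
	 (1000 & -16) + 16 == 992 + 16 == 1008.  */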
691 /* Get the vectorization factor in tree form. */
692 tree vf = build_int_cst (compare_type,
693 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
695 /* Iterate over all the rgroups and fill in their masks. We could use
696 the first mask from any rgroup for the loop condition; here we
697 arbitrarily pick the last. */
698 tree test_mask = NULL_TREE;
699 rgroup_masks *rgm;
700 unsigned int i;
701 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702 FOR_EACH_VEC_ELT (*masks, i, rgm)
703 if (!rgm->masks.is_empty ())
705 /* First try using permutes. This adds a single vector
706 instruction to the loop for each mask, but needs no extra
707 loop invariants or IVs. */
708 unsigned int nmasks = i + 1;
709 if ((nmasks & 1) == 0)
711 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712 if (!half_rgm->masks.is_empty ()
713 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714 continue;
717 /* See whether zero-based IV would ever generate all-false masks
718 before wrapping around. */
719 bool might_wrap_p
720 = (!known_max_iters
721 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722 UNSIGNED)
723 > compare_precision));
725 /* Set up all masks for this group. */
726 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727 &preheader_seq,
728 loop_cond_gsi, rgm, vf,
729 niters, niters_skip,
730 might_wrap_p);
733 /* Emit all accumulated statements. */
734 add_preheader_seq (loop, preheader_seq);
735 add_header_seq (loop, header_seq);
737 /* Get a boolean result that tells us whether to iterate. */
738 edge exit_edge = single_exit (loop);
739 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742 NULL_TREE, NULL_TREE);
743 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
745 /* The loop iterates (NITERS - 1) / VF + 1 times.
746 Subtract one from this to get the latch count. */
747 tree step = build_int_cst (compare_type,
748 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750 build_minus_one_cst (compare_type));
751 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752 niters_minus_one, step);
754 if (final_iv)
756 gassign *assign = gimple_build_assign (final_iv, orig_niters);
757 gsi_insert_on_edge_immediate (single_exit (loop), assign);
760 return cond_stmt;
763 /* Like vect_set_loop_condition, but handle the case in which there
764 are no loop masks. */
766 static gcond *
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768 tree step, tree final_iv,
769 bool niters_maybe_zero,
770 gimple_stmt_iterator loop_cond_gsi)
772 tree indx_before_incr, indx_after_incr;
773 gcond *cond_stmt;
774 gcond *orig_cond;
775 edge pe = loop_preheader_edge (loop);
776 edge exit_edge = single_exit (loop);
777 gimple_stmt_iterator incr_gsi;
778 bool insert_after;
779 enum tree_code code;
780 tree niters_type = TREE_TYPE (niters);
782 orig_cond = get_loop_exit_condition (loop);
783 gcc_assert (orig_cond);
784 loop_cond_gsi = gsi_for_stmt (orig_cond);
786 tree init, limit;
787 if (!niters_maybe_zero && integer_onep (step))
789 /* In this case we can use a simple 0-based IV:
792 x = 0;
796 x += 1;
798 while (x < NITERS); */
799 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800 init = build_zero_cst (niters_type);
801 limit = niters;
803 else
805 /* The following works for all values of NITERS except 0:
808 x = 0;
812 x += STEP;
814 while (x <= NITERS - STEP);
816 so that the loop continues to iterate if x + STEP - 1 < NITERS
817 but stops if x + STEP - 1 >= NITERS.
819 However, if NITERS is zero, x never hits a value above NITERS - STEP
820 before wrapping around. There are two obvious ways of dealing with
821 this:
823 - start at STEP - 1 and compare x before incrementing it
824 - start at -1 and compare x after incrementing it
826 The latter is simpler and is what we use. The loop in this case
827 looks like:
830 x = -1;
834 x += STEP;
836 while (x < NITERS - STEP);
838 In both cases the loop limit is NITERS - STEP. */
839 gimple_seq seq = NULL;
840 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842 if (seq)
844 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845 gcc_assert (!new_bb);
847 if (niters_maybe_zero)
849 /* Case C. */
850 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851 init = build_all_ones_cst (niters_type);
853 else
855 /* Case B. */
856 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857 init = build_zero_cst (niters_type);
861 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
862 create_iv (init, step, NULL_TREE, loop,
863 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
864 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
865 true, NULL_TREE, true,
866 GSI_SAME_STMT);
867 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
868 true, GSI_SAME_STMT);
870 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
871 NULL_TREE);
873 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
875 /* Record the number of latch iterations. */
876 if (limit == niters)
877 /* Case A: the loop iterates NITERS times. Subtract one to get the
878 latch count. */
879 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880 build_int_cst (niters_type, 1));
881 else
882 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883 Subtract one from this to get the latch count. */
884 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885 limit, step);
887 if (final_iv)
889 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890 indx_after_incr, init);
891 gsi_insert_on_edge_immediate (single_exit (loop), assign);
894 return cond_stmt;
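/* Worked example (values chosen only for illustration): for case B with
   NITERS == 10 and STEP == 4, LIMIT is 6 and the IV takes the values
   4 and 8, so the loop iterates (10 - 4) / 4 + 1 == 2 times and
   nb_iterations (the latch count) is 6 / 4 == 1.  */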
897 /* If we're using fully-masked loops, make LOOP iterate:
899 N == (NITERS - 1) / STEP + 1
901 times. When NITERS is zero, this is equivalent to making the loop
902 execute (1 << M) / STEP times, where M is the precision of NITERS.
903 NITERS_MAYBE_ZERO is true if this last case might occur.
905 If we're not using fully-masked loops, make LOOP iterate:
907 N == (NITERS - STEP) / STEP + 1
909 times, where NITERS is known to be outside the range [1, STEP - 1].
910 This is equivalent to making the loop execute NITERS / STEP times
911 when NITERS is nonzero and (1 << M) / STEP times otherwise.
912 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
914 If FINAL_IV is nonnull, it is an SSA name that should be set to
915 N * STEP on exit from the loop.
917 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
919 void
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921 tree niters, tree step, tree final_iv,
922 bool niters_maybe_zero)
924 gcond *cond_stmt;
925 gcond *orig_cond = get_loop_exit_condition (loop);
926 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
928 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930 final_iv, niters_maybe_zero,
931 loop_cond_gsi);
932 else
933 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934 final_iv, niters_maybe_zero,
935 loop_cond_gsi);
937 /* Remove old loop exit test. */
938 stmt_vec_info orig_cond_info;
939 if (loop_vinfo
940 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
941 loop_vinfo->remove_stmt (orig_cond_info);
942 else
943 gsi_remove (&loop_cond_gsi, true);
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
947 cond_stmt);
950 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
951 For all PHI arguments in FROM->dest and TO->dest from those
952 edges, ensure that each TO->dest PHI argument has its current_def
953 set to that of the corresponding FROM->dest PHI argument. */
955 static void
956 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
958 gimple_stmt_iterator gsi_from, gsi_to;
960 for (gsi_from = gsi_start_phis (from->dest),
961 gsi_to = gsi_start_phis (to->dest);
962 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
964 gimple *from_phi = gsi_stmt (gsi_from);
965 gimple *to_phi = gsi_stmt (gsi_to);
966 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
967 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
968 if (virtual_operand_p (from_arg))
970 gsi_next (&gsi_from);
971 continue;
973 if (virtual_operand_p (to_arg))
975 gsi_next (&gsi_to);
976 continue;
978 if (TREE_CODE (from_arg) != SSA_NAME)
979 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
980 else if (TREE_CODE (to_arg) == SSA_NAME
981 && from_arg != to_arg)
983 if (get_current_def (to_arg) == NULL_TREE)
985 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
986 TREE_TYPE (get_current_def
987 (from_arg))));
988 set_current_def (to_arg, get_current_def (from_arg));
991 gsi_next (&gsi_from);
992 gsi_next (&gsi_to);
995 gphi *from_phi = get_virtual_phi (from->dest);
996 gphi *to_phi = get_virtual_phi (to->dest);
997 if (from_phi)
998 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
999 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1003 /* Given LOOP this function generates a new copy of it and puts it
1004 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1005 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1006 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1007 entry or exit of LOOP. */
1009 struct loop *
1010 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1011 struct loop *scalar_loop, edge e)
1013 struct loop *new_loop;
1014 basic_block *new_bbs, *bbs, *pbbs;
1015 bool at_exit;
1016 bool was_imm_dom;
1017 basic_block exit_dest;
1018 edge exit, new_exit;
1019 bool duplicate_outer_loop = false;
1021 exit = single_exit (loop);
1022 at_exit = (e == exit);
1023 if (!at_exit && e != loop_preheader_edge (loop))
1024 return NULL;
1026 if (scalar_loop == NULL)
1027 scalar_loop = loop;
1029 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1030 pbbs = bbs + 1;
1031 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1032 /* Allow duplication of outer loops. */
1033 if (scalar_loop->inner)
1034 duplicate_outer_loop = true;
1035 /* Check whether duplication is possible. */
1036 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1038 free (bbs);
1039 return NULL;
1042 /* Generate new loop structure. */
1043 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1044 duplicate_subloops (scalar_loop, new_loop);
1046 exit_dest = exit->dest;
1047 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1048 exit_dest) == loop->header ?
1049 true : false);
1051 /* Also copy the pre-header, this avoids jumping through hoops to
1052 duplicate the loop entry PHI arguments. Create an empty
1053 pre-header unconditionally for this. */
1054 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1055 edge entry_e = single_pred_edge (preheader);
1056 bbs[0] = preheader;
1057 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1059 exit = single_exit (scalar_loop);
1060 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1061 &exit, 1, &new_exit, NULL,
1062 at_exit ? loop->latch : e->src, true);
1063 exit = single_exit (loop);
1064 basic_block new_preheader = new_bbs[0];
1066 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1068 if (scalar_loop != loop)
1070 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1071 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1072 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1073 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1074 header) to have current_def set, so copy them over. */
1075 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1076 exit);
1077 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1079 EDGE_SUCC (loop->latch, 0));
1082 if (at_exit) /* Add the loop copy at exit. */
1084 if (scalar_loop != loop)
1086 gphi_iterator gsi;
1087 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1089 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1090 gsi_next (&gsi))
1092 gphi *phi = gsi.phi ();
1093 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1094 location_t orig_locus
1095 = gimple_phi_arg_location_from_edge (phi, e);
1097 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1100 redirect_edge_and_branch_force (e, new_preheader);
1101 flush_pending_stmts (e);
1102 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1103 if (was_imm_dom || duplicate_outer_loop)
1104 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1106 /* And remove the non-necessary forwarder again. Keep the other
1107 one so we have a proper pre-header for the loop at the exit edge. */
1108 redirect_edge_pred (single_succ_edge (preheader),
1109 single_pred (preheader));
1110 delete_basic_block (preheader);
1111 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1112 loop_preheader_edge (scalar_loop)->src);
1114 else /* Add the copy at entry. */
1116 if (scalar_loop != loop)
1118 /* Remove the non-necessary forwarder of scalar_loop again. */
1119 redirect_edge_pred (single_succ_edge (preheader),
1120 single_pred (preheader));
1121 delete_basic_block (preheader);
1122 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1123 loop_preheader_edge (scalar_loop)->src);
1124 preheader = split_edge (loop_preheader_edge (loop));
1125 entry_e = single_pred_edge (preheader);
1128 redirect_edge_and_branch_force (entry_e, new_preheader);
1129 flush_pending_stmts (entry_e);
1130 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1132 redirect_edge_and_branch_force (new_exit, preheader);
1133 flush_pending_stmts (new_exit);
1134 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1136 /* And remove the non-necessary forwarder again. Keep the other
1137 one so we have a proper pre-header for the loop at the exit edge. */
1138 redirect_edge_pred (single_succ_edge (new_preheader),
1139 single_pred (new_preheader));
1140 delete_basic_block (new_preheader);
1141 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1142 loop_preheader_edge (new_loop)->src);
1145 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1146 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1147 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1149 if (scalar_loop != loop)
1151 /* Update new_loop->header PHIs, so that on the preheader
1152 edge they are the ones from loop rather than scalar_loop. */
1153 gphi_iterator gsi_orig, gsi_new;
1154 edge orig_e = loop_preheader_edge (loop);
1155 edge new_e = loop_preheader_edge (new_loop);
1157 for (gsi_orig = gsi_start_phis (loop->header),
1158 gsi_new = gsi_start_phis (new_loop->header);
1159 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1160 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1162 gphi *orig_phi = gsi_orig.phi ();
1163 gphi *new_phi = gsi_new.phi ();
1164 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1165 location_t orig_locus
1166 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1168 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1172 free (new_bbs);
1173 free (bbs);
1175 checking_verify_dominators (CDI_DOMINATORS);
1177 return new_loop;
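/* Rough shape of the result (illustrative): with E the exit edge, the
   CFG becomes

       preheader -> LOOP -> new_preheader -> NEW_LOOP -> exit_dest

   whereas with E the preheader edge the copy runs first:

       entry -> NEW_LOOP -> preheader -> LOOP -> exit_dest  */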
1181 /* Given the condition expression COND, put it as the last statement of
1182 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1183 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1184 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1185 new edge as irreducible if IRREDUCIBLE_P is true. */
1187 static edge
1188 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1189 basic_block guard_to, basic_block dom_bb,
1190 profile_probability probability, bool irreducible_p)
1192 gimple_stmt_iterator gsi;
1193 edge new_e, enter_e;
1194 gcond *cond_stmt;
1195 gimple_seq gimplify_stmt_list = NULL;
1197 enter_e = EDGE_SUCC (guard_bb, 0);
1198 enter_e->flags &= ~EDGE_FALLTHRU;
1199 enter_e->flags |= EDGE_FALSE_VALUE;
1200 gsi = gsi_last_bb (guard_bb);
1202 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1203 NULL_TREE);
1204 if (gimplify_stmt_list)
1205 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1207 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1208 gsi = gsi_last_bb (guard_bb);
1209 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1211 /* Add new edge to connect guard block to the merge/loop-exit block. */
1212 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1214 new_e->probability = probability;
1215 if (irreducible_p)
1216 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1218 enter_e->probability = probability.invert ();
1219 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1221 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1222 if (enter_e->dest->loop_father->header == enter_e->dest)
1223 split_edge (enter_e);
1225 return new_e;
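/* In other words, GUARD_BB now ends in

       if (COND) goto GUARD_TO;   <- new edge, probability PROBABILITY
       else fall through on ENTER_E

   and GUARD_TO is immediately dominated by DOM_BB.  */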
1229 /* This function verifies that the following restrictions apply to LOOP:
1230 (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1231 for an innermost loop, or 5 basic blocks for an outer loop
1232 (2) it is single entry, single exit
1233 (3) its exit condition is the last stmt in the header
1234 (4) E is the entry/exit edge of LOOP. */
1237 bool
1238 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1240 edge exit_e = single_exit (loop);
1241 edge entry_e = loop_preheader_edge (loop);
1242 gcond *orig_cond = get_loop_exit_condition (loop);
1243 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1244 unsigned int num_bb = loop->inner? 5 : 2;
1246 /* All loops have an outer scope; the only loop for which loop_outer
1247 is NULL is the function itself. */
1248 if (!loop_outer (loop)
1249 || loop->num_nodes != num_bb
1250 || !empty_block_p (loop->latch)
1251 || !single_exit (loop)
1252 /* Verify that new loop exit condition can be trivially modified. */
1253 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1254 || (e != exit_e && e != entry_e))
1255 return false;
1257 return true;
1260 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1261 in the exit bb and rename all the uses after the loop. This simplifies
1262 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1263 (but normally loop closed SSA form doesn't require virtual PHIs to be
1264 in the same form). Doing this early simplifies checking which
1265 uses should be renamed. */
1267 static void
1268 create_lcssa_for_virtual_phi (struct loop *loop)
1270 gphi_iterator gsi;
1271 edge exit_e = single_exit (loop);
1273 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1274 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1276 gphi *phi = gsi.phi ();
1277 for (gsi = gsi_start_phis (exit_e->dest);
1278 !gsi_end_p (gsi); gsi_next (&gsi))
1279 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1280 break;
1281 if (gsi_end_p (gsi))
1283 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1284 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1285 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1286 imm_use_iterator imm_iter;
1287 gimple *stmt;
1288 use_operand_p use_p;
1290 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1291 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1292 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1293 gimple_phi_set_result (new_phi, new_vop);
1294 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1295 if (stmt != new_phi
1296 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1297 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1298 SET_USE (use_p, new_vop);
1300 break;
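/* Sketch (illustrative names): given a header phi
   .MEM_1 = PHI <..., .MEM_2 (latch)>, the code above creates
   .MEM_3 = PHI <.MEM_2 (exit)> in the exit block and rewrites uses of
   .MEM_2 outside the loop to .MEM_3, giving the virtual operand the
   loop-closed form the guard routines expect.  */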
1305 /* Function find_loop_location.
1307 Extract the location of the loop in the source code.
1308 If the loop is not well formed for vectorization, an estimated
1309 location is calculated.
1310 Return the loop location on success and an unknown location otherwise. */
1312 dump_user_location_t
1313 find_loop_location (struct loop *loop)
1315 gimple *stmt = NULL;
1316 basic_block bb;
1317 gimple_stmt_iterator si;
1319 if (!loop)
1320 return dump_user_location_t ();
1322 stmt = get_loop_exit_condition (loop);
1324 if (stmt
1325 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1326 return stmt;
1328 /* If we got here the loop is probably not "well formed";
1329 try to estimate the loop location. */
1331 if (!loop->header)
1332 return dump_user_location_t ();
1334 bb = loop->header;
1336 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1338 stmt = gsi_stmt (si);
1339 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1340 return stmt;
1343 return dump_user_location_t ();
1346 /* Return true if the phi described by STMT_INFO defines an IV of the
1347 loop to be vectorized. */
1349 static bool
1350 iv_phi_p (stmt_vec_info stmt_info)
1352 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1353 if (virtual_operand_p (PHI_RESULT (phi)))
1354 return false;
1356 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1357 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1358 return false;
1360 return true;
1363 /* Function vect_can_advance_ivs_p
1365 In case the number of iterations that LOOP iterates is unknown at compile
1366 time, an epilog loop will be generated, and the loop induction variables
1367 (IVs) will be "advanced" to the value they are supposed to take just before
1368 the epilog loop. Here we check that the access function of the loop IVs
1369 and the expression that represents the loop bound are simple enough.
1370 These restrictions will be relaxed in the future. */
1372 bool
1373 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1375 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1376 basic_block bb = loop->header;
1377 gphi_iterator gsi;
1379 /* Analyze phi functions of the loop header. */
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1383 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1385 tree evolution_part;
1387 gphi *phi = gsi.phi ();
1388 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1391 phi_info->stmt);
1393 /* Skip virtual phi's. The data dependences that are associated with
1394 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1396 Skip reduction phis. */
1397 if (!iv_phi_p (phi_info))
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_NOTE, vect_location,
1401 "reduc or virtual phi. skip.\n");
1402 continue;
1405 /* Analyze the evolution function. */
1407 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1408 if (evolution_part == NULL_TREE)
1410 if (dump_enabled_p ())
1411 dump_printf (MSG_MISSED_OPTIMIZATION,
1412 "No access function or evolution.\n");
1413 return false;
1416 /* FORNOW: We do not transform initial conditions of IVs
1417 whose evolution functions are not invariant in the loop. */
1419 if (!expr_invariant_in_loop_p (loop, evolution_part))
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423 "evolution not invariant in loop.\n");
1424 return false;
1427 /* FORNOW: We do not transform initial conditions of IVs
1428 whose evolution functions are polynomials of degree >= 2. */
1430 if (tree_is_chrec (evolution_part))
1432 if (dump_enabled_p ())
1433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1434 "evolution is chrec.\n");
1435 return false;
1439 return true;
1443 /* Function vect_update_ivs_after_vectorizer.
1445 "Advance" the induction variables of LOOP to the value they should take
1446 after the execution of LOOP. This is currently necessary because the
1447 vectorizer does not handle induction variables that are used after the
1448 loop. Such a situation occurs when the last iterations of LOOP are
1449 peeled, because:
1450 1. We introduced new uses after LOOP for IVs that were not originally used
1451 after LOOP: the IVs of LOOP are now used by an epilog loop.
1452 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1453 times, whereas the loop IVs should be bumped N times.
1455 Input:
1456 - LOOP - a loop that is going to be vectorized. The last few iterations
1457 of LOOP were peeled.
1458 - NITERS - the number of iterations that LOOP executes (before it is
1459 vectorized), i.e. the number of times the ivs should be bumped.
1460 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1461 coming out from LOOP on which there are uses of the LOOP ivs
1462 (this is the path from LOOP->exit to epilog_loop->preheader).
1464 The new definitions of the ivs are placed in LOOP->exit.
1465 The phi args associated with the edge UPDATE_E in the bb
1466 UPDATE_E->dest are updated accordingly.
1468 Assumption 1: Like the rest of the vectorizer, this function assumes
1469 a single loop exit that has a single predecessor.
1471 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1472 organized in the same order.
1474 Assumption 3: The access function of the ivs is simple enough (see
1475 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1477 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1478 coming out of LOOP on which the ivs of LOOP are used (this is the path
1479 that leads to the epilog loop; other paths skip the epilog loop). This
1480 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1481 needs to have its phis updated.
1484 static void
1485 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1486 tree niters, edge update_e)
1488 gphi_iterator gsi, gsi1;
1489 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1490 basic_block update_bb = update_e->dest;
1491 basic_block exit_bb = single_exit (loop)->dest;
1493 /* Make sure there exists a single-predecessor exit bb: */
1494 gcc_assert (single_pred_p (exit_bb));
1495 gcc_assert (single_succ_edge (exit_bb) == update_e);
1497 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1498 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1499 gsi_next (&gsi), gsi_next (&gsi1))
1501 tree init_expr;
1502 tree step_expr, off;
1503 tree type;
1504 tree var, ni, ni_name;
1505 gimple_stmt_iterator last_gsi;
1507 gphi *phi = gsi.phi ();
1508 gphi *phi1 = gsi1.phi ();
1509 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1514 /* Skip reduction and virtual phis. */
1515 if (!iv_phi_p (phi_info))
1517 if (dump_enabled_p ())
1518 dump_printf_loc (MSG_NOTE, vect_location,
1519 "reduc or virtual phi. skip.\n");
1520 continue;
1523 type = TREE_TYPE (gimple_phi_result (phi));
1524 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1525 step_expr = unshare_expr (step_expr);
1527 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1528 of degree >= 2 or exponential. */
1529 gcc_assert (!tree_is_chrec (step_expr));
1531 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1533 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1534 fold_convert (TREE_TYPE (step_expr), niters),
1535 step_expr);
1536 if (POINTER_TYPE_P (type))
1537 ni = fold_build_pointer_plus (init_expr, off);
1538 else
1539 ni = fold_build2 (PLUS_EXPR, type,
1540 init_expr, fold_convert (type, off));
1542 var = create_tmp_var (type, "tmp");
1544 last_gsi = gsi_last_bb (exit_bb);
1545 gimple_seq new_stmts = NULL;
1546 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1547 /* Exit_bb shouldn't be empty. */
1548 if (!gsi_end_p (last_gsi))
1549 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1550 else
1551 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1553 /* Fix phi expressions in the successor bb. */
1554 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
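/* Worked example (values chosen only for illustration): an IV with
   initial value 3 and step 2, with NITERS == 10, yields
   NI = 3 + 10 * 2 == 23, the value the IV must hold when the epilog
   loop starts.  */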
1558 /* Return a gimple value containing the misalignment (measured in vector
1559 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1560 it is away from a perfectly aligned address. Add any new statements
1561 to SEQ. */
1563 static tree
1564 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1566 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1567 stmt_vec_info stmt_info = dr_info->stmt;
1568 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1570 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1571 unsigned HOST_WIDE_INT target_align_c;
1572 tree target_align_minus_1;
1574 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1575 size_zero_node) < 0;
1576 tree offset = (negative
1577 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1578 : size_zero_node);
1579 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1580 offset);
1581 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1582 if (target_align.is_constant (&target_align_c))
1583 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1584 else
1586 tree vla = build_int_cst (type, target_align);
1587 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1588 fold_build2 (MINUS_EXPR, type,
1589 build_int_cst (type, 0), vla));
1590 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1591 build_int_cst (type, 1));
1594 HOST_WIDE_INT elem_size
1595 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1596 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1598 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1599 tree int_start_addr = fold_convert (type, start_addr);
1600 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1601 target_align_minus_1);
1603 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1604 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1605 elem_size_log);
1607 return misalign_in_elems;
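/* Worked example (illustrative): with a 16-byte target alignment and
   4-byte elements, an address whose low bits are 0x8 gives
   misalign_in_bytes == 8 and misalign_in_elems == 8 >> 2 == 2.  */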
1610 /* Function vect_gen_prolog_loop_niters
1612 Generate the number of iterations which should be peeled as prolog for the
1613 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1614 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1615 As a result, after the execution of this loop, the data reference DR will
1616 refer to an aligned location. The following computation is generated:
1618 If the misalignment of DR is known at compile time:
1619 addr_mis = int mis = DR_MISALIGNMENT (dr);
1620 Else, compute address misalignment in bytes:
1621 addr_mis = addr & (target_align - 1)
1623 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1625 (elem_size = element type size; an element is the scalar element whose type
1626 is the inner type of the vectype)
1628 The computations will be emitted at the end of BB. We also compute and
1629 store upper bound (included) of the result in BOUND.
1631 When the step of the data-ref in the loop is not 1 (as in interleaved data
1632 and SLP), the number of iterations of the prolog must be divided by the step
1633 (which is equal to the size of the interleaved group).
1635 The above formulas assume that VF == number of elements in the vector. This
1636 may not hold when there are multiple-types in the loop.
1637 In this case, for some data-references in the loop the VF does not represent
1638 the number of elements that fit in the vector. Therefore, instead of VF we
1639 use TYPE_VECTOR_SUBPARTS. */
1641 static tree
1642 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1643 basic_block bb, int *bound)
1645 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1646 tree var;
1647 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1648 gimple_seq stmts = NULL, new_stmts = NULL;
1649 tree iters, iters_name;
1650 stmt_vec_info stmt_info = dr_info->stmt;
1651 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1652 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1654 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1656 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_NOTE, vect_location,
1660 "known peeling = %d.\n", npeel);
1662 iters = build_int_cst (niters_type, npeel);
1663 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1665 else
1667 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1668 tree type = TREE_TYPE (misalign_in_elems);
1669 HOST_WIDE_INT elem_size
1670 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1671 /* We only do prolog peeling if the target alignment is known at compile
1672 time. */
1673 poly_uint64 align_in_elems =
1674 exact_div (target_align, elem_size);
1675 tree align_in_elems_minus_1 =
1676 build_int_cst (type, align_in_elems - 1);
1677 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1679 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1680 & (align_in_elems - 1)). */
1681 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1682 size_zero_node) < 0;
1683 if (negative)
1684 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1685 align_in_elems_tree);
1686 else
1687 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1688 misalign_in_elems);
1689 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1690 iters = fold_convert (niters_type, iters);
1691 unsigned HOST_WIDE_INT align_in_elems_c;
1692 if (align_in_elems.is_constant (&align_in_elems_c))
1693 *bound = align_in_elems_c - 1;
1694 else
1695 *bound = -1;
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_NOTE, vect_location,
1700 "niters for prolog loop: %T\n", iters);
1702 var = create_tmp_var (niters_type, "prolog_loop_niters");
1703 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1705 if (new_stmts)
1706 gimple_seq_add_seq (&stmts, new_stmts);
1707 if (stmts)
1709 gcc_assert (single_succ_p (bb));
1710 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1711 if (gsi_end_p (gsi))
1712 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1713 else
1714 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1716 return iters_name;
1720 /* Function vect_update_init_of_dr
1722 If CODE is PLUS, the vector loop starts NITERS iterations after the
1723 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1724 iterations before the scalar one (using masking to skip inactive
1725 elements). This function updates the information recorded in DR to
1726 account for the difference. Specifically, it updates the OFFSET
1727 field of DR. */
1729 static void
1730 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1732 tree offset = DR_OFFSET (dr);
1734 niters = fold_build2 (MULT_EXPR, sizetype,
1735 fold_convert (sizetype, niters),
1736 fold_convert (sizetype, DR_STEP (dr)));
1737 offset = fold_build2 (code, sizetype,
1738 fold_convert (sizetype, offset), niters);
1739 DR_OFFSET (dr) = offset;
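/* For instance (hypothetical values): with CODE == PLUS_EXPR, NITERS = 3
   and DR_STEP = 4, the DR's OFFSET grows by 3 * 4 = 12 bytes, matching
   the three scalar iterations executed by the prolog before the vector
   loop starts.  */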
1743 /* Function vect_update_inits_of_drs
1745 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1746 CODE and NITERS are as for vect_update_init_of_dr. */
1748 static void
1749 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1750 tree_code code)
1752 unsigned int i;
1753 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1754 struct data_reference *dr;
1756 DUMP_VECT_SCOPE ("vect_update_inits_of_drs");
1758 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1759 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1761 gimple_seq seq;
1762 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1763 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1765 niters = fold_convert (sizetype, niters);
1766 niters = force_gimple_operand (niters, &seq, false, var);
1767 if (seq)
1769 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1770 gcc_assert (!new_bb);
1774 FOR_EACH_VEC_ELT (datarefs, i, dr)
1776 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1777 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1778 vect_update_init_of_dr (dr, niters, code);
1782 /* Using the information recorded in LOOP_VINFO, prepare the loop for
1783 peeling by masking. This involves calculating the number of iterations to
1784 be peeled and then aligning all memory references appropriately. */
1786 void
1787 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1789 tree misalign_in_elems;
1790 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1792 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1794 /* From the information recorded in LOOP_VINFO get the number of iterations
1795 that need to be skipped via masking. */
1796 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1798 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1799 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1800 misalign_in_elems = build_int_cst (type, misalign);
1802 else
1804 gimple_seq seq1 = NULL, seq2 = NULL;
1805 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1806 misalign_in_elems = fold_convert (type, misalign_in_elems);
1807 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1808 &seq2, true, NULL_TREE);
1809 gimple_seq_add_seq (&seq1, seq2);
1810 if (seq1)
1812 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1813 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1814 gcc_assert (!new_bb);
1818 if (dump_enabled_p ())
1819 dump_printf_loc (MSG_NOTE, vect_location,
1820 "misalignment for fully-masked loop: %T\n",
1821 misalign_in_elems);
1823 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1825 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
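/* E.g. (hypothetical values): with VF = 8 and a compile-time
   LOOP_VINFO_PEELING_FOR_ALIGNMENT of 3, the first, partial vector
   iteration must skip 8 - 3 = 5 inactive elements, so MASK_SKIP_NITERS
   is set to 5 and each DR's offset is moved back by 5 * DR_STEP bytes
   above.  */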
1828 /* This function builds ni_name = number of iterations. Statements
1829 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1830 it to TRUE if a new ssa_var is generated. */
1832 tree
1833 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1835 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1836 if (TREE_CODE (ni) == INTEGER_CST)
1837 return ni;
1838 else
1840 tree ni_name, var;
1841 gimple_seq stmts = NULL;
1842 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1844 var = create_tmp_var (TREE_TYPE (ni), "niters");
1845 ni_name = force_gimple_operand (ni, &stmts, false, var);
1846 if (stmts)
1848 gsi_insert_seq_on_edge_immediate (pe, stmts);
1849 if (new_var_p != NULL)
1850 *new_var_p = true;
1853 return ni_name;
1857 /* Calculate the number of iterations above which the vectorized loop is
1858 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1859 of the prolog loop. If it is an integer constant, its value is also passed
1860 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1861 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1862 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1863 threshold below which the scalar (rather than vectorized) loop will be
1864 executed. This function stores the upper bound (inclusive) of the result
1865 in BOUND_SCALAR. */
1867 static tree
1868 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1869 int bound_prolog, poly_int64 bound_epilog, int th,
1870 poly_uint64 *bound_scalar,
1871 bool check_profitability)
1873 tree type = TREE_TYPE (niters_prolog);
1874 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1875 build_int_cst (type, bound_epilog));
1877 *bound_scalar = bound_prolog + bound_epilog;
1878 if (check_profitability)
1880 /* TH indicates the minimum niters of the vectorized loop, while we
1881 compute the maximum niters of the scalar loop. */
1882 th--;
1883 /* Peeling a constant number of times. */
1884 if (int_niters_prolog >= 0)
1886 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1887 return build_int_cst (type, *bound_scalar);
1889 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1890 and BOUND_EPILOG are inclusive upper bounds. */
1891 if (known_ge (th, bound_prolog + bound_epilog))
1893 *bound_scalar = th;
1894 return build_int_cst (type, th);
1896 /* Need to do runtime comparison. */
1897 else if (maybe_gt (th, bound_epilog))
1899 *bound_scalar = upper_bound (*bound_scalar, th);
1900 return fold_build2 (MAX_EXPR, type,
1901 build_int_cst (type, th), niters);
1904 return niters;
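/* A worked profitability example (hypothetical values): with
   INT_NITERS_PROLOG = 2, BOUND_EPILOG = 3 and TH = 10, the scalar loop
   is chosen whenever niters <= 9, so the function returns 9 and sets
   *BOUND_SCALAR to 9.  */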
1907 /* NITERS is the number of times that the original scalar loop executes
1908 after peeling. Work out the maximum number of iterations N that can
1909 be handled by the vectorized form of the loop and then either:
1911 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1913 niters_vector = N
1915 b) set *STEP_VECTOR_PTR to one and generate:
1917 niters_vector = N / vf
1919 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1920 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1921 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1923 void
1924 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1925 tree *niters_vector_ptr, tree *step_vector_ptr,
1926 bool niters_no_overflow)
1928 tree ni_minus_gap, var;
1929 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1930 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1931 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1932 tree log_vf = NULL_TREE;
1934 /* If an epilogue loop is required because of data accesses with gaps, we
1935 subtract one iteration from the total number of iterations here for
1936 a correct calculation of RATIO. */
1937 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1939 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1940 build_one_cst (type));
1941 if (!is_gimple_val (ni_minus_gap))
1943 var = create_tmp_var (type, "ni_gap");
1944 gimple *stmts = NULL;
1945 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1946 true, var);
1947 gsi_insert_seq_on_edge_immediate (pe, stmts);
1950 else
1951 ni_minus_gap = niters;
1953 unsigned HOST_WIDE_INT const_vf;
1954 if (vf.is_constant (&const_vf)
1955 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1957 /* Create: niters >> log2(vf) */
1958 /* If it's known that niters == number of latch executions + 1 doesn't
1959 overflow, we can generate niters >> log2(vf); otherwise we generate
1960 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1961 will be at least one. */
1962 log_vf = build_int_cst (type, exact_log2 (const_vf));
1963 if (niters_no_overflow)
1964 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1965 else
1966 niters_vector
1967 = fold_build2 (PLUS_EXPR, type,
1968 fold_build2 (RSHIFT_EXPR, type,
1969 fold_build2 (MINUS_EXPR, type,
1970 ni_minus_gap,
1971 build_int_cst (type, vf)),
1972 log_vf),
1973 build_int_cst (type, 1));
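/* Sanity check with a hypothetical 32-bit unsigned NITERS and vf = 4:
   if niters wrapped around to 0 (2^32 scalar iterations), the
   expression above computes ((0 - 4) >> 2) + 1 = 0x40000000,
   i.e. 2^32 / 4, the correct vector iteration count, whereas the
   plain 0 >> 2 would give 0.  */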
1974 step_vector = build_one_cst (type);
1976 else
1978 niters_vector = ni_minus_gap;
1979 step_vector = build_int_cst (type, vf);
1982 if (!is_gimple_val (niters_vector))
1984 var = create_tmp_var (type, "bnd");
1985 gimple_seq stmts = NULL;
1986 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1987 gsi_insert_seq_on_edge_immediate (pe, stmts);
1988 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1989 we set range information to make niters analyzer's life easier. */
1990 if (stmts != NULL && log_vf)
1991 set_range_info (niters_vector, VR_RANGE,
1992 wi::to_wide (build_int_cst (type, 1)),
1993 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1994 TYPE_MAX_VALUE (type),
1995 log_vf)));
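/* E.g. with a hypothetical 32-bit unsigned type and vf = 4 this records
   the range [1, 0x3fffffff]: the vector loop runs at least once and at
   most TYPE_MAX >> log2(vf) times.  */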
1997 *niters_vector_ptr = niters_vector;
1998 *step_vector_ptr = step_vector;
2000 return;
2003 /* Given NITERS_VECTOR, the number of iterations of the vectorized
2004 loop specified by LOOP_VINFO, compute the number
2005 of iterations before vectorization (niters_vector * vf) and store it
2006 to NITERS_VECTOR_MULT_VF_PTR. */
2008 static void
2009 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2010 tree niters_vector,
2011 tree *niters_vector_mult_vf_ptr)
2013 /* We should be using a step_vector of VF if VF is variable. */
2014 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2015 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2016 tree type = TREE_TYPE (niters_vector);
2017 tree log_vf = build_int_cst (type, exact_log2 (vf));
2018 basic_block exit_bb = single_exit (loop)->dest;
2020 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2021 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2022 niters_vector, log_vf);
2023 if (!is_gimple_val (niters_vector_mult_vf))
2025 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2026 gimple_seq stmts = NULL;
2027 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2028 &stmts, true, var);
2029 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2030 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2032 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
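/* E.g. (hypothetical): with a constant vf of 4, this emits
   niters_vector << 2 in the exit block, recovering the number of scalar
   iterations handled by the vector loop.  */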
2035 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2036 from SECOND/FIRST and puts it at the original loop's preheader/exit
2037 edge; the two loops are arranged as below:
2039 preheader_a:
2040 first_loop:
2041 header_a:
2042 i_1 = PHI<i_0, i_2>;
2044 i_2 = i_1 + 1;
2045 if (cond_a)
2046 goto latch_a;
2047 else
2048 goto between_bb;
2049 latch_a:
2050 goto header_a;
2052 between_bb:
2053 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2055 second_loop:
2056 header_b:
2057 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2058 or with i_2 if no LCSSA phi is created
2059 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2061 i_4 = i_3 + 1;
2062 if (cond_b)
2063 goto latch_b;
2064 else
2065 goto exit_bb;
2066 latch_b:
2067 goto header_b;
2069 exit_bb:
2071 This function creates loop closed SSA for the first loop; it updates the
2072 second loop's PHI nodes by replacing the argument on the incoming edge with
2073 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2074 is false, loop closed ssa phis will only be created for non-iv phis for
2075 the first loop.
2077 This function assumes the exit bb of the first loop is the preheader bb of
2078 the second loop, i.e., between_bb in the example code. With PHIs updated,
2079 the second loop will execute the remaining iterations of the first. */
2081 static void
2082 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2083 struct loop *first, struct loop *second,
2084 bool create_lcssa_for_iv_phis)
2086 gphi_iterator gsi_update, gsi_orig;
2087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2089 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2090 edge second_preheader_e = loop_preheader_edge (second);
2091 basic_block between_bb = single_exit (first)->dest;
2093 gcc_assert (between_bb == second_preheader_e->src);
2094 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2095 /* Either the first loop or the second is the loop to be vectorized. */
2096 gcc_assert (loop == first || loop == second);
2098 for (gsi_orig = gsi_start_phis (first->header),
2099 gsi_update = gsi_start_phis (second->header);
2100 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2101 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2103 gphi *orig_phi = gsi_orig.phi ();
2104 gphi *update_phi = gsi_update.phi ();
2106 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2107 /* Generate lcssa PHI node for the first loop. */
2108 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2109 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2110 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2112 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2113 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2114 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2115 arg = new_res;
2118 /* Update PHI node in the second loop by replacing arg on the loop's
2119 incoming edge. */
2120 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2124 /* Function slpeel_add_loop_guard adds a guard skipping from the beginning
2125 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2126 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2127 appear as below:
2129 guard_bb:
2130 if (cond)
2131 goto merge_bb;
2132 else
2133 goto skip_loop;
2135 skip_loop:
2136 header_a:
2137 i_1 = PHI<i_0, i_2>;
2139 i_2 = i_1 + 1;
2140 if (cond_a)
2141 goto latch_a;
2142 else
2143 goto exit_a;
2144 latch_a:
2145 goto header_a;
2147 exit_a:
2148 i_5 = PHI<i_2>;
2150 merge_bb:
2151 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2153 update_loop:
2154 header_b:
2155 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2157 i_4 = i_3 + 1;
2158 if (cond_b)
2159 goto latch_b;
2160 else
2161 goto exit_bb;
2162 latch_b:
2163 goto header_b;
2165 exit_bb:
2167 This function creates PHI nodes at merge_bb and replaces the use of i_5
2168 in the update_loop's PHI node with the result of the new PHI node. */
2170 static void
2171 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2172 struct loop *update_loop,
2173 edge guard_edge, edge merge_edge)
2175 location_t merge_loc, guard_loc;
2176 edge orig_e = loop_preheader_edge (skip_loop);
2177 edge update_e = loop_preheader_edge (update_loop);
2178 gphi_iterator gsi_orig, gsi_update;
2180 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2181 gsi_update = gsi_start_phis (update_loop->header));
2182 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2183 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2185 gphi *orig_phi = gsi_orig.phi ();
2186 gphi *update_phi = gsi_update.phi ();
2188 /* Generate new phi node at merge bb of the guard. */
2189 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2190 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2192 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2193 args in NEW_PHI for these edges. */
2194 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2195 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2196 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2197 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2198 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2199 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2201 /* Update the corresponding arg of UPDATE_PHI. */
2202 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2206 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2207 This function searches for the corresponding lcssa phi node in the exit
2208 bb of LOOP. If it is found, return the phi result; otherwise return
2209 NULL. */
2211 static tree
2212 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2213 gphi *lcssa_phi)
2215 gphi_iterator gsi;
2216 edge e = single_exit (loop);
2218 gcc_assert (single_pred_p (e->dest));
2219 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2221 gphi *phi = gsi.phi ();
2222 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2223 PHI_ARG_DEF (lcssa_phi, 0), 0))
2224 return PHI_RESULT (phi);
2226 return NULL_TREE;
2229 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2230 from LOOP. Function slpeel_add_loop_guard adds a guard skipping from a
2231 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2232 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2233 The CFG looks like:
2235 loop:
2236 header_a:
2237 i_1 = PHI<i_0, i_2>;
2239 i_2 = i_1 + 1;
2240 if (cond_a)
2241 goto latch_a;
2242 else
2243 goto exit_a;
2244 latch_a:
2245 goto header_a;
2247 exit_a:
2249 guard_bb:
2250 if (cond)
2251 goto merge_bb;
2252 else
2253 goto epilog_loop;
2255 ;; fall_through_bb
2257 epilog_loop:
2258 header_b:
2259 i_3 = PHI<i_2, i_4>;
2261 i_4 = i_3 + 1;
2262 if (cond_b)
2263 goto latch_b;
2264 else
2265 goto merge_bb;
2266 latch_b:
2267 goto header_b;
2269 merge_bb:
2270 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2272 exit_bb:
2273 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2275 For each name used outside EPILOG (i.e. for each name that has an lcssa
2276 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2277 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2278 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2279 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2280 in exit_bb will also be updated. */
2282 static void
2283 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2284 edge guard_edge, edge merge_edge)
2286 gphi_iterator gsi;
2287 basic_block merge_bb = guard_edge->dest;
2289 gcc_assert (single_succ_p (merge_bb));
2290 edge e = single_succ_edge (merge_bb);
2291 basic_block exit_bb = e->dest;
2292 gcc_assert (single_pred_p (exit_bb));
2293 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2295 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2297 gphi *update_phi = gsi.phi ();
2298 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2299 /* This loop-closed-phi actually doesn't represent a use out of the
2300 loop - the phi arg is a constant. */
2301 if (TREE_CODE (old_arg) != SSA_NAME)
2302 continue;
2304 tree merge_arg = get_current_def (old_arg);
2305 if (!merge_arg)
2306 merge_arg = old_arg;
2308 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2309 /* If the var is live after loop but not a reduction, we simply
2310 use the old arg. */
2311 if (!guard_arg)
2312 guard_arg = old_arg;
2314 /* Create new phi node in MERGE_BB: */
2315 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2316 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2318 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2319 the two PHI args in merge_phi for these edges. */
2320 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2321 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2323 /* Update the original phi in exit_bb. */
2324 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2328 /* EPILOG loop is duplicated from the original loop for vectorizing;
2329 the arg of its loop closed ssa PHI needs to be updated. */
2331 static void
2332 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2334 gphi_iterator gsi;
2335 basic_block exit_bb = single_exit (epilog)->dest;
2337 gcc_assert (single_pred_p (exit_bb));
2338 edge e = EDGE_PRED (exit_bb, 0);
2339 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2340 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2343 /* Function vect_do_peeling.
2345 Input:
2346 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2348 preheader:
2349 LOOP:
2350 header_bb:
2351 loop_body
2352 if (exit_loop_cond) goto exit_bb
2353 else goto header_bb
2354 exit_bb:
2356 - NITERS: The number of iterations of the loop.
2357 - NITERSM1: The number of iterations of the loop's latch.
2358 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2359 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2360 CHECK_PROFITABILITY is true.
2361 Output:
2362 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2363 iterate after vectorization; see vect_set_loop_condition for details.
2364 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2365 should be set to the number of scalar iterations handled by the
2366 vector loop. The SSA name is only used on exit from the loop.
2368 This function peels the prolog and epilog from the loop and adds guards
2369 skipping PROLOG and EPILOG under various conditions. The changed CFG
2370 would look like:
2372 guard_bb_1:
2373 if (prefer_scalar_loop) goto merge_bb_1
2374 else goto guard_bb_2
2376 guard_bb_2:
2377 if (skip_prolog) goto merge_bb_2
2378 else goto prolog_preheader
2380 prolog_preheader:
2381 PROLOG:
2382 prolog_header_bb:
2383 prolog_body
2384 if (exit_prolog_cond) goto prolog_exit_bb
2385 else goto prolog_header_bb
2386 prolog_exit_bb:
2388 merge_bb_2:
2390 vector_preheader:
2391 VECTOR LOOP:
2392 vector_header_bb:
2393 vector_body
2394 if (exit_vector_cond) goto vector_exit_bb
2395 else goto vector_header_bb
2396 vector_exit_bb:
2398 guard_bb_3:
2399 if (skip_epilog) goto merge_bb_3
2400 else goto epilog_preheader
2402 merge_bb_1:
2404 epilog_preheader:
2405 EPILOG:
2406 epilog_header_bb:
2407 epilog_body
2408 if (exit_epilog_cond) goto merge_bb_3
2409 else goto epilog_header_bb
2411 merge_bb_3:
2413 Note this function peels the prolog and epilog, and adds the guards,
2414 only when necessary.
2415 Returns the created epilogue or NULL.
2417 TODO: Guard for prefer_scalar_loop should be emitted along with
2418 versioning conditions if loop versioning is needed. */
2421 struct loop *
2422 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2423 tree *niters_vector, tree *step_vector,
2424 tree *niters_vector_mult_vf_var, int th,
2425 bool check_profitability, bool niters_no_overflow)
2427 edge e, guard_e;
2428 tree type = TREE_TYPE (niters), guard_cond;
2429 basic_block guard_bb, guard_to;
2430 profile_probability prob_prolog, prob_vector, prob_epilog;
2431 int estimated_vf;
2432 int prolog_peeling = 0;
2433 /* We currently do not support prolog peeling if the target alignment is not
2434 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2435 target alignment being constant. */
2436 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2437 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2438 return NULL;
2440 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2441 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2443 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2444 poly_uint64 bound_epilog = 0;
2445 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2446 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2447 bound_epilog += vf - 1;
2448 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2449 bound_epilog += 1;
2450 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2451 poly_uint64 bound_scalar = bound_epilog;
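/* Worked example (hypothetical): for a constant VF of 4, with peeling
   for niters required and peeling for gaps active, bound_epilog is
   (4 - 1) + 1 = 4, so the epilog executes at most 4 scalar iterations
   and bound_scalar starts from that value.  */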
2453 if (!prolog_peeling && !epilog_peeling)
2454 return NULL;
2456 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2457 estimated_vf = vect_vf_for_cost (loop_vinfo);
2458 if (estimated_vf == 2)
2459 estimated_vf = 3;
2460 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2461 .apply_scale (estimated_vf - 1, estimated_vf);
2463 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2464 struct loop *first_loop = loop;
2465 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2466 create_lcssa_for_virtual_phi (loop);
2467 update_ssa (TODO_update_ssa_only_virtuals);
2469 if (MAY_HAVE_DEBUG_BIND_STMTS)
2471 gcc_assert (!adjust_vec.exists ());
2472 adjust_vec.create (32);
2474 initialize_original_copy_tables ();
2476 /* Record the anchor bb at which the guard should be placed if the scalar
2477 loop might be preferred. */
2478 basic_block anchor = loop_preheader_edge (loop)->src;
2480 /* Generate the number of iterations for the prolog loop. We do this here
2481 so that we can also get the upper bound on the number of iterations. */
2482 tree niters_prolog;
2483 int bound_prolog = 0;
2484 if (prolog_peeling)
2485 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2486 &bound_prolog);
2487 else
2488 niters_prolog = build_int_cst (type, 0);
2490 /* Prolog loop may be skipped. */
2491 bool skip_prolog = (prolog_peeling != 0);
2492 /* Skip to epilog if the scalar loop may be preferred. It's only needed
2493 when we peel for the epilog loop and when it hasn't been checked with
2494 loop versioning. */
2495 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2496 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2497 bound_prolog + bound_epilog)
2498 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2499 /* The epilog loop must be executed if the number of iterations for the
2500 epilog loop is known at compile time, otherwise we need to add a check
2501 at the end of the vector loop and skip to the end of the epilog loop. */
2502 bool skip_epilog = (prolog_peeling < 0
2503 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2504 || !vf.is_constant ());
2505 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2506 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2507 skip_epilog = false;
2509 if (skip_vector)
2511 split_edge (loop_preheader_edge (loop));
2513 /* Due to the order in which we peel prolog and epilog, we first
2514 propagate probability to the whole loop. The purpose is to
2515 avoid adjusting probabilities of both prolog and vector loops
2516 separately. Note in this case, the probability of epilog loop
2517 needs to be scaled back later. */
2518 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2519 if (prob_vector.initialized_p ())
2521 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2522 scale_loop_profile (loop, prob_vector, 0);
2526 dump_user_location_t loop_loc = find_loop_location (loop);
2527 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2528 if (prolog_peeling)
2530 e = loop_preheader_edge (loop);
2531 if (!slpeel_can_duplicate_loop_p (loop, e))
2533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2534 "loop can't be duplicated to preheader edge.\n");
2535 gcc_unreachable ();
2537 /* Peel prolog and put it on preheader edge of loop. */
2538 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2539 if (!prolog)
2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2542 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2543 gcc_unreachable ();
2545 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2546 first_loop = prolog;
2547 reset_original_copy_tables ();
2549 /* Update the number of iterations for prolog loop. */
2550 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2551 vect_set_loop_condition (prolog, NULL, niters_prolog,
2552 step_prolog, NULL_TREE, false);
2554 /* Skip the prolog loop. */
2555 if (skip_prolog)
2557 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2558 niters_prolog, build_int_cst (type, 0));
2559 guard_bb = loop_preheader_edge (prolog)->src;
2560 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2561 guard_to = split_edge (loop_preheader_edge (loop));
2562 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2563 guard_to, guard_bb,
2564 prob_prolog.invert (),
2565 irred_flag);
2566 e = EDGE_PRED (guard_to, 0);
2567 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2568 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2570 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2571 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2573 /* Update init address of DRs. */
2574 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2575 /* Update niters for vector loop. */
2576 LOOP_VINFO_NITERS (loop_vinfo)
2577 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2578 LOOP_VINFO_NITERSM1 (loop_vinfo)
2579 = fold_build2 (MINUS_EXPR, type,
2580 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2581 bool new_var_p = false;
2582 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2583 /* It's guaranteed that the vector loop bound before vectorization is at
2584 least VF, so set range information for the newly generated var. */
2585 if (new_var_p)
2586 set_range_info (niters, VR_RANGE,
2587 wi::to_wide (build_int_cst (type, vf)),
2588 wi::to_wide (TYPE_MAX_VALUE (type)));
2590 /* Prolog iterates at most bound_prolog times, latch iterates at
2591 most bound_prolog - 1 times. */
2592 record_niter_bound (prolog, bound_prolog - 1, false, true);
2593 delete_update_ssa ();
2594 adjust_vec_debug_stmts ();
2595 scev_reset ();
2598 if (epilog_peeling)
2600 e = single_exit (loop);
2601 if (!slpeel_can_duplicate_loop_p (loop, e))
2603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2604 "loop can't be duplicated to exit edge.\n");
2605 gcc_unreachable ();
2607 /* Peel epilog and put it on exit edge of loop. */
2608 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2609 if (!epilog)
2611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2612 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2613 gcc_unreachable ();
2615 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2617 /* The scalar version of the loop may be preferred. In this case, add
2618 a guard and skip to the epilog. Note this only happens when the number
2619 of iterations of the loop is unknown at compile time, otherwise this
2620 won't be vectorized. */
2621 if (skip_vector)
2623 /* An additional epilogue iteration is peeled if a gap exists. */
2624 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2625 bound_prolog, bound_epilog,
2626 th, &bound_scalar,
2627 check_profitability);
2628 /* Build guard against NITERSM1 since NITERS may overflow. */
2629 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2630 guard_bb = anchor;
2631 guard_to = split_edge (loop_preheader_edge (epilog));
2632 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2633 guard_to, guard_bb,
2634 prob_vector.invert (),
2635 irred_flag);
2636 e = EDGE_PRED (guard_to, 0);
2637 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2638 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2640 /* Simply propagate profile info from guard_bb to guard_to which is
2641 a merge point of control flow. */
2642 guard_to->count = guard_bb->count;
2644 /* Scale probability of epilog loop back.
2645 FIXME: We should avoid scaling down and back up. Profile may
2646 get lost if we scale down to 0. */
2647 basic_block *bbs = get_loop_body (epilog);
2648 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2649 bbs[i]->count = bbs[i]->count.apply_scale
2650 (bbs[i]->count,
2651 bbs[i]->count.apply_probability
2652 (prob_vector));
2653 free (bbs);
2656 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2657 tree niters_vector_mult_vf;
2658 /* If the loop is peeled a non-zero constant number of times, niters now
2659 refers to orig_niters - prolog_peeling; it won't overflow even if the
2660 orig_niters overflows. */
2661 niters_no_overflow |= (prolog_peeling > 0);
2662 vect_gen_vector_loop_niters (loop_vinfo, niters,
2663 niters_vector, step_vector,
2664 niters_no_overflow);
2665 if (!integer_onep (*step_vector))
2667 /* On exit from the loop we will have an easy way of calculating
2668 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2669 until then. */
2670 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2671 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2672 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2674 else
2675 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2676 &niters_vector_mult_vf);
2677 /* Update IVs of original loop as if they were advanced by
2678 niters_vector_mult_vf steps. */
2679 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2680 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2681 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2682 update_e);
2684 if (skip_epilog)
2686 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2687 niters, niters_vector_mult_vf);
2688 guard_bb = single_exit (loop)->dest;
2689 guard_to = split_edge (single_exit (epilog));
2690 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2691 skip_vector ? anchor : guard_bb,
2692 prob_epilog.invert (),
2693 irred_flag);
2694 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2695 single_exit (epilog));
2696 /* Only need to handle basic block before epilog loop if it's not
2697 the guard_bb, which is the case when skip_vector is true. */
2698 if (guard_bb != bb_before_epilog)
2700 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2702 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2704 scale_loop_profile (epilog, prob_epilog, 0);
2706 else
2707 slpeel_update_phi_nodes_for_lcssa (epilog);
2709 unsigned HOST_WIDE_INT bound;
2710 if (bound_scalar.is_constant (&bound))
2712 gcc_assert (bound != 0);
2713 /* -1 to convert loop iterations to latch iterations. */
2714 record_niter_bound (epilog, bound - 1, false, true);
2717 delete_update_ssa ();
2718 adjust_vec_debug_stmts ();
2719 scev_reset ();
2721 adjust_vec.release ();
2722 free_original_copy_tables ();
2724 return epilog;
2727 /* Function vect_create_cond_for_niters_checks.
2729 Create a conditional expression that represents the run-time checks for
2730 the loop's niters. The loop is guaranteed to terminate if the run-time
2731 checks hold.
2733 Input:
2734 COND_EXPR - input conditional expression. The niters check will be
2735 chained with it using a logical AND operation. If it is NULL,
2736 the niters check becomes the condition on its own.
2737 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the
2738 assumptions to be checked.
2740 Output:
2741 COND_EXPR - conditional expression.
2743 The returned COND_EXPR is the conditional expression to be used in the
2744 if statement that controls which version of the loop gets executed at
2745 runtime. */
2747 static void
2748 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2750 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2752 if (*cond_expr)
2753 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2754 *cond_expr, part_cond_expr);
2755 else
2756 *cond_expr = part_cond_expr;
2759 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2760 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2762 static void
2763 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2765 if (*cond_expr)
2766 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2767 *cond_expr, part_cond_expr);
2768 else
2769 *cond_expr = part_cond_expr;
2772 /* Function vect_create_cond_for_align_checks.
2774 Create a conditional expression that represents the alignment checks for
2775 all of data references (array element references) whose alignment must be
2776 checked at runtime.
2778 Input:
2779 COND_EXPR - input conditional expression. New conditions will be chained
2780 with logical AND operation.
2781 LOOP_VINFO - two fields of the loop information are used.
2782 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2783 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2785 Output:
2786 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2787 expression.
2788 The returned value is the conditional expression to be used in the if
2789 statement that controls which version of the loop gets executed at runtime.
2791 The algorithm makes two assumptions:
2792 1) The number of bytes "n" in a vector is a power of 2.
2793 2) An address "a" is aligned if a%n is zero, so the
2794 test can be done as a&(n-1) == 0. For example, for 16
2795 byte vectors the test is a&0xf == 0. */
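/* As an illustration only (temporary names follow the sprintf patterns
   used below; the address operands are placeholders), for two data
   references and 16-byte vectors the emitted checks are roughly:

     addr2int0 = (ptrsize int) address_of_first_vector_of_dr_1;
     addr2int1 = (ptrsize int) address_of_first_vector_of_dr_2;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     ... && (andmask == 0)  */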
2797 static void
2798 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2799 tree *cond_expr,
2800 gimple_seq *cond_expr_stmt_list)
2802 vec<stmt_vec_info> may_misalign_stmts
2803 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2804 stmt_vec_info stmt_info;
2805 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2806 tree mask_cst;
2807 unsigned int i;
2808 tree int_ptrsize_type;
2809 char tmp_name[20];
2810 tree or_tmp_name = NULL_TREE;
2811 tree and_tmp_name;
2812 gimple *and_stmt;
2813 tree ptrsize_zero;
2814 tree part_cond_expr;
2816 /* Check that mask is one less than a power of 2, i.e., mask is
2817 all zeros followed by all ones. */
2818 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2820 int_ptrsize_type = signed_type_for (ptr_type_node);
2822 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2823 of the first vector of the i'th data reference. */
2825 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2827 gimple_seq new_stmt_list = NULL;
2828 tree addr_base;
2829 tree addr_tmp_name;
2830 tree new_or_tmp_name;
2831 gimple *addr_stmt, *or_stmt;
2832 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2833 bool negative = tree_int_cst_compare
2834 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2835 tree offset = negative
2836 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2838 /* create: addr_tmp = (int)(address_of_first_vector) */
2839 addr_base =
2840 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2841 offset);
2842 if (new_stmt_list != NULL)
2843 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2845 sprintf (tmp_name, "addr2int%d", i);
2846 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2847 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2848 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2850 /* The addresses are ORed together. */
2852 if (or_tmp_name != NULL_TREE)
2854 /* create: or_tmp = or_tmp | addr_tmp */
2855 sprintf (tmp_name, "orptrs%d", i);
2856 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2857 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2858 or_tmp_name, addr_tmp_name);
2859 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2860 or_tmp_name = new_or_tmp_name;
2862 else
2863 or_tmp_name = addr_tmp_name;
2865 } /* end for i */
2867 mask_cst = build_int_cst (int_ptrsize_type, mask);
2869 /* create: and_tmp = or_tmp & mask */
2870 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2872 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2873 or_tmp_name, mask_cst);
2874 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2876 /* Make and_tmp the left operand of the conditional test against zero.
2877 If and_tmp has a nonzero bit then some address is unaligned. */
2878 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2879 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2880 and_tmp_name, ptrsize_zero);
2881 chain_cond_expr (cond_expr, part_cond_expr);
2884 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2885 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2886 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2887 and this new condition are true. Treat a null *COND_EXPR as "true". */
2889 static void
2890 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2892 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2893 unsigned int i;
2894 vec_object_pair *pair;
2895 FOR_EACH_VEC_ELT (pairs, i, pair)
2897 tree addr1 = build_fold_addr_expr (pair->first);
2898 tree addr2 = build_fold_addr_expr (pair->second);
2899 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2900 addr1, addr2);
2901 chain_cond_expr (cond_expr, part_cond_expr);
2905 /* Create an expression that is true when all lower-bound conditions for
2906 the vectorized loop are met. Chain this condition with *COND_EXPR. */
2908 static void
2909 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2911 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2912 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2914 tree expr = lower_bounds[i].expr;
2915 tree type = unsigned_type_for (TREE_TYPE (expr));
2916 expr = fold_convert (type, expr);
2917 poly_uint64 bound = lower_bounds[i].min_value;
2918 if (!lower_bounds[i].unsigned_p)
2920 expr = fold_build2 (PLUS_EXPR, type, expr,
2921 build_int_cstu (type, bound - 1));
2922 bound += bound - 1;
2924 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2925 build_int_cstu (type, bound));
2926 chain_cond_expr (cond_expr, part_cond_expr);
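/* Worked example (hypothetical): for a signed EXPR whose magnitude must
   be at least 4 (min_value = 4, !unsigned_p), the loop above emits

     (unsigned type) EXPR + 3 >= 7

   which, thanks to the unsigned wrap-around, holds exactly when
   EXPR >= 4 or EXPR <= -4.  */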
2930 /* Function vect_create_cond_for_alias_checks.
2932 Create a conditional expression that represents the run-time checks for
2933 overlapping of address ranges represented by a list of data references
2934 relations passed as input.
2936 Input:
2937 COND_EXPR - input conditional expression. New conditions will be chained
2938 with logical AND operation. If it is NULL, then the function
2939 is used to return the number of alias checks.
2940 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2941 to be checked.
2943 Output:
2944 COND_EXPR - conditional expression.
2946 The returned COND_EXPR is the conditional expression to be used in the if
2947 statement that controls which version of the loop gets executed at runtime.
2950 void
2951 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2953 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2954 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2956 if (comp_alias_ddrs.is_empty ())
2957 return;
2959 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2960 &comp_alias_ddrs, cond_expr);
2961 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "created %u versioning for alias checks.\n",
2964 comp_alias_ddrs.length ());
2968 /* Function vect_loop_versioning.
2970 If the loop has data references that may or may not be aligned and/or
2971 has data reference relations whose independence was not proven, then
2972 two versions of the loop need to be generated, one which is vectorized
2973 and one which isn't. A test is then generated to control which of the
2974 loops is executed. The test checks for the alignment of all of the
2975 data references that may or may not be aligned. An additional
2976 sequence of runtime tests is generated for each pair of DDRs whose
2977 independence was not proven. The vectorized version of the loop is
2978 executed only if both the alias and alignment tests pass.
2980 The test generated to check which version of the loop is executed
2981 is modified to also check for profitability as indicated by the
2982 cost model threshold TH.
2984 The versioning precondition(s) are placed in *COND_EXPR and
2985 *COND_EXPR_STMT_LIST. */
2987 void
2988 vect_loop_versioning (loop_vec_info loop_vinfo,
2989 unsigned int th, bool check_profitability,
2990 poly_uint64 versioning_threshold)
2992 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2993 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2994 basic_block condition_bb;
2995 gphi_iterator gsi;
2996 gimple_stmt_iterator cond_exp_gsi;
2997 basic_block merge_bb;
2998 basic_block new_exit_bb;
2999 edge new_exit_e, e;
3000 gphi *orig_phi, *new_phi;
3001 tree cond_expr = NULL_TREE;
3002 gimple_seq cond_expr_stmt_list = NULL;
3003 tree arg;
3004 profile_probability prob = profile_probability::likely ();
3005 gimple_seq gimplify_stmt_list = NULL;
3006 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3007 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3008 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3009 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3011 if (check_profitability)
3012 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3013 build_int_cst (TREE_TYPE (scalar_loop_iters),
3014 th - 1));
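/* SCALAR_LOOP_ITERS is NITERSM1, so comparing it against TH - 1 tests
   niters >= TH; e.g. (hypothetical) with TH = 8 the vectorized version
   is entered only when the loop runs at least 8 scalar iterations.  */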
3015 if (maybe_ne (versioning_threshold, 0U))
3017 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3018 build_int_cst (TREE_TYPE (scalar_loop_iters),
3019 versioning_threshold - 1));
3020 if (cond_expr)
3021 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3022 expr, cond_expr);
3023 else
3024 cond_expr = expr;
3027 if (version_niter)
3028 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3030 if (cond_expr)
3031 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3032 is_gimple_condexpr, NULL_TREE);
3034 if (version_align)
3035 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3036 &cond_expr_stmt_list);
3038 if (version_alias)
3040 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3041 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3042 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3045 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3046 &gimplify_stmt_list,
3047 is_gimple_condexpr, NULL_TREE);
3048 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3050 initialize_original_copy_tables ();
3051 if (scalar_loop)
3053 edge scalar_e;
3054 basic_block preheader, scalar_preheader;
3056 /* We don't want to scale SCALAR_LOOP's frequencies; we need to
3057 scale LOOP's frequencies instead. */
3058 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3059 prob, prob.invert (), prob, prob.invert (), true);
3060 scale_loop_frequencies (loop, prob);
3061 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3062 while we need to move it above LOOP's preheader. */
3063 e = loop_preheader_edge (loop);
3064 scalar_e = loop_preheader_edge (scalar_loop);
3065 /* The vector loop preheader might not be empty, since new
3066 invariants could have been created while analyzing the loop. */
3067 gcc_assert (single_pred_p (e->src));
3068 gcc_assert (empty_block_p (scalar_e->src)
3069 && single_pred_p (scalar_e->src));
3070 gcc_assert (single_pred_p (condition_bb));
3071 preheader = e->src;
3072 scalar_preheader = scalar_e->src;
3073 scalar_e = find_edge (condition_bb, scalar_preheader);
3074 e = single_pred_edge (preheader);
3075 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3076 scalar_preheader);
3077 redirect_edge_and_branch_force (scalar_e, preheader);
3078 redirect_edge_and_branch_force (e, condition_bb);
3079 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3080 single_pred (condition_bb));
3081 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3082 single_pred (scalar_preheader));
3083 set_immediate_dominator (CDI_DOMINATORS, preheader,
3084 condition_bb);
3086 else
3087 nloop = loop_version (loop, cond_expr, &condition_bb,
3088 prob, prob.invert (), prob, prob.invert (), true);
3090 if (version_niter)
3092 /* The versioned loop could be infinite; we need to clear the existing
3093 niter information, which was copied from the original loop. */
3094 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3095 vect_free_loop_info_assumptions (nloop);
3096 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3097 loop_constraint_set (loop, LOOP_C_INFINITE);
3100 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3101 && dump_enabled_p ())
3103 if (version_alias)
3104 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3105 vect_location,
3106 "loop versioned for vectorization because of "
3107 "possible aliasing\n");
3108 if (version_align)
3109 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3110 vect_location,
3111 "loop versioned for vectorization to enhance "
3112 "alignment\n");
3115 free_original_copy_tables ();
3117 /* Loop versioning violates an assumption we try to maintain during
3118 vectorization - that the loop exit block has a single predecessor.
3119 After versioning, the exit block of both loop versions is the same
3120 basic block (i.e. it has two predecessors). To simplify the
3121 following transformations in the vectorizer, we fix this situation
3122 here by adding a new (empty) block on the exit-edge of the loop,
3123 with the proper loop-exit phis to maintain loop-closed-form.
3124 If loop versioning wasn't done from loop, but scalar_loop instead,
3125 merge_bb will already have just a single predecessor. */
3127 merge_bb = single_exit (loop)->dest;
3128 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3130 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3131 new_exit_bb = split_edge (single_exit (loop));
3132 new_exit_e = single_exit (loop);
3133 e = EDGE_SUCC (new_exit_bb, 0);
3135 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3137 tree new_res;
3138 orig_phi = gsi.phi ();
3139 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3140 new_phi = create_phi_node (new_res, new_exit_bb);
3141 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3142 add_phi_arg (new_phi, arg, new_exit_e,
3143 gimple_phi_arg_location_from_edge (orig_phi, e));
3144 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3148 /* End loop-exit-fixes after versioning. */
3150 if (cond_expr_stmt_list)
3152 cond_exp_gsi = gsi_last_bb (condition_bb);
3153 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3154 GSI_SAME_STMT);
3156 update_ssa (TODO_update_ssa);