/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "tree-pass.h"
#include "fold-const.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"
/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/
/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */
  SET_USE (op_p, new_name);
}
/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  struct loop *loop = bb->loop_father;
  struct loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
        rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
        {
          if (!rename_from_outer_loop)
            continue;
          if (e->src != outer_loop->header)
            {
              if (outer_loop->inner->next)
                {
                  /* If outer_loop has 2 inner loops, allow there to
                     be an extra basic block which decides which of the
                     two loops to use using LOOP_VECTORIZED.  */
                  if (!single_pred_p (e->src)
                      || single_pred (e->src) != outer_loop->header)
                    continue;
                }
            }
        }
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}
typedef struct
{
  tree from, to;
  basic_block bb;
} adjust_info;

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;
/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
        continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
           || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
          && !(bbuse == bbdef
               || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
        {
          if (new_def)
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_def);
          else
            {
              gimple_debug_bind_reset_value (stmt);
              update_stmt (stmt);
            }
        }
    }
}
/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}
/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
        adjust_vec.safe_push (ai);
      else
        adjust_debug_stmts_now (&ai);
    }
}
/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
                        gimple_bb (update_phi));
}
/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
                    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}
/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}
/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}
/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
                        unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}
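
/* As an illustration of the encoding above: for a vector type with
   nelts == 8 and OFFSET == 0, the three stepped pattern elements pushed
   into SEL are { 0, 8, 1, 9, 2, 10 }, which the builder extends to the
   full interleave-low selector { 0, 8, 1, 9, 2, 10, 3, 11 }.  With
   OFFSET == 1 the base becomes 4 and the selector interleaves the high
   halves of the two input vectors instead.  */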
/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
                               rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
         we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
                            ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)
            stmt = gimple_build_assign (dest, code, src);
          else
            {
              tree temp = make_ssa_name (unpack_masktype);
              stmt = gimple_build_assign (temp, code, src);
              gimple_seq_add_stmt (seq, stmt);
              stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
                                          build1 (VIEW_CONVERT_EXPR,
                                                  dest_masktype, temp));
            }
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }

  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
         we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
        masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
                                              src, src, masks[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
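
          /* For instance, if SRC_RGM has masks { M0, M1 } and DEST_RGM
             needs four masks of the same type, the statements emitted
             here are
               D0 = VEC_PERM <M0, M0, interleave-lo>
               D1 = VEC_PERM <M0, M0, interleave-hi>
               D2 = VEC_PERM <M1, M1, interleave-lo>
               D3 = VEC_PERM <M1, M1, interleave-hi>
             so each source mask bit feeds two adjacent destination bits
             without needing any extra loop invariants or IVs.  */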
        }
      return true;
    }

  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.  Each iteration
   of the vectorized loop handles VF iterations of the scalar loop.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                              gimple_seq *preheader_seq,
                              gimple_stmt_iterator loop_cond_gsi,
                              rgroup_masks *rgm, tree vf,
                              tree niters, tree niters_skip,
                              bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = vf;
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
         multiplications don't overflow.  */
      tree factor = build_int_cst (compare_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                     nscalars_total, factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                    nscalars_step, factor);
      if (nscalars_skip)
        nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                      nscalars_skip, factor);
    }
  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree zero_index = build_int_cst (compare_type, 0);
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
             insert_after, &index_before_incr, &index_after_incr);

  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
         IV reaches a value greater than or equal to:

           NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

         However, there's no guarantee that this addition doesn't overflow
         the comparison type, or that the IV hits a value above it before
         wrapping around.  We therefore adjust the limit down by one
         IV step:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[infinite-prec] NSCALARS_STEP

         and compare the IV against this limit _before_ incrementing it.
         Since the comparison type is unsigned, we actually want the
         subtraction to saturate at zero:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[sat] NSCALARS_STEP

         And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

           NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

         where the rightmost subtraction can be done directly in
         COMPARE_TYPE.  */
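
      /* As a concrete illustration of the formula above: with
         NSCALARS_TOTAL = 60, NSCALARS_SKIP = 2 and NSCALARS_STEP = 8 the
         limit computed below is MAX (60, 6) - 6 = 54, while with
         NSCALARS_TOTAL = 3 it is MAX (3, 6) - 6 = 0, i.e. the MAX_EXPR
         makes the subtraction saturate at zero instead of wrapping.  */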
      test_index = index_before_incr;
      tree adjust = nscalars_step;
      if (nscalars_skip)
        adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                               adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
                                 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                                 test_limit, adjust);
      test_gsi = &incr_gsi;
      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
        {
          /* The first vector iteration can handle at most NSCALARS_STEP
             scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
             NSCALARS_SKIP to that cannot overflow.  */
          tree const_limit = build_int_cst (compare_type,
                                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                            * nscalars_per_iter);
          first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
                                      nscalars_total, const_limit);
          first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                      first_limit, nscalars_skip);
        }
      else
        /* For the first iteration it doesn't matter whether the IV hits
           a value above NSCALARS_TOTAL.  That only matters for the latch
           condition.  */
        first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
         the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
        test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }
  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
         next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gcall *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
         to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
        = (poly_int_tree_p (first_limit, &const_limit)
           && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
         TEST_LIMIT, prefer to use the same 0-based IV for each mask
         and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
        {
          this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
          this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
        }

      /* Create the initial mask.  First include all scalars that
         are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
        {
          tree start, end;
          if (first_limit == test_limit)
            {
              /* Use a natural test between zero (the initial IV value)
                 and the loop limit.  The "else" block would be valid too,
                 but this choice can avoid the need to load BIAS_TREE into
                 a register.  */
              start = zero_index;
              end = this_test_limit;
            }
          else
            {
              /* FIRST_LIMIT is the maximum number of scalars handled by the
                 first iteration of the vector loop.  Test the portion
                 associated with this mask.  */
              start = bias_tree;
              end = first_limit;
            }

          init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
          tmp_stmt = vect_gen_while (init_mask, start, end);
          gimple_seq_add_stmt (preheader_seq, tmp_stmt);
        }

      /* Now AND out the bits that are within the number of skipped
         scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
          && !(poly_int_tree_p (nscalars_skip, &const_skip)
               && known_le (const_skip, bias)))
        {
          tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
                                                    bias_tree, nscalars_skip);
          if (init_mask)
            init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
                                      init_mask, unskipped_mask);
          else
            init_mask = unskipped_mask;
        }

      if (!init_mask)
        /* First iteration is full.  */
        init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
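
      /* For instance, if the rgroup has two masks of 8 elements each and
         TEST_LIMIT is 27, the second mask (i == 1) covers scalars 8..15
         of each vector iteration, so its limit is adjusted above to
         MAX (27, 8) - 8 = 19; both masks are then driven by the same
         0-based IV via WHILE_ULT.  */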
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
                                tree niters, tree final_iv,
                                bool niters_maybe_zero,
                                gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
         count is zero then it must have wrapped.  Cope with this by
         subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
                             niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
                             niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);
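
  /* For example: if NITERS is a 32-bit value that has wrapped to 0
     (representing 1 << 32 iterations) and COMPARE_TYPE is 64 bits wide,
     the sequence above computes (0 - 1) = 0xffffffff in the narrow type,
     widens it, and adds 1 to give 0x100000000, the true count.  */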
  /* Convert skip_niters to the right type.  */
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Now calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit;
  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
  if (known_max_iters)
    {
      if (niters_skip)
        {
          /* Add the maximum number of skipped iterations to the
             maximum iteration count.  */
          if (TREE_CODE (niters_skip) == INTEGER_CST)
            iv_limit += wi::to_widest (niters_skip);
          else
            iv_limit += max_vf - 1;
        }
      /* IV_LIMIT is the maximum number of latch iterations, which is also
         the maximum in-range IV value.  Round this value down to the previous
         vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
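
      /* E.g. with a maximum of 1001 latch iterations, VF = 4 and
         MAX_VF = 4 this gives (1001 & -4) + 4 = 1004, so the IV can be
         taken one full vector iteration past the last in-range value
         without wrapping.  */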
    }

  /* Get the vectorization factor in tree form.  */
  tree vf = build_int_cst (compare_type,
                           LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if ((nmasks & 1) == 0)
          {
            rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
            if (!half_rgm->masks.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
              continue;
          }

        /* See whether zero-based IV would ever generate all-false masks
           before wrapping around.  */
        bool might_wrap_p
          = (!known_max_iters
             || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                    UNSIGNED)
                 > compare_precision));

        /* Set up all masks for this group.  */
        test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
                                                  &preheader_seq,
                                                  loop_cond_gsi, rgm, vf,
                                                  niters, niters_skip,
                                                  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);
  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
                                        NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
                                       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
                                     niters_minus_one, step);
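
  /* E.g. with NITERS = 10 and VF = 4 the vector loop runs
     (10 - 1) / 4 + 1 = 3 times, so nb_iterations (the latch count)
     computed above is (10 - 1) / 4 = 2.  */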
  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}
/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
                                  tree step, tree final_iv,
                                  bool niters_maybe_zero,
                                  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

         A:
           x = 0;
           do
             {
               ...
               x += 1;
             }
           while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

         B:
           x = 0;
           do
             {
               ...
               x += STEP;
             }
           while (x <= NITERS - STEP);

         so that the loop continues to iterate if x + STEP - 1 < NITERS
         but stops if x + STEP - 1 >= NITERS.

         However, if NITERS is zero, x never hits a value above NITERS - STEP
         before wrapping around.  There are two obvious ways of dealing with
         this:

         - start at STEP - 1 and compare x before incrementing it
         - start at -1 and compare x after incrementing it

         The latter is simpler and is what we use.  The loop in this case
         looks like:

         C:
           x = -1;
           do
             {
               ...
               x += STEP;
             }
           while (x < NITERS - STEP);

         In both cases the loop limit is NITERS - STEP.  */
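
      /* For instance: with NITERS = 12 and STEP = 4 the limit is 8;
         case B tests x = 4, 8, 12 after each increment and case C
         (starting at -1) tests x = 3, 7, 11, so both execute
         12 / 4 = 3 iterations, and case C keeps working even when
         NITERS has wrapped to zero.  */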
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
      if (niters_maybe_zero)
        {
          /* Case C.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
          init = build_all_ones_cst (niters_type);
        }
      else
        {
          /* Case B.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
          init = build_zero_cst (niters_type);
        }
    }
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
                                    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
                                 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
                                       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
                                       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
                                             indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}
/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
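
/* For example: with NITERS = 10 and STEP = 4 a fully-masked loop executes
   (10 - 1) / 4 + 1 = 3 iterations, the last one with a partial mask,
   while the unmasked form executes (10 - 4) / 4 + 1 = 2 full iterations
   and leaves the remaining scalar iterations to be handled elsewhere
   (e.g. by an epilogue loop).  */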
void
vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
                         tree niters, tree step, tree final_iv,
                         bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
                                                final_iv, niters_maybe_zero,
                                                loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
                                                  final_iv, niters_maybe_zero,
                                                  loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
                     cond_stmt);
}
/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all PHI arguments in FROM->dest and TO->dest from those
   edges ensure that TO->dest PHI arguments have current_def
   to that in from_dest edges.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
        {
          gsi_next (&gsi_from);
          continue;
        }
      if (virtual_operand_p (to_arg))
        {
          gsi_next (&gsi_to);
          continue;
        }
      if (TREE_CODE (from_arg) != SSA_NAME)
        gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else
        {
          if (get_current_def (to_arg) == NULL_TREE)
            set_current_def (to_arg, get_current_def (from_arg));
        }
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi && to_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
                     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi,
                                                             from)));
}
/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */
struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
                                        struct loop *scalar_loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
                                          exit_dest) == loop->header ?
                 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
            &exit, 1, &new_exit, NULL,
            at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
         SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
         but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
         the LOOP SSA_NAMEs (on the exit edge and edge from latch to
         header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
                                                exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
                                                           0),
                                                EDGE_SUCC (loop->latch, 0));
    }
  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
        {
          gphi_iterator gsi;
          new_exit = redirect_edge_and_branch (new_exit, exit_dest);

          for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              gphi *phi = gsi.phi ();
              tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
              location_t orig_locus
                = gimple_phi_arg_location_from_edge (phi, e);

              add_phi_arg (phi, orig_arg, new_exit, orig_locus);
            }
        }
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
        set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
                          single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
                               loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
        {
          /* Remove the non-necessary forwarder of scalar_loop again.  */
          redirect_edge_pred (single_succ_edge (preheader),
                              single_pred (preheader));
          delete_basic_block (preheader);
          set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
                                   loop_preheader_edge (scalar_loop)->src);
          preheader = split_edge (loop_preheader_edge (loop));
          entry_e = single_pred_edge (preheader);
        }

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
                          single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
                               loop_preheader_edge (new_loop)->src);
    }
  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
         edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
           gsi_new = gsi_start_phis (new_loop->header);
           !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
           gsi_next (&gsi_orig), gsi_next (&gsi_new))
        {
          gphi *orig_phi = gsi_orig.phi ();
          gphi *new_phi = gsi_new.phi ();
          tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
          location_t orig_locus
            = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

          add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
        }
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}
/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
                       basic_block guard_to, basic_block dom_bb,
                       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
                                 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}
/* This function verifies that the following restrictions apply to LOOP:
   (1) it consists of exactly 2 basic blocks - header, and an empty latch
       for innermost loop and 5 basic blocks for outer-loop.
   (2) it is single entry, single exit
   (3) its exit condition is the last stmt in the header
   (4) E is the entry/exit edge of LOOP.  */

bool
slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner ? 5 : 2;

  /* All loops have an outer scope; the only case loop->outer is NULL is for
     the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}
/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
   in the exit bb and rename all the uses after the loop.  This simplifies
   the *guard[12] routines, which assume loop closed SSA form for all PHIs
   (but normally loop closed SSA form doesn't require virtual PHIs to be
   in the same form).  Doing this early simplifies the checking what
   uses should be renamed.  */

static void
create_lcssa_for_virtual_phi (struct loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
        gphi *phi = gsi.phi ();
        for (gsi = gsi_start_phis (exit_e->dest);
             !gsi_end_p (gsi); gsi_next (&gsi))
          if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
            break;
        if (gsi_end_p (gsi))
          {
            tree new_vop = copy_ssa_name (PHI_RESULT (phi));
            gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
            tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
            imm_use_iterator imm_iter;
            gimple *stmt;
            use_operand_p use_p;

            SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
              = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
            add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
            gimple_phi_set_result (new_phi, new_vop);
            FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
              if (stmt != new_phi
                  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
                FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
                  SET_USE (use_p, new_vop);
          }
        break;
      }
}
/* Function vect_get_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if succeed and NULL if not.  */

dump_user_location_t
find_loop_location (struct loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return dump_user_location_t ();

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return stmt;

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location */

  if (!loop->header)
    return dump_user_location_t ();

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
        return stmt;
    }

  return dump_user_location_t ();
}
/* Return true if the phi described by STMT_INFO defines an IV of the
   loop to be vectorized.  */

static bool
iv_phi_p (stmt_vec_info stmt_info)
{
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}
/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
                         phi);

      /* Skip virtual phi's. The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

         Skip reduction phis.  */
      if (!iv_phi_p (phi_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf (MSG_MISSED_OPTIMIZATION,
                         "No access function or evolution.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         which evolution functions are not invariants in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution not invariant in loop.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         which evolution functions are a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution is chrec.\n");
          return false;
        }
    }

  return true;
}
/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized. The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized). i.e, the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

     The new definitions of the ivs are placed in LOOP->exit.
     The phi args associated with the edge UPDATE_E in the bb
     UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */
static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
                                  tree niters, edge update_e)
{
  gphi_iterator gsi, gsi1;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = single_exit (loop)->dest;

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_succ_edge (exit_bb) == update_e);

  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
       gsi_next (&gsi), gsi_next (&gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;
      gimple_stmt_iterator last_gsi;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vect_update_ivs_after_vectorizer: phi: %G", phi);

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (phi_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      type = TREE_TYPE (gimple_phi_result (phi));
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
         of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));

      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                         fold_convert (TREE_TYPE (step_expr), niters),
                         step_expr);
      if (POINTER_TYPE_P (type))
        ni = fold_build_pointer_plus (init_expr, off);
      else
        ni = fold_build2 (PLUS_EXPR, type,
                          init_expr, fold_convert (type, off));

      var = create_tmp_var (type, "tmp");

      last_gsi = gsi_last_bb (exit_bb);
      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);

      /* Exit_bb shouldn't be empty.  */
      if (!gsi_end_p (last_gsi))
        gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
      else
        gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
    }
}
/* Return a gimple value containing the misalignment (measured in vector
   elements) for the loop described by LOOP_VINFO, i.e. how many elements
   it is away from a perfectly aligned address.  Add any new statements
   to SEQ.  */

static tree
get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
  gcc_assert (target_align != 0);

  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
                                        size_zero_node) < 0;
  tree offset = (negative
                 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
                 : size_zero_node);
  tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
                                                          offset);
  tree type = unsigned_type_for (TREE_TYPE (start_addr));
  tree target_align_minus_1 = build_int_cst (type, target_align - 1);
  HOST_WIDE_INT elem_size
    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));

  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
  tree int_start_addr = fold_convert (type, start_addr);
  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
                                        target_align_minus_1);

  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
                                        elem_size_log);
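
  /* For instance: if START_ADDR is 0x100c, TARGET_ALIGN is 16 and the
     elements are 4 bytes wide, misalign_in_bytes is 0x100c & 15 = 12 and
     the returned misalign_in_elems is 12 >> 2 = 3.  */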
  return misalign_in_elems;
}
/* Function vect_gen_prolog_loop_niters

   Generate the number of iterations which should be peeled as prolog for the
   loop represented by LOOP_VINFO.  It is calculated as the misalignment of
   DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
   As a result, after the execution of this loop, the data reference DR will
   refer to an aligned location.  The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = int mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (target_align - 1)

   prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step

   (elem_size = element type size; an element is the scalar element whose type
   is the inner type of the vectype)

   The computations will be emitted at the end of BB.  We also compute and
   store upper bound (included) of the result in BOUND.

   When the step of the data-ref in the loop is not 1 (as in interleaved data
   and SLP), the number of iterations of the prolog must be divided by the step
   (which is equal to the size of interleaved group).

   The above formulas assume that VF == number of elements in the vector. This
   may not hold when there are multiple-types in the loop.
   In this case, for some data-references in the loop the VF does not represent
   the number of elements that fit in the vector.  Therefore, instead of VF we
   use TYPE_VECTOR_SUBPARTS.  */
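
/* For example: with VF = 4, 4-byte elements, step 1 and addr_mis = 8 bytes,
   addr_mis / elem_size = 2, so prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2,
   i.e. two scalar iterations are peeled before DR becomes aligned.  */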
static tree
vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
                             basic_block bb, int *bound)
{
  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  tree var;
  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
  gimple_seq stmts = NULL, new_stmts = NULL;
  tree iters, iters_name;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);

  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "known peeling = %d.\n", npeel);

      iters = build_int_cst (niters_type, npeel);
      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    }
  else
    {
      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
      tree type = TREE_TYPE (misalign_in_elems);
      HOST_WIDE_INT elem_size
        = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      HOST_WIDE_INT align_in_elems = target_align / elem_size;
      tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
      tree align_in_elems_tree = build_int_cst (type, align_in_elems);

      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
                                 & (align_in_elems - 1)).  */
      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
                                            size_zero_node) < 0;
      if (negative)
        iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
                             align_in_elems_tree);
      else
        iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
                             misalign_in_elems);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
      iters = fold_convert (niters_type, iters);
      *bound = align_in_elems - 1;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "niters for prolog loop: %T\n", iters);

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  iters_name = force_gimple_operand (iters, &new_stmts, false, var);

  if (new_stmts)
    gimple_seq_add_seq (&stmts, new_stmts);
  if (stmts)
    {
      gcc_assert (single_succ_p (bb));
      gimple_stmt_iterator gsi = gsi_last_bb (bb);
      if (gsi_end_p (gsi))
        gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
      else
        gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
    }
  return iters_name;
}
/* Function vect_update_init_of_dr

   If CODE is PLUS, the vector loop starts NITERS iterations after the
   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
   iterations before the scalar one (using masking to skip inactive
   elements).  This function updates the information recorded in DR to
   account for the difference.  Specifically, it updates the OFFSET
   field of DR.  */

static void
vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
                        fold_convert (sizetype, niters),
                        fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (code, sizetype,
                        fold_convert (sizetype, offset), niters);
  DR_OFFSET (dr) = offset;
}
/* Function vect_update_inits_of_drs

   Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
   CODE and NITERS are as for vect_update_inits_of_dr.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
                          tree_code code)
{
  unsigned int i;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_update_inits_of_dr");

  /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
  if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
    {
      gimple_seq seq;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");

      niters = fold_convert (sizetype, niters);
      niters = force_gimple_operand (niters, &seq, false, var);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
    }

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
        vect_update_init_of_dr (dr, niters, code);
    }
}
/* For the information recorded in LOOP_VINFO prepare the loop for peeling
   by masking.  This involves calculating the number of iterations to
   be peeled and then aligning all memory references appropriately.  */

void
vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
{
  tree misalign_in_elems;
  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);

  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));

  /* From the information recorded in LOOP_VINFO get the number of iterations
     that need to be skipped via masking.  */
  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                             - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
      misalign_in_elems = build_int_cst (type, misalign);
    }
  else
    {
      gimple_seq seq1 = NULL, seq2 = NULL;
      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
      misalign_in_elems = fold_convert (type, misalign_in_elems);
      misalign_in_elems = force_gimple_operand (misalign_in_elems,
                                                &seq2, true, NULL_TREE);
      gimple_seq_add_seq (&seq1, seq2);
      if (seq1)
        {
          edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
          gcc_assert (!new_bb);
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "misalignment for fully-masked loop: %T\n",
                     misalign_in_elems);

  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;

  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
}
/* This function builds ni_name = number of iterations.  Statements
   are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
   it to TRUE if new ssa_var is generated.  */

tree
vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
{
  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
  if (TREE_CODE (ni) == INTEGER_CST)
    return ni;
  else
    {
      tree ni_name, var;
      gimple_seq stmts = NULL;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));

      var = create_tmp_var (TREE_TYPE (ni), "niters");
      ni_name = force_gimple_operand (ni, &stmts, false, var);
      if (stmts)
        {
          gsi_insert_seq_on_edge_immediate (pe, stmts);
          if (new_var_p != NULL)
            *new_var_p = true;
        }

      return ni_name;
    }
}
/* Calculate the number of iterations above which the vectorized loop will be
   preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
   of the prolog loop.  If it's an integer const, the integer number is also
   passed in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive)
   of the number of iterations of the prolog loop.  BOUND_EPILOG is the
   corresponding value for the epilog loop.  If CHECK_PROFITABILITY is true,
   TH is the threshold below which the scalar (rather than vectorized) loop
   will be executed.  This function stores the upper bound (inclusive) of the
   result in BOUND_SCALAR.  */

static tree
vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
			     int bound_prolog, poly_int64 bound_epilog, int th,
			     poly_uint64 *bound_scalar,
			     bool check_profitability)
{
  tree type = TREE_TYPE (niters_prolog);
  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
			     build_int_cst (type, bound_epilog));

  *bound_scalar = bound_prolog + bound_epilog;
  if (check_profitability)
    {
      /* TH indicates the minimum niters of vectorized loop, while we
	 compute the maximum niters of scalar loop.  */
      th--;
      /* Peeling for constant times.  */
      if (int_niters_prolog >= 0)
	{
	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
	  return build_int_cst (type, *bound_scalar);
	}
      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
	 and BOUND_EPILOG are inclusive upper bounds.  */
      if (known_ge (th, bound_prolog + bound_epilog))
	{
	  *bound_scalar = th;
	  return build_int_cst (type, th);
	}
      /* Need to do runtime comparison.  */
      else if (maybe_gt (th, bound_epilog))
	{
	  *bound_scalar = upper_bound (*bound_scalar, th);
	  return fold_build2 (MAX_EXPR, type,
			      build_int_cst (type, th), niters);
	}
    }
  return niters;
}
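
/* Illustrative example (not part of the original sources): with constant
   prolog peeling the bound computed above is the larger of
   INT_NITERS_PROLOG + BOUND_EPILOG and the adjusted threshold, e.g. for a
   prolog of 3 iterations, BOUND_EPILOG = 7 and TH = 12 the scalar loop is
   bounded by max (3 + 7, 12 - 1) = 11 iterations; without profitability
   checking it is simply 3 + 7 = 10.  */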
/* NITERS is the number of times that the original scalar loop executes
   after peeling.  Work out the maximum number of iterations N that can
   be handled by the vectorized form of the loop and then either:

   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:

	niters_vector = N

   b) set *STEP_VECTOR_PTR to one and generate:

	niters_vector = N / vf

   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */

void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector, type = TREE_TYPE (niters);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  tree log_vf = NULL_TREE;

  /* If epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  var = create_tmp_var (type, "ni_gap");
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  unsigned HOST_WIDE_INT const_vf;
  if (vf.is_constant (&const_vf)
      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Create: niters >> log2(vf).  */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters >> log2(vf); otherwise we generate
	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
	 will be at least one.  */
      log_vf = build_int_cst (type, exact_log2 (const_vf));
      if (niters_no_overflow)
	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (RSHIFT_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   build_int_cst (type, vf)),
				      log_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd");
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
	 we set range information to make niters analyzer's life easier.  */
      if (stmts != NULL && log_vf)
	set_range_info (niters_vector, VR_RANGE,
			wi::to_wide (build_int_cst (type, 1)),
			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
						  TYPE_MAX_VALUE (type),
						  log_vf)));
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;
}
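
/* Illustrative example (not part of the original sources): assuming a
   32-bit NITERS type and VF = 4 (log2(vf) = 2):
     - with NITERS = 10 both forms agree: 10 >> 2 = 2 and
       ((10 - 4) >> 2) + 1 = 2 full vector iterations;
     - if NITERS may wrap, NITERS = 0 encodes 2^32 iterations and only the
       second form is safe: ((0u - 4) >> 2) + 1 = 0x3fffffff + 1 = 2^30
       vector iterations, whereas 0 >> 2 would give 0.  */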
/* Given NITERS_VECTOR which is the number of iterations for the vectorized
   loop specified by LOOP_VINFO after vectorization, compute the number
   of iterations before vectorization (niters_vector * vf) and store it
   to NITERS_VECTOR_MULT_VF_PTR.  */

static void
vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
				     tree niters_vector,
				     tree *niters_vector_mult_vf_ptr)
{
  /* We should be using a step_vector of VF if VF is variable.  */
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree type = TREE_TYPE (niters_vector);
  tree log_vf = build_int_cst (type, exact_log2 (vf));
  basic_block exit_bb = single_exit (loop)->dest;

  gcc_assert (niters_vector_mult_vf_ptr != NULL);
  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
					    niters_vector, log_vf);
  if (!is_gimple_val (niters_vector_mult_vf))
    {
      tree var = create_tmp_var (type, "niters_vector_mult_vf");
      gimple_seq stmts = NULL;
      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
						    &stmts, true, var);
      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }
  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
}
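
/* Illustrative example (not part of the original sources): the left shift
   above is simply NITERS_VECTOR * VF, e.g. 5 vector iterations at VF = 4
   correspond to 5 << 2 = 20 scalar iterations handled by the vector
   loop.  */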
/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
   from SECOND/FIRST and puts it at the original loop's preheader/exit
   edge; the two loops are arranged as below:

     first_loop:
	 i_1 = PHI<i_0, i_2>;
	 ...

     between_bb:
	 ;; i_x = PHI<i_2>;		;; LCSSA phi node to be created for FIRST,

     second_loop:
	 i_3 = PHI<i_0, i_4>;		;; Use of i_0 to be replaced with i_x,
					;; or with i_2 if no LCSSA phi is created
					;; under condition of CREATE_LCSSA_FOR_IV_PHIS.
	 ...

   This function creates loop closed SSA for the first loop; it updates the
   second loop's PHI nodes by replacing the argument on the incoming edge with
   the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
   is false, loop closed ssa phis will only be created for non-iv phis for
   the first loop.

   This function assumes the exit bb of the first loop is the preheader bb of
   the second loop, i.e., between_bb in the example code.  With PHIs updated,
   the second loop will execute the remaining iterations of the first.  */

static void
slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
				   struct loop *first, struct loop *second,
				   bool create_lcssa_for_iv_phis)
{
  gphi_iterator gsi_update, gsi_orig;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  edge first_latch_e = EDGE_SUCC (first->latch, 0);
  edge second_preheader_e = loop_preheader_edge (second);
  basic_block between_bb = single_exit (first)->dest;

  gcc_assert (between_bb == second_preheader_e->src);
  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
  /* Either the first loop or the second is the loop to be vectorized.  */
  gcc_assert (loop == first || loop == second);

  for (gsi_orig = gsi_start_phis (first->header),
       gsi_update = gsi_start_phis (second->header);
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
      /* Generate lcssa PHI node for the first loop.  */
      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
	{
	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
	  arg = new_res;
	}

      /* Update PHI node in the second loop by replacing arg on the loop's
	 incoming edge.  */
      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
    }
}
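
/* Illustrative example (not part of the original sources): for an IV with
   latch definition i_2 in FIRST, the code above inserts
     i_x = PHI <i_2>
   in between_bb (fed from FIRST's exit edge) and rewrites SECOND's header
   PHI so that its preheader argument becomes i_x instead of the original
   initial value i_0.  */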
/* Function slpeel_add_loop_guard adds guard skipping from the beginning
   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
   are the two pred edges of the merge point before UPDATE_LOOP.  The two
   loops are arranged as below:

     skip_loop:
	 i_1 = PHI<i_0, i_2>;
	 ...

     merge_bb:
	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.

     update_loop:
	 i_3 = PHI<i_5, i_4>;		;; Use of i_5 to be replaced with i_x.
	 ...

   This function creates PHI nodes at merge_bb and replaces the use of i_5
   in the update_loop's PHI node with the result of the new PHI node.  */

static void
slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
				    struct loop *update_loop,
				    edge guard_edge, edge merge_edge)
{
  source_location merge_loc, guard_loc;
  edge orig_e = loop_preheader_edge (skip_loop);
  edge update_e = loop_preheader_edge (update_loop);
  gphi_iterator gsi_orig, gsi_update;

  for ((gsi_orig = gsi_start_phis (skip_loop->header),
	gsi_update = gsi_start_phis (update_loop->header));
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      /* Generate new phi node at merge bb of the guard.  */
      tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
      gphi *new_phi = create_phi_node (new_res, guard_edge->dest);

      /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
	 args in NEW_PHI for these edges.  */
      tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
      tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
      merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
      guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
      add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
      add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);

      /* Update phi in UPDATE_PHI.  */
      adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
    }
}
/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP.
   This function searches for the corresponding lcssa phi node in the exit
   bb of LOOP.  If it is found, return the phi result; otherwise return
   NULL_TREE.  */

static tree
find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
		gphi *lcssa_phi)
{
  gphi_iterator gsi;
  edge e = single_exit (loop);

  gcc_assert (single_pred_p (e->dest));
  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
			   PHI_ARG_DEF (lcssa_phi, 0), 0))
	return PHI_RESULT (phi);
    }
  return NULL_TREE;
}
/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG:

     loop:
	 i_1 = PHI<i_0, i_2>;
	 ...

     epilog_loop:
	 i_3 = PHI<i_2, i_4>;
	 ...

     merge_bb:
	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.

     exit_bb:
	 i_x = PHI<i_4>;	;; Use of i_4 to be replaced with i_y in merge_bb.

   For each name used outside EPILOG (i.e. for each name that has a lcssa
   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
   args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
   the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
   by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
   in exit_bb will also be updated.  */

static void
slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
				    edge guard_edge, edge merge_edge)
{
  gphi_iterator gsi;
  basic_block merge_bb = guard_edge->dest;

  gcc_assert (single_succ_p (merge_bb));
  edge e = single_succ_edge (merge_bb);
  basic_block exit_bb = e->dest;
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);

  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *update_phi = gsi.phi ();
      tree old_arg = PHI_ARG_DEF (update_phi, 0);
      /* This loop-closed-phi actually doesn't represent a use out of the
	 loop - the phi arg is a constant.  */
      if (TREE_CODE (old_arg) != SSA_NAME)
	continue;

      tree merge_arg = get_current_def (old_arg);
      if (!merge_arg)
	merge_arg = old_arg;

      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
      /* If the var is live after loop but not a reduction, we simply
	 use the old arg.  */
      if (!guard_arg)
	guard_arg = old_arg;

      /* Create new phi node in MERGE_BB:  */
      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
      gphi *merge_phi = create_phi_node (new_res, merge_bb);

      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
	 the two PHI args in merge_phi for these edges.  */
      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);

      /* Update the original phi in exit_bb.  */
      adjust_phi_and_debug_stmts (update_phi, e, new_res);
    }
}
/* EPILOG loop is duplicated from the original loop for vectorizing;
   the arg of its loop closed ssa PHI needs to be updated.  */

static void
slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
{
  gphi_iterator gsi;
  basic_block exit_bb = single_exit (epilog)->dest;

  gcc_assert (single_pred_p (exit_bb));
  edge e = EDGE_PRED (exit_bb, 0);
  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
}
/* Function vect_do_peeling.

   Input:

   - LOOP_VINFO: Represent a loop to be vectorized, which looks like:

       loop:
	 ...
	 if (exit_loop_cond) goto exit_bb
	 else		     goto loop
       exit_bb:

   - NITERS: The number of iterations of the loop.
   - NITERSM1: The number of iterations of the loop's latch.
   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
     CHECK_PROFITABILITY is true.

   Output:

   - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
     iterate after vectorization; see vect_set_loop_condition for details.
   - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
     should be set to the number of scalar iterations handled by the
     vector loop.  The SSA name is only used on exit from the loop.

   This function peels prolog and epilog from the loop, adds guards skipping
   PROLOG and EPILOG for various conditions.  As a result, the changed CFG
   looks like:

       guard_bb_1:
	 if (prefer_scalar_loop) goto merge_bb_1
	 else			 goto guard_bb_2

       guard_bb_2:
	 if (skip_prolog) goto merge_bb_2
	 else		  goto prolog_preheader

       prolog_loop:
	 ...
	 if (exit_prolog_cond) goto prolog_exit_bb
	 else		       goto prolog_header_bb

       vector_loop:
	 ...
	 if (exit_vector_cond) goto vector_exit_bb
	 else		       goto vector_header_bb

       guard_bb_3:
	 if (skip_epilog) goto merge_bb_3
	 else		  goto epilog_preheader

       epilog_loop:
	 ...
	 if (exit_epilog_cond) goto merge_bb_3
	 else		       goto epilog_header_bb

   Note this function peels prolog and epilog only if it's necessary,
   as well as guards.
   Returns created epilogue or NULL.

   TODO: Guard for prefer_scalar_loop should be emitted along with
   versioning conditions if loop versioning is needed.  */
struct loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
		 tree *niters_vector, tree *step_vector,
		 tree *niters_vector_mult_vf_var, int th,
		 bool check_profitability, bool niters_no_overflow)
{
  edge e, guard_e;
  tree type = TREE_TYPE (niters), guard_cond;
  basic_block guard_bb, guard_to;
  profile_probability prob_prolog, prob_vector, prob_epilog;
  int estimated_vf;

  int prolog_peeling = 0;
  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  poly_uint64 bound_epilog = 0;
  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    bound_epilog += vf - 1;
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    bound_epilog += 1;
  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
  poly_uint64 bound_scalar = bound_epilog;

  if (!prolog_peeling && !epilog_peeling)
    return NULL;

  prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
  estimated_vf = vect_vf_for_cost (loop_vinfo);
  if (estimated_vf == 2)
    estimated_vf = 3;
  prob_prolog = prob_epilog = profile_probability::guessed_always ()
			.apply_scale (estimated_vf - 1, estimated_vf);
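
  /* Illustrative example (not part of the original sources): with an
     estimated VF of 4, the prolog and epilog loops are each given a
     guessed entry probability of (4 - 1) / 4 = 75%, while the vector
     loop itself is assumed to be entered 9 times out of 10 (the 9/10
     scale above).  */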
  struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *first_loop = loop;
  bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
  create_lcssa_for_virtual_phi (loop);
  update_ssa (TODO_update_ssa_only_virtuals);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    {
      gcc_assert (!adjust_vec.exists ());
      adjust_vec.create (32);
    }
  initialize_original_copy_tables ();

  /* Record the anchor bb at which the guard should be placed if the scalar
     loop might be preferred.  */
  basic_block anchor = loop_preheader_edge (loop)->src;

  /* Generate the number of iterations for the prolog loop.  We do this here
     so that we can also get the upper bound on the number of iterations.  */
  tree niters_prolog;
  int bound_prolog = 0;
  if (prolog_peeling)
    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
						 &bound_prolog);
  else
    niters_prolog = build_int_cst (type, 0);

  /* Prolog loop may be skipped.  */
  bool skip_prolog = (prolog_peeling != 0);
  /* Skip to epilog if scalar loop may be preferred.  It's only needed
     when we peel for epilog loop and when it hasn't been checked with
     loop versioning.  */
  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
				  bound_prolog + bound_epilog)
		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
  /* Epilog loop must be executed if the number of iterations for epilog
     loop is known at compile time, otherwise we need to add a check at
     the end of vector loop and skip to the end of epilog loop.  */
  bool skip_epilog = (prolog_peeling < 0
		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
		      || !vf.is_constant ());
  /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    skip_epilog = false;
  if (skip_vector)
    {
      split_edge (loop_preheader_edge (loop));

      /* Due to the order in which we peel prolog and epilog, we first
	 propagate probability to the whole loop.  The purpose is to
	 avoid adjusting probabilities of both prolog and vector loops
	 separately.  Note in this case, the probability of epilog loop
	 needs to be scaled back later.  */
      basic_block bb_before_loop = loop_preheader_edge (loop)->src;
      if (prob_vector.initialized_p ())
	{
	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
	  scale_loop_profile (loop, prob_vector, 0);
	}
    }

  dump_user_location_t loop_loc = find_loop_location (loop);
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  if (prolog_peeling)
    {
      e = loop_preheader_edge (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to preheader edge.\n");
	  gcc_unreachable ();
	}
      /* Peel prolog and put it on preheader edge of loop.  */
      prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!prolog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
      first_loop = prolog;
      reset_original_copy_tables ();

      /* Update the number of iterations for prolog loop.  */
      tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
      vect_set_loop_condition (prolog, NULL, niters_prolog,
			       step_prolog, NULL_TREE, false);

      /* Skip the prolog loop.  */
      if (skip_prolog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters_prolog, build_int_cst (type, 0));
	  guard_bb = loop_preheader_edge (prolog)->src;
	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
	  guard_to = split_edge (loop_preheader_edge (loop));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_prolog.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);

	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
	}

      /* Update init address of DRs.  */
      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
      /* Update niters for vector loop.  */
      LOOP_VINFO_NITERS (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
      LOOP_VINFO_NITERSM1 (loop_vinfo)
	= fold_build2 (MINUS_EXPR, type,
		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
      bool new_var_p = false;
      niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
      /* It's guaranteed that vector loop bound before vectorization is at
	 least VF, so set range information for newly generated var.  */
      if (new_var_p)
	set_range_info (niters, VR_RANGE,
			wi::to_wide (build_int_cst (type, vf)),
			wi::to_wide (TYPE_MAX_VALUE (type)));

      /* Prolog iterates at most bound_prolog times, latch iterates at
	 most bound_prolog - 1 times.  */
      record_niter_bound (prolog, bound_prolog - 1, false, true);
      delete_update_ssa ();
      adjust_vec_debug_stmts ();
    }
  if (epilog_peeling)
    {
      e = single_exit (loop);
      if (!slpeel_can_duplicate_loop_p (loop, e))
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "loop can't be duplicated to exit edge.\n");
	  gcc_unreachable ();
	}
      /* Peel epilog and put it on exit edge of loop.  */
      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
      if (!epilog)
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
	  gcc_unreachable ();
	}
      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);

      /* Scalar version loop may be preferred.  In this case, add guard
	 and skip to epilog.  Note this only happens when the number of
	 iterations of loop is unknown at compile time, otherwise this
	 won't be vectorized.  */
      if (skip_vector)
	{
	  /* Additional epilogue iteration is peeled if gap exists.  */
	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
						bound_prolog, bound_epilog,
						th, &bound_scalar,
						check_profitability);
	  /* Build guard against NITERSM1 since NITERS may overflow.  */
	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
	  guard_bb = anchor;
	  guard_to = split_edge (loop_preheader_edge (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
					   guard_to, guard_bb,
					   prob_vector.invert (),
					   irred_flag);
	  e = EDGE_PRED (guard_to, 0);
	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);

	  /* Simply propagate profile info from guard_bb to guard_to which is
	     a merge point of control flow.  */
	  guard_to->count = guard_bb->count;

	  /* Scale probability of epilog loop back.
	     FIXME: We should avoid scaling down and back up.  Profile may
	     get lost if we scale down to 0.  */
	  basic_block *bbs = get_loop_body (epilog);
	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
	    bbs[i]->count = bbs[i]->count.apply_scale
				 (bbs[i]->count,
				  bbs[i]->count.apply_probability
				    (prob_vector));
	  free (bbs);
	}

      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
      tree niters_vector_mult_vf;
      /* If loop is peeled for non-zero constant times, now niters refers to
	 orig_niters - prolog_peeling, it won't overflow even if the
	 orig_niters does.  */
      niters_no_overflow |= (prolog_peeling > 0);
      vect_gen_vector_loop_niters (loop_vinfo, niters,
				   niters_vector, step_vector,
				   niters_no_overflow);
      if (!integer_onep (*step_vector))
	{
	  /* On exit from the loop we will have an easy way of calculating
	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
	     here.  */
	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
	}
      else
	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
					     &niters_vector_mult_vf);
      /* Update IVs of original loop as if they were advanced by
	 niters_vector_mult_vf steps.  */
      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
      edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
					update_e);

      if (skip_epilog)
	{
	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
				    niters, niters_vector_mult_vf);
	  guard_bb = single_exit (loop)->dest;
	  guard_to = split_edge (single_exit (epilog));
	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
					   skip_vector ? anchor : guard_bb,
					   prob_epilog.invert (),
					   irred_flag);
	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
					      single_exit (epilog));
	  /* Only need to handle basic block before epilog loop if it's not
	     the guard_bb, which is the case when skip_vector is true.  */
	  if (guard_bb != bb_before_epilog)
	    {
	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();

	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
	    }
	  scale_loop_profile (epilog, prob_epilog, 0);
	}
      else
	slpeel_update_phi_nodes_for_lcssa (epilog);

      unsigned HOST_WIDE_INT bound;
      if (bound_scalar.is_constant (&bound))
	{
	  gcc_assert (bound != 0);
	  /* -1 to convert loop iterations to latch iterations.  */
	  record_niter_bound (epilog, bound - 1, false, true);
	}

      delete_update_ssa ();
      adjust_vec_debug_stmts ();
    }

  adjust_vec.release ();
  free_original_copy_tables ();

  return epilog;
}
/* Function vect_create_cond_for_niters_checks.

   Create a conditional expression that represents the run-time checks for
   the loop's niter.  The loop is guaranteed to terminate if the run-time
   checks pass.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the
   if statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
{
  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);

  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */

static void
chain_cond_expr (tree *cond_expr, tree part_cond_expr)
{
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
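
/* Illustrative example (not part of the original sources): starting from
   *cond_expr == NULL_TREE, chaining the checks A, B and C in turn leaves
   *cond_expr as (A && B) && C; an already existing condition simply has
   the new check ANDed onto it.  */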
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
		LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
		LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
			 expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be done
	as a&(n-1) == 0.  For example, for 16 byte vectors the test is
	a&0xf == 0.  */
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
				   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  vec<stmt_vec_info> may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  stmt_vec_info stmt_info;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));

  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      bool negative = tree_int_cst_compare
	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
      tree offset = negative
	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
					      offset);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
	{
	  /* create: or_tmp = or_tmp | addr_tmp */
	  sprintf (tmp_name, "orptrs%d", i);
	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
						tmp_name);
	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
					 or_tmp_name, addr_tmp_name);
	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	  or_tmp_name = new_or_tmp_name;
	}
      else
	or_tmp_name = addr_tmp_name;
    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
				  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}
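
/* Illustrative example (not part of the original sources): with 16-byte
   vectors the mask is 0xf.  ORing the candidate addresses first lets a
   single AND test all of them, e.g. for addresses 0x1000 and 0x1008:
   (0x1000 | 0x1008) & 0xf == 0x8 != 0, so at least one access is
   misaligned and the scalar version of the loop must be used.  */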
/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and this new condition are true.  Treat a null *COND_EXPR as "true".  */

static void
vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  unsigned int i;
  vec_object_pair *pair;
  FOR_EACH_VEC_ELT (pairs, i, pair)
    {
      tree addr1 = build_fold_addr_expr (pair->first);
      tree addr2 = build_fold_addr_expr (pair->second);
      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
					 addr1, addr2);
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
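
/* Illustrative example (not part of the original sources): for two
   recorded pairs <A1, B1> and <A2, B2> the chained condition reads
   (&A1 != &B1) && (&A2 != &B2); the vectorized loop is entered only when
   every such pair of objects is known at run time to be distinct.  */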
/* Create an expression that is true when all lower-bound conditions for
   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */

static void
vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
{
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    {
      tree expr = lower_bounds[i].expr;
      tree type = unsigned_type_for (TREE_TYPE (expr));
      expr = fold_convert (type, expr);
      poly_uint64 bound = lower_bounds[i].min_value;
      if (!lower_bounds[i].unsigned_p)
	{
	  expr = fold_build2 (PLUS_EXPR, type, expr,
			      build_int_cstu (type, bound - 1));
	  bound += bound - 1;
	}
      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
					 build_int_cstu (type, bound));
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
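
/* Illustrative example (not part of the original sources): for signed
   quantities the test above checks |EXPR| >= BOUND with one unsigned
   comparison.  E.g. for BOUND = 4 it becomes (unsigned) EXPR + 3 >= 7:
   the values -3 .. 3 wrap to 0 .. 6 and fail, while EXPR >= 4 or
   EXPR <= -4 passes.  */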
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
		with logical AND operation.  If it is NULL, then the function
		is used to return the number of alias checks.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
{
  vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);

  if (comp_alias_ddrs.is_empty ())
    return;

  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
			       &comp_alias_ddrs, cond_expr);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "created %u versioning for alias checks.\n",
		     comp_alias_ddrs.length ());
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.  */
void
vect_loop_versioning (loop_vec_info loop_vinfo,
		      unsigned int th, bool check_profitability,
		      poly_uint64 versioning_threshold)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);

  if (check_profitability)
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));

  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
  initialize_original_copy_tables ();
  if (scalar_loop)
    {
      edge scalar_e;
      basic_block preheader, scalar_preheader;

      /* We don't want to scale SCALAR_LOOP's frequencies, we need to
	 scale LOOP's frequencies instead.  */
      nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      scale_loop_frequencies (loop, prob);
      /* CONDITION_BB was created above SCALAR_LOOP's preheader,
	 while we need to move it above LOOP's preheader.  */
      e = loop_preheader_edge (loop);
      scalar_e = loop_preheader_edge (scalar_loop);
      /* The vector loop preheader might not be empty, since new
	 invariants could have been created while analyzing the loop.  */
      gcc_assert (single_pred_p (e->src));
      gcc_assert (empty_block_p (scalar_e->src)
		  && single_pred_p (scalar_e->src));
      gcc_assert (single_pred_p (condition_bb));
      preheader = e->src;
      scalar_preheader = scalar_e->src;
      scalar_e = find_edge (condition_bb, scalar_preheader);
      e = single_pred_edge (preheader);
      redirect_edge_and_branch_force (single_pred_edge (condition_bb),
				      scalar_preheader);
      redirect_edge_and_branch_force (scalar_e, preheader);
      redirect_edge_and_branch_force (e, condition_bb);
      set_immediate_dominator (CDI_DOMINATORS, condition_bb,
			       single_pred (condition_bb));
      set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
			       single_pred (scalar_preheader));
      set_immediate_dominator (CDI_DOMINATORS, preheader,
			       single_pred (preheader));
    }
  else
    nloop = loop_version (loop, cond_expr, &condition_bb,
			  prob, prob.invert (), prob, prob.invert (), true);
  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");
    }
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.
     If loop versioning wasn't done from loop, but scalar_loop instead,
     merge_bb will already have just a single successor.  */

  merge_bb = single_exit (loop)->dest;
  if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
    {
      gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
      new_exit_bb = split_edge (single_exit (loop));
      new_exit_e = single_exit (loop);
      e = EDGE_SUCC (new_exit_bb, 0);

      for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  tree new_res;
	  orig_phi = gsi.phi ();
	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  new_phi = create_phi_node (new_res, new_exit_bb);
	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	  add_phi_arg (new_phi, arg, new_exit_e,
		       gimple_phi_arg_location_from_edge (orig_phi, e));
	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	}
    }

  /* End loop-exit-fixes after versioning.  */

  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
			     GSI_SAME_STMT);
    }
  update_ssa (TODO_update_ssa);
}