gcc/tree-vect-loop-manip.c

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50
  51 /*************************************************************************
  52   Simple Loop Peeling Utilities
  53
  54   Utilities to support loop peeling for vectorization purposes.
  55  *************************************************************************/
  56
  57
  58 /* Renames the use *OP_P.  */
  59
  60 static void
  61 rename_use_op (use_operand_p op_p)
  62 {
  63   tree new_name;
  64
  65   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  66     return;
  67
  68   new_name = get_current_def (USE_FROM_PTR (op_p));
  69
  70   /* Something defined outside of the loop.  */
  71   if (!new_name)
  72     return;
  73
  74   /* An ordinary ssa name defined in the loop.  */
  75
  76   SET_USE (op_p, new_name);
  77 }
  78
  79
  80 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  81    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  82    true.  */
  83
  84 static void
  85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  86 {
  87   gimple *stmt;
  88   use_operand_p use_p;
  89   ssa_op_iter iter;
  90   edge e;
  91   edge_iterator ei;
  92   struct loop *loop = bb->loop_father;
  93   struct loop *outer_loop = NULL;
  94
  95   if (rename_from_outer_loop)
  96     {
  97       gcc_assert (loop);
  98       outer_loop = loop_outer (loop);
  99     }
 100
 101   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 102        gsi_next (&gsi))
 103     {
 104       stmt = gsi_stmt (gsi);
 105       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 106         rename_use_op (use_p);
 107     }
 108
 109   FOR_EACH_EDGE (e, ei, bb->preds)
 110     {
 111       if (!flow_bb_inside_loop_p (loop, e->src))
 112         {
 113           if (!rename_from_outer_loop)
 114             continue;
 115           if (e->src != outer_loop->header)
 116             {
 117               if (outer_loop->inner->next)
 118                 {
 119                   /* If outer_loop has 2 inner loops, allow there to
 120                      be an extra basic block which decides which of the
 121                      two loops to use using LOOP_VECTORIZED.  */
 122                   if (!single_pred_p (e->src)
 123                       || single_pred (e->src) != outer_loop->header)
 124                     continue;
 125                 }
 126             }
 127         }
 128       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 129            gsi_next (&gsi))
 130         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 131     }
 132 }
 133
 134
 135 struct adjust_info
 136 {
 137   tree from, to;
 138   basic_block bb;
 139 };
 140
 141 /* A stack of values to be adjusted in debug stmts.  We have to
 142    process them LIFO, so that the closest substitution applies.  If we
 143    processed them FIFO, without the stack, we might substitute uses
 144    with a PHI DEF that would soon become non-dominant, and when we got
 145    to the suitable one, it wouldn't have anything to substitute any
 146    more.  */
 147 static vec<adjust_info, va_heap> adjust_vec;
 148
 149 /* Adjust any debug stmts that referenced AI->from values to use the
 150    loop-closed AI->to, if the references are dominated by AI->bb and
 151    not by the definition of AI->from.  */
 152
 153 static void
 154 adjust_debug_stmts_now (adjust_info *ai)
 155 {
 156   basic_block bbphi = ai->bb;
 157   tree orig_def = ai->from;
 158   tree new_def = ai->to;
 159   imm_use_iterator imm_iter;
 160   gimple *stmt;
 161   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 162
 163   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 164
 165   /* Adjust any debug stmts that held onto non-loop-closed
 166      references.  */
 167   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 168     {
 169       use_operand_p use_p;
 170       basic_block bbuse;
 171
 172       if (!is_gimple_debug (stmt))
 173         continue;
 174
 175       gcc_assert (gimple_debug_bind_p (stmt));
 176
 177       bbuse = gimple_bb (stmt);
 178
 179       if ((bbuse == bbphi
 180            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 181           && !(bbuse == bbdef
 182                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 183         {
 184           if (new_def)
 185             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 186               SET_USE (use_p, new_def);
 187           else
 188             {
 189               gimple_debug_bind_reset_value (stmt);
 190               update_stmt (stmt);
 191             }
 192         }
 193     }
 194 }
 195
 196 /* Adjust debug stmts as scheduled before.  */
 197
 198 static void
 199 adjust_vec_debug_stmts (void)
 200 {
 201   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 202     return;
 203
 204   gcc_assert (adjust_vec.exists ());
 205
 206   while (!adjust_vec.is_empty ())
 207     {
 208       adjust_debug_stmts_now (&adjust_vec.last ());
 209       adjust_vec.pop ();
 210     }
 211 }
 212
 213 /* Adjust any debug stmts that referenced FROM values to use the
 214    loop-closed TO, if the references are dominated by BB and not by
 215    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 216    will be postponed until adjust_vec_debug_stmts is called.  */
 217
 218 static void
 219 adjust_debug_stmts (tree from, tree to, basic_block bb)
 220 {
 221   adjust_info ai;
 222
 223   if (MAY_HAVE_DEBUG_BIND_STMTS
 224       && TREE_CODE (from) == SSA_NAME
 225       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 226       && ! virtual_operand_p (from))
 227     {
 228       ai.from = from;
 229       ai.to = to;
 230       ai.bb = bb;
 231
 232       if (adjust_vec.exists ())
 233         adjust_vec.safe_push (ai);
 234       else
 235         adjust_debug_stmts_now (&ai);
 236     }
 237 }
 238
 239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 240    to adjust any debug stmts that referenced the old phi arg,
 241    presumably non-loop-closed references left over from other
 242    transformations.  */
 243
 244 static void
 245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 246 {
 247   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 248
 249   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 250
 251   if (MAY_HAVE_DEBUG_BIND_STMTS)
 252     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 253                         gimple_bb (update_phi));
 254 }
 255
 256 /* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
 257    the mask should have during the first iteration and NEXT_MASK is the
 258    value that it should have on subsequent iterations.  */
 259
 260 static void
 261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
 262                     tree next_mask)
 263 {
 264   gphi *phi = create_phi_node (mask, loop->header);
 265   add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 266   add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
 267 }
 268
 269 /* Add SEQ to the end of LOOP's preheader block.  */
 270
 271 static void
 272 add_preheader_seq (struct loop *loop, gimple_seq seq)
 273 {
 274   if (seq)
 275     {
 276       edge pe = loop_preheader_edge (loop);
 277       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 278       gcc_assert (!new_bb);
 279     }
 280 }
 281
 282 /* Add SEQ to the beginning of LOOP's header block.  */
 283
 284 static void
 285 add_header_seq (struct loop *loop, gimple_seq seq)
 286 {
 287   if (seq)
 288     {
 289       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 290       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 291     }
 292 }
 293
 294 /* Return true if the target can interleave elements of two vectors.
 295    OFFSET is 0 if the first half of the vectors should be interleaved
 296    or 1 if the second half should.  When returning true, store the
 297    associated permutation in INDICES.  */
 298
 299 static bool
 300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 301                         unsigned int offset)
 302 {
 303   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 304   poly_uint64 base = exact_div (nelts, 2) * offset;
 305   vec_perm_builder sel (nelts, 2, 3);
 306   for (unsigned int i = 0; i < 3; ++i)
 307     {
 308       sel.quick_push (base + i);
 309       sel.quick_push (base + i + nelts);
 310     }
 311   indices->new_vector (sel, 2, nelts);
 312   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 313 }
 314
 315 /* Try to use permutes to define the masks in DEST_RGM using the masks
 316    in SRC_RGM, given that the former has twice as many masks as the
 317    latter.  Return true on success, adding any new statements to SEQ.  */
 318
 319 static bool
 320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
 321                                rgroup_masks *src_rgm)
 322 {
 323   tree src_masktype = src_rgm->mask_type;
 324   tree dest_masktype = dest_rgm->mask_type;
 325   machine_mode src_mode = TYPE_MODE (src_masktype);
 326   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 327       && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
 328       && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
 329     {
 330       /* Unpacking the source masks gives at least as many mask bits as
 331          we need.  We can then VIEW_CONVERT any excess bits away.  */
 332       tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
 333       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 334         {
 335           tree src = src_rgm->masks[i / 2];
 336           tree dest = dest_rgm->masks[i];
 337           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 338                             ? VEC_UNPACK_HI_EXPR
 339                             : VEC_UNPACK_LO_EXPR);
 340           gassign *stmt;
 341           if (dest_masktype == unpack_masktype)
 342             stmt = gimple_build_assign (dest, code, src);
 343           else
 344             {
 345               tree temp = make_ssa_name (unpack_masktype);
 346               stmt = gimple_build_assign (temp, code, src);
 347               gimple_seq_add_stmt (seq, stmt);
 348               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 349                                           build1 (VIEW_CONVERT_EXPR,
 350                                                   dest_masktype, temp));
 351             }
 352           gimple_seq_add_stmt (seq, stmt);
 353         }
 354       return true;
 355     }
 356   vec_perm_indices indices[2];
 357   if (dest_masktype == src_masktype
 358       && interleave_supported_p (&indices[0], src_masktype, 0)
 359       && interleave_supported_p (&indices[1], src_masktype, 1))
 360     {
 361       /* The destination requires twice as many mask bits as the source, so
 362          we can use interleaving permutes to double up the number of bits.  */
 363       tree masks[2];
 364       for (unsigned int i = 0; i < 2; ++i)
 365         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 366       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 367         {
 368           tree src = src_rgm->masks[i / 2];
 369           tree dest = dest_rgm->masks[i];
 370           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 371                                               src, src, masks[i & 1]);
 372           gimple_seq_add_stmt (seq, stmt);
 373         }
 374       return true;
 375     }
 376   return false;
 377 }
 378
 379 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
 380    all the masks in RGM and return a mask that is nonzero when the loop
 381    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
 382    Use LOOP_COND_GSI to insert code before the exit gcond.
 383
 384    RGM belongs to loop LOOP.  The loop originally iterated NITERS
 385    times and has been vectorized according to LOOP_VINFO.  Each iteration
 386    of the vectorized loop handles VF iterations of the scalar loop.
 387
 388    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 389    starts with NITERS_SKIP dummy iterations of the scalar loop before
 390    the real work starts.  The mask elements for these dummy iterations
 391    must be 0, to ensure that the extra iterations do not have an effect.
 392
 393    It is known that:
 394
 395      NITERS * RGM->max_nscalars_per_iter
 396
 397    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 398    variable that starts at 0 and has step:
 399
 400      VF * RGM->max_nscalars_per_iter
 401
 402    might overflow before hitting a value above:
 403
 404      (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
 405
 406    This means that we cannot guarantee that such an induction variable
 407    would ever hit a value that produces a set of all-false masks for RGM.  */
 408
 409 static tree
 410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 411                               gimple_seq *preheader_seq,
 412                               gimple_stmt_iterator loop_cond_gsi,
 413                               rgroup_masks *rgm, tree vf,
 414                               tree niters, tree niters_skip,
 415                               bool might_wrap_p)
 416 {
 417   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 418   tree mask_type = rgm->mask_type;
 419   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
 420   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
 421
 422   /* Calculate the maximum number of scalar values that the rgroup
 423      handles in total, the number that it handles for each iteration
 424      of the vector loop, and the number that it should skip during the
 425      first iteration of the vector loop.  */
 426   tree nscalars_total = niters;
 427   tree nscalars_step = vf;
 428   tree nscalars_skip = niters_skip;
 429   if (nscalars_per_iter != 1)
 430     {
 431       /* We checked before choosing to use a fully-masked loop that these
 432          multiplications don't overflow.  */
 433       tree factor = build_int_cst (compare_type, nscalars_per_iter);
 434       nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 435                                      nscalars_total, factor);
 436       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 437                                     nscalars_step, factor);
 438       if (nscalars_skip)
 439         nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 440                                       nscalars_skip, factor);
 441     }
 442
 443   /* Create an induction variable that counts the number of scalars
 444      processed.  */
 445   tree index_before_incr, index_after_incr;
 446   gimple_stmt_iterator incr_gsi;
 447   bool insert_after;
 448   tree zero_index = build_int_cst (compare_type, 0);
 449   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 450   create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
 451              insert_after, &index_before_incr, &index_after_incr);
 452
 453   tree test_index, test_limit, first_limit;
 454   gimple_stmt_iterator *test_gsi;
 455   if (might_wrap_p)
 456     {
 457       /* In principle the loop should stop iterating once the incremented
 458          IV reaches a value greater than or equal to:
 459
 460            NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
 461
 462          However, there's no guarantee that this addition doesn't overflow
 463          the comparison type, or that the IV hits a value above it before
 464          wrapping around.  We therefore adjust the limit down by one
 465          IV step:
 466
 467            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 468            -[infinite-prec] NSCALARS_STEP
 469
 470          and compare the IV against this limit _before_ incrementing it.
 471          Since the comparison type is unsigned, we actually want the
 472          subtraction to saturate at zero:
 473
 474            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 475            -[sat] NSCALARS_STEP
 476
 477          And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
 478
 479            NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
 480
 481          where the rightmost subtraction can be done directly in
 482          COMPARE_TYPE.  */
 483       test_index = index_before_incr;
 484       tree adjust = nscalars_step;
 485       if (nscalars_skip)
 486         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 487                                adjust, nscalars_skip);
 488       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 489                                  nscalars_total, adjust);
 490       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 491                                  test_limit, adjust);
 492       test_gsi = &incr_gsi;
 493
 494       /* Get a safe limit for the first iteration.  */
 495       if (nscalars_skip)
 496         {
 497           /* The first vector iteration can handle at most NSCALARS_STEP
 498              scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
 499              NSCALARS_SKIP to that cannot overflow.  */
 500           tree const_limit = build_int_cst (compare_type,
 501                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 502                                             * nscalars_per_iter);
 503           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 504                                       nscalars_total, const_limit);
 505           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 506                                       first_limit, nscalars_skip);
 507         }
 508       else
 509         /* For the first iteration it doesn't matter whether the IV hits
 510            a value above NSCALARS_TOTAL.  That only matters for the latch
 511            condition.  */
 512         first_limit = nscalars_total;
 513     }
 514   else
 515     {
 516       /* Test the incremented IV, which will always hit a value above
 517          the bound before wrapping.  */
 518       test_index = index_after_incr;
 519       test_limit = nscalars_total;
 520       if (nscalars_skip)
 521         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 522                                    test_limit, nscalars_skip);
 523       test_gsi = &loop_cond_gsi;
 524
 525       first_limit = test_limit;
 526     }
 527
 528   /* Provide a definition of each mask in the group.  */
 529   tree next_mask = NULL_TREE;
 530   tree mask;
 531   unsigned int i;
 532   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
 533     {
 534       /* Previous masks will cover BIAS scalars.  This mask covers the
 535          next batch.  */
 536       poly_uint64 bias = nscalars_per_mask * i;
 537       tree bias_tree = build_int_cst (compare_type, bias);
 538       gimple *tmp_stmt;
 539
 540       /* See whether the first iteration of the vector loop is known
 541          to have a full mask.  */
 542       poly_uint64 const_limit;
 543       bool first_iteration_full
 544         = (poly_int_tree_p (first_limit, &const_limit)
 545            && known_ge (const_limit, (i + 1) * nscalars_per_mask));
 546
 547       /* Rather than have a new IV that starts at BIAS and goes up to
 548          TEST_LIMIT, prefer to use the same 0-based IV for each mask
 549          and adjust the bound down by BIAS.  */
 550       tree this_test_limit = test_limit;
 551       if (i != 0)
 552         {
 553           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 554                                           compare_type, this_test_limit,
 555                                           bias_tree);
 556           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 557                                           compare_type, this_test_limit,
 558                                           bias_tree);
 559         }
 560
 561       /* Create the initial mask.  First include all scalars that
 562          are within the loop limit.  */
 563       tree init_mask = NULL_TREE;
 564       if (!first_iteration_full)
 565         {
 566           tree start, end;
 567           if (first_limit == test_limit)
 568             {
 569               /* Use a natural test between zero (the initial IV value)
 570                  and the loop limit.  The "else" block would be valid too,
 571                  but this choice can avoid the need to load BIAS_TREE into
 572                  a register.  */
 573               start = zero_index;
 574               end = this_test_limit;
 575             }
 576           else
 577             {
 578               /* FIRST_LIMIT is the maximum number of scalars handled by the
 579                  first iteration of the vector loop.  Test the portion
 580                  associated with this mask.  */
 581               start = bias_tree;
 582               end = first_limit;
 583             }
 584
 585           init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
 586           tmp_stmt = vect_gen_while (init_mask, start, end);
 587           gimple_seq_add_stmt (preheader_seq, tmp_stmt);
 588         }
 589
 590       /* Now AND out the bits that are within the number of skipped
 591          scalars.  */
 592       poly_uint64 const_skip;
 593       if (nscalars_skip
 594           && !(poly_int_tree_p (nscalars_skip, &const_skip)
 595                && known_le (const_skip, bias)))
 596         {
 597           tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
 598                                                     bias_tree, nscalars_skip);
 599           if (init_mask)
 600             init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
 601                                       init_mask, unskipped_mask);
 602           else
 603             init_mask = unskipped_mask;
 604         }
 605
 606       if (!init_mask)
 607         /* First iteration is full.  */
 608         init_mask = build_minus_one_cst (mask_type);
 609
 610       /* Get the mask value for the next iteration of the loop.  */
 611       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
 612       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
 613       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
 614
 615       vect_set_loop_mask (loop, mask, init_mask, next_mask);
 616     }
 617   return next_mask;
 618 }
 619
 620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
 621    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
 622    number of iterations of the original scalar loop that should be
 623    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
 624    as for vect_set_loop_condition.
 625
 626    Insert the branch-back condition before LOOP_COND_GSI and return the
 627    final gcond.  */
 628
 629 static gcond *
 630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 631                                 tree niters, tree final_iv,
 632                                 bool niters_maybe_zero,
 633                                 gimple_stmt_iterator loop_cond_gsi)
 634 {
 635   gimple_seq preheader_seq = NULL;
 636   gimple_seq header_seq = NULL;
 637
 638   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 639   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 640   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
 641   tree orig_niters = niters;
 642
 643   /* Type of the initial value of NITERS.  */
 644   tree ni_actual_type = TREE_TYPE (niters);
 645   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 646
 647   /* Convert NITERS to the same size as the compare.  */
 648   if (compare_precision > ni_actual_precision
 649       && niters_maybe_zero)
 650     {
 651       /* We know that there is always at least one iteration, so if the
 652          count is zero then it must have wrapped.  Cope with this by
 653          subtracting 1 before the conversion and adding 1 to the result.  */
 654       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 655       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 656                              niters, build_minus_one_cst (ni_actual_type));
 657       niters = gimple_convert (&preheader_seq, compare_type, niters);
 658       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 659                              niters, build_one_cst (compare_type));
 660     }
 661   else
 662     niters = gimple_convert (&preheader_seq, compare_type, niters);
 663
 664   /* Convert skip_niters to the right type.  */
 665   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 666
 667   /* Now calculate the value that the induction variable must be able
 668      to hit in order to ensure that we end the loop with an all-false mask.
 669      This involves adding the maximum number of inactive trailing scalar
 670      iterations.  */
 671   widest_int iv_limit;
 672   bool known_max_iters = max_loop_iterations (loop, &iv_limit);
 673   if (known_max_iters)
 674     {
 675       if (niters_skip)
 676         {
 677           /* Add the maximum number of skipped iterations to the
 678              maximum iteration count.  */
 679           if (TREE_CODE (niters_skip) == INTEGER_CST)
 680             iv_limit += wi::to_widest (niters_skip);
 681           else
 682             iv_limit += max_vf - 1;
 683         }
 684       /* IV_LIMIT is the maximum number of latch iterations, which is also
 685          the maximum in-range IV value.  Round this value down to the previous
 686          vector alignment boundary and then add an extra full iteration.  */
 687       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 688       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
 689     }
 690
 691   /* Get the vectorization factor in tree form.  */
 692   tree vf = build_int_cst (compare_type,
 693                            LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 694
 695   /* Iterate over all the rgroups and fill in their masks.  We could use
 696      the first mask from any rgroup for the loop condition; here we
 697      arbitrarily pick the last.  */
 698   tree test_mask = NULL_TREE;
 699   rgroup_masks *rgm;
 700   unsigned int i;
 701   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
 702   FOR_EACH_VEC_ELT (*masks, i, rgm)
 703     if (!rgm->masks.is_empty ())
 704       {
 705         /* First try using permutes.  This adds a single vector
 706            instruction to the loop for each mask, but needs no extra
 707            loop invariants or IVs.  */
 708         unsigned int nmasks = i + 1;
 709         if ((nmasks & 1) == 0)
 710           {
 711             rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
 712             if (!half_rgm->masks.is_empty ()
 713                 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
 714               continue;
 715           }
 716
 717         /* See whether zero-based IV would ever generate all-false masks
 718            before wrapping around.  */
 719         bool might_wrap_p
 720           = (!known_max_iters
 721              || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 722                                     UNSIGNED)
 723                  > compare_precision));
 724
 725         /* Set up all masks for this group.  */
 726         test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
 727                                                   &preheader_seq,
 728                                                   loop_cond_gsi, rgm, vf,
 729                                                   niters, niters_skip,
 730                                                   might_wrap_p);
 731       }
 732
 733   /* Emit all accumulated statements.  */
 734   add_preheader_seq (loop, preheader_seq);
 735   add_header_seq (loop, header_seq);
 736
 737   /* Get a boolean result that tells us whether to iterate.  */
 738   edge exit_edge = single_exit (loop);
 739   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 740   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
 741   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
 742                                         NULL_TREE, NULL_TREE);
 743   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 744
 745   /* The loop iterates (NITERS - 1) / VF + 1 times.
 746      Subtract one from this to get the latch count.  */
 747   tree step = build_int_cst (compare_type,
 748                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 749   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 750                                        build_minus_one_cst (compare_type));
 751   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 752                                      niters_minus_one, step);
 753
 754   if (final_iv)
 755     {
 756       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 757       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 758     }
 759
 760   return cond_stmt;
 761 }
 762
 763 /* Like vect_set_loop_condition, but handle the case in which there
 764    are no loop masks.  */
 765
 766 static gcond *
 767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
 768                                   tree step, tree final_iv,
 769                                   bool niters_maybe_zero,
 770                                   gimple_stmt_iterator loop_cond_gsi)
 771 {
 772   tree indx_before_incr, indx_after_incr;
 773   gcond *cond_stmt;
 774   gcond *orig_cond;
 775   edge pe = loop_preheader_edge (loop);
 776   edge exit_edge = single_exit (loop);
 777   gimple_stmt_iterator incr_gsi;
 778   bool insert_after;
 779   enum tree_code code;
 780   tree niters_type = TREE_TYPE (niters);
 781
 782   orig_cond = get_loop_exit_condition (loop);
 783   gcc_assert (orig_cond);
 784   loop_cond_gsi = gsi_for_stmt (orig_cond);
 785
 786   tree init, limit;
 787   if (!niters_maybe_zero && integer_onep (step))
 788     {
 789       /* In this case we can use a simple 0-based IV:
 790
 791          A:
 792            x = 0;
 793            do
 794              {
 795                ...
 796                x += 1;
 797              }
 798            while (x < NITERS);  */
 799       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 800       init = build_zero_cst (niters_type);
 801       limit = niters;
 802     }
 803   else
 804     {
 805       /* The following works for all values of NITERS except 0:
 806
 807          B:
 808            x = 0;
 809            do
 810              {
 811                ...
 812                x += STEP;
 813              }
 814            while (x <= NITERS - STEP);
 815
 816          so that the loop continues to iterate if x + STEP - 1 < NITERS
 817          but stops if x + STEP - 1 >= NITERS.
 818
 819          However, if NITERS is zero, x never hits a value above NITERS - STEP
 820          before wrapping around.  There are two obvious ways of dealing with
 821          this:
 822
 823          - start at STEP - 1 and compare x before incrementing it
 824          - start at -1 and compare x after incrementing it
 825
 826          The latter is simpler and is what we use.  The loop in this case
 827          looks like:
 828
 829          C:
 830            x = -1;
 831            do
 832              {
 833                ...
 834                x += STEP;
 835              }
 836            while (x < NITERS - STEP);
 837
 838          In both cases the loop limit is NITERS - STEP.  */
 839       gimple_seq seq = NULL;
 840       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 841       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 842       if (seq)
 843         {
 844           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 845           gcc_assert (!new_bb);
 846         }
 847       if (niters_maybe_zero)
 848         {
 849           /* Case C.  */
 850           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 851           init = build_all_ones_cst (niters_type);
 852         }
 853       else
 854         {
 855           /* Case B.  */
 856           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 857           init = build_zero_cst (niters_type);
 858         }
 859     }
 860
 861   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 862   create_iv (init, step, NULL_TREE, loop,
 863              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 864   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 865                                               true, NULL_TREE, true,
 866                                               GSI_SAME_STMT);
 867   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 868                                      true, GSI_SAME_STMT);
 869
 870   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 871                                  NULL_TREE);
 872
 873   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 874
 875   /* Record the number of latch iterations.  */
 876   if (limit == niters)
 877     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 878        latch count.  */
 879     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 880                                        build_int_cst (niters_type, 1));
 881   else
 882     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 883        Subtract one from this to get the latch count.  */
 884     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 885                                        limit, step);
 886
 887   if (final_iv)
 888     {
 889       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 890                                              indx_after_incr, init);
 891       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 892     }
 893
 894   return cond_stmt;
 895 }
 896
 897 /* If we're using fully-masked loops, make LOOP iterate:
 898
 899       N == (NITERS - 1) / STEP + 1
 900
 901    times.  When NITERS is zero, this is equivalent to making the loop
 902    execute (1 << M) / STEP times, where M is the precision of NITERS.
 903    NITERS_MAYBE_ZERO is true if this last case might occur.
 904
 905    If we're not using fully-masked loops, make LOOP iterate:
 906
 907       N == (NITERS - STEP) / STEP + 1
 908
 909    times, where NITERS is known to be outside the range [1, STEP - 1].
 910    This is equivalent to making the loop execute NITERS / STEP times
 911    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 912    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 913
 914    If FINAL_IV is nonnull, it is an SSA name that should be set to
 915    N * STEP on exit from the loop.
 916
 917    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 918
 919 void
 920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
 921                          tree niters, tree step, tree final_iv,
 922                          bool niters_maybe_zero)
 923 {
 924   gcond *cond_stmt;
 925   gcond *orig_cond = get_loop_exit_condition (loop);
 926   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 927
 928   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 929     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
 930                                                 final_iv, niters_maybe_zero,
 931                                                 loop_cond_gsi);
 932   else
 933     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
 934                                                   final_iv, niters_maybe_zero,
 935                                                   loop_cond_gsi);
 936
 937   /* Remove old loop exit test.  */
 938   stmt_vec_info orig_cond_info;
 939   if (loop_vinfo
 940       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
 941     loop_vinfo->remove_stmt (orig_cond_info);
 942   else
 943     gsi_remove (&loop_cond_gsi, true);
 944
 945   if (dump_enabled_p ())
 946     {
 947       dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
 948       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
 949     }
 950 }
 951
 952 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 953    For all PHI arguments in FROM->dest and TO->dest from those
 954    edges ensure that TO->dest PHI arguments have current_def
 955    to that in from.  */
 956
 957 static void
 958 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 959 {
 960   gimple_stmt_iterator gsi_from, gsi_to;
 961
 962   for (gsi_from = gsi_start_phis (from->dest),
 963        gsi_to = gsi_start_phis (to->dest);
 964        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 965     {
 966       gimple *from_phi = gsi_stmt (gsi_from);
 967       gimple *to_phi = gsi_stmt (gsi_to);
 968       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
 969       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
 970       if (virtual_operand_p (from_arg))
 971         {
 972           gsi_next (&gsi_from);
 973           continue;
 974         }
 975       if (virtual_operand_p (to_arg))
 976         {
 977           gsi_next (&gsi_to);
 978           continue;
 979         }
 980       if (TREE_CODE (from_arg) != SSA_NAME)
 981         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
 982       else
 983         {
 984           if (get_current_def (to_arg) == NULL_TREE)
 985             set_current_def (to_arg, get_current_def (from_arg));
 986         }
 987       gsi_next (&gsi_from);
 988       gsi_next (&gsi_to);
 989     }
 990
 991   gphi *from_phi = get_virtual_phi (from->dest);
 992   gphi *to_phi = get_virtual_phi (to->dest);
 993   if (from_phi)
 994     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
 995                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
 996 }
 997
 998
 999 /* Given LOOP this function generates a new copy of it and puts it
1000    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1001    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1002    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1003    entry or exit of LOOP.  */
1004
1005 struct loop *
1006 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1007                                         struct loop *scalar_loop, edge e)
1008 {
1009   struct loop *new_loop;
1010   basic_block *new_bbs, *bbs, *pbbs;
1011   bool at_exit;
1012   bool was_imm_dom;
1013   basic_block exit_dest;
1014   edge exit, new_exit;
1015   bool duplicate_outer_loop = false;
1016
1017   exit = single_exit (loop);
1018   at_exit = (e == exit);
1019   if (!at_exit && e != loop_preheader_edge (loop))
1020     return NULL;
1021
1022   if (scalar_loop == NULL)
1023     scalar_loop = loop;
1024
1025   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1026   pbbs = bbs + 1;
1027   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1028   /* Allow duplication of outer loops.  */
1029   if (scalar_loop->inner)
1030     duplicate_outer_loop = true;
1031   /* Check whether duplication is possible.  */
1032   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1033     {
1034       free (bbs);
1035       return NULL;
1036     }
1037
1038   /* Generate new loop structure.  */
1039   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1040   duplicate_subloops (scalar_loop, new_loop);
1041
1042   exit_dest = exit->dest;
1043   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1044                                           exit_dest) == loop->header ?
1045                  true : false);
1046
1047   /* Also copy the pre-header, this avoids jumping through hoops to
1048      duplicate the loop entry PHI arguments.  Create an empty
1049      pre-header unconditionally for this.  */
1050   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1051   edge entry_e = single_pred_edge (preheader);
1052   bbs[0] = preheader;
1053   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1054
1055   exit = single_exit (scalar_loop);
1056   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1057             &exit, 1, &new_exit, NULL,
1058             at_exit ? loop->latch : e->src, true);
1059   exit = single_exit (loop);
1060   basic_block new_preheader = new_bbs[0];
1061
1062   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1063
1064   if (scalar_loop != loop)
1065     {
1066       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1067          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1068          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1069          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1070          header) to have current_def set, so copy them over.  */
1071       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1072                                                 exit);
1073       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1074                                                            0),
1075                                                 EDGE_SUCC (loop->latch, 0));
1076     }
1077
1078   if (at_exit) /* Add the loop copy at exit.  */
1079     {
1080       if (scalar_loop != loop)
1081         {
1082           gphi_iterator gsi;
1083           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1084
1085           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1086                gsi_next (&gsi))
1087             {
1088               gphi *phi = gsi.phi ();
1089               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1090               location_t orig_locus
1091                 = gimple_phi_arg_location_from_edge (phi, e);
1092
1093               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1094             }
1095         }
1096       redirect_edge_and_branch_force (e, new_preheader);
1097       flush_pending_stmts (e);
1098       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1099       if (was_imm_dom || duplicate_outer_loop)
1100         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1101
1102       /* And remove the non-necessary forwarder again.  Keep the other
1103          one so we have a proper pre-header for the loop at the exit edge.  */
1104       redirect_edge_pred (single_succ_edge (preheader),
1105                           single_pred (preheader));
1106       delete_basic_block (preheader);
1107       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1108                                loop_preheader_edge (scalar_loop)->src);
1109     }
1110   else /* Add the copy at entry.  */
1111     {
1112       if (scalar_loop != loop)
1113         {
1114           /* Remove the non-necessary forwarder of scalar_loop again.  */
1115           redirect_edge_pred (single_succ_edge (preheader),
1116                               single_pred (preheader));
1117           delete_basic_block (preheader);
1118           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1119                                    loop_preheader_edge (scalar_loop)->src);
1120           preheader = split_edge (loop_preheader_edge (loop));
1121           entry_e = single_pred_edge (preheader);
1122         }
1123
1124       redirect_edge_and_branch_force (entry_e, new_preheader);
1125       flush_pending_stmts (entry_e);
1126       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1127
1128       redirect_edge_and_branch_force (new_exit, preheader);
1129       flush_pending_stmts (new_exit);
1130       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1131
1132       /* And remove the non-necessary forwarder again.  Keep the other
1133          one so we have a proper pre-header for the loop at the exit edge.  */
1134       redirect_edge_pred (single_succ_edge (new_preheader),
1135                           single_pred (new_preheader));
1136       delete_basic_block (new_preheader);
1137       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1138                                loop_preheader_edge (new_loop)->src);
1139     }
1140
1141   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1142   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1143     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1144
1145   if (scalar_loop != loop)
1146     {
1147       /* Update new_loop->header PHIs, so that on the preheader
1148          edge they are the ones from loop rather than scalar_loop.  */
1149       gphi_iterator gsi_orig, gsi_new;
1150       edge orig_e = loop_preheader_edge (loop);
1151       edge new_e = loop_preheader_edge (new_loop);
1152
1153       for (gsi_orig = gsi_start_phis (loop->header),
1154            gsi_new = gsi_start_phis (new_loop->header);
1155            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1156            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1157         {
1158           gphi *orig_phi = gsi_orig.phi ();
1159           gphi *new_phi = gsi_new.phi ();
1160           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1161           location_t orig_locus
1162             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1163
1164           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1165         }
1166     }
1167
1168   free (new_bbs);
1169   free (bbs);
1170
1171   checking_verify_dominators (CDI_DOMINATORS);
1172
1173   return new_loop;
1174 }
1175
1176
1177 /* Given the condition expression COND, put it as the last statement of
1178    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1179    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1180    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1181    new edge as irreducible if IRREDUCIBLE_P is true.  */
1182
1183 static edge
1184 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1185                        basic_block guard_to, basic_block dom_bb,
1186                        profile_probability probability, bool irreducible_p)
1187 {
1188   gimple_stmt_iterator gsi;
1189   edge new_e, enter_e;
1190   gcond *cond_stmt;
1191   gimple_seq gimplify_stmt_list = NULL;
1192
1193   enter_e = EDGE_SUCC (guard_bb, 0);
1194   enter_e->flags &= ~EDGE_FALLTHRU;
1195   enter_e->flags |= EDGE_FALSE_VALUE;
1196   gsi = gsi_last_bb (guard_bb);
1197
1198   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1199                                  NULL_TREE);
1200   if (gimplify_stmt_list)
1201     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1202
1203   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1204   gsi = gsi_last_bb (guard_bb);
1205   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1206
1207   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1208   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1209
1210   new_e->probability = probability;
1211   if (irreducible_p)
1212     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1213
1214   enter_e->probability = probability.invert ();
1215   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1216
1217   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1218   if (enter_e->dest->loop_father->header == enter_e->dest)
1219     split_edge (enter_e);
1220
1221   return new_e;
1222 }
1223
1224
1225 /* This function verifies that the following restrictions apply to LOOP:
1226    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1227        for innermost loop and 5 basic blocks for outer-loop.
1228    (2) it is single entry, single exit
1229    (3) its exit condition is the last stmt in the header
1230    (4) E is the entry/exit edge of LOOP.
1231  */
1232
1233 bool
1234 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1235 {
1236   edge exit_e = single_exit (loop);
1237   edge entry_e = loop_preheader_edge (loop);
1238   gcond *orig_cond = get_loop_exit_condition (loop);
1239   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1240   unsigned int num_bb = loop->inner? 5 : 2;
1241
1242   /* All loops have an outer scope; the only case loop->outer is NULL is for
1243      the function itself.  */
1244   if (!loop_outer (loop)
1245       || loop->num_nodes != num_bb
1246       || !empty_block_p (loop->latch)
1247       || !single_exit (loop)
1248       /* Verify that new loop exit condition can be trivially modified.  */
1249       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1250       || (e != exit_e && e != entry_e))
1251     return false;
1252
1253   return true;
1254 }
1255
1256 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1257    in the exit bb and rename all the uses after the loop.  This simplifies
1258    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1259    (but normally loop closed SSA form doesn't require virtual PHIs to be
1260    in the same form).  Doing this early simplifies the checking what
1261    uses should be renamed.  */
1262
1263 static void
1264 create_lcssa_for_virtual_phi (struct loop *loop)
1265 {
1266   gphi_iterator gsi;
1267   edge exit_e = single_exit (loop);
1268
1269   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1270     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1271       {
1272         gphi *phi = gsi.phi ();
1273         for (gsi = gsi_start_phis (exit_e->dest);
1274              !gsi_end_p (gsi); gsi_next (&gsi))
1275           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1276             break;
1277         if (gsi_end_p (gsi))
1278           {
1279             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1280             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1281             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1282             imm_use_iterator imm_iter;
1283             gimple *stmt;
1284             use_operand_p use_p;
1285
1286             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1287               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1288             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1289             gimple_phi_set_result (new_phi, new_vop);
1290             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1291               if (stmt != new_phi
1292                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1293                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1294                   SET_USE (use_p, new_vop);
1295           }
1296         break;
1297       }
1298
1299 }
1300
1301 /* Function vect_get_loop_location.
1302
1303    Extract the location of the loop in the source code.
1304    If the loop is not well formed for vectorization, an estimated
1305    location is calculated.
1306    Return the loop location if succeed and NULL if not.  */
1307
1308 dump_user_location_t
1309 find_loop_location (struct loop *loop)
1310 {
1311   gimple *stmt = NULL;
1312   basic_block bb;
1313   gimple_stmt_iterator si;
1314
1315   if (!loop)
1316     return dump_user_location_t ();
1317
1318   stmt = get_loop_exit_condition (loop);
1319
1320   if (stmt
1321       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1322     return stmt;
1323
1324   /* If we got here the loop is probably not "well formed",
1325      try to estimate the loop location */
1326
1327   if (!loop->header)
1328     return dump_user_location_t ();
1329
1330   bb = loop->header;
1331
1332   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1333     {
1334       stmt = gsi_stmt (si);
1335       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1336         return stmt;
1337     }
1338
1339   return dump_user_location_t ();
1340 }
1341
1342 /* Return true if the phi described by STMT_INFO defines an IV of the
1343    loop to be vectorized.  */
1344
1345 static bool
1346 iv_phi_p (stmt_vec_info stmt_info)
1347 {
1348   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1349   if (virtual_operand_p (PHI_RESULT (phi)))
1350     return false;
1351
1352   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1353       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1354     return false;
1355
1356   return true;
1357 }
1358
1359 /* Function vect_can_advance_ivs_p
1360
1361    In case the number of iterations that LOOP iterates is unknown at compile
1362    time, an epilog loop will be generated, and the loop induction variables
1363    (IVs) will be "advanced" to the value they are supposed to take just before
1364    the epilog loop.  Here we check that the access function of the loop IVs
1365    and the expression that represents the loop bound are simple enough.
1366    These restrictions will be relaxed in the future.  */
1367
1368 bool
1369 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1370 {
1371   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1372   basic_block bb = loop->header;
1373   gphi_iterator gsi;
1374
1375   /* Analyze phi functions of the loop header.  */
1376
1377   if (dump_enabled_p ())
1378     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1379   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1380     {
1381       tree evolution_part;
1382
1383       gphi *phi = gsi.phi ();
1384       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1385       if (dump_enabled_p ())
1386         {
1387           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1388           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi_info->stmt, 0);
1389         }
1390
1391       /* Skip virtual phi's. The data dependences that are associated with
1392          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1393
1394          Skip reduction phis.  */
1395       if (!iv_phi_p (phi_info))
1396         {
1397           if (dump_enabled_p ())
1398             dump_printf_loc (MSG_NOTE, vect_location,
1399                              "reduc or virtual phi. skip.\n");
1400           continue;
1401         }
1402
1403       /* Analyze the evolution function.  */
1404
1405       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1406       if (evolution_part == NULL_TREE)
1407         {
1408           if (dump_enabled_p ())
1409             dump_printf (MSG_MISSED_OPTIMIZATION,
1410                          "No access function or evolution.\n");
1411           return false;
1412         }
1413
1414       /* FORNOW: We do not transform initial conditions of IVs
1415          which evolution functions are not invariants in the loop.  */
1416
1417       if (!expr_invariant_in_loop_p (loop, evolution_part))
1418         {
1419           if (dump_enabled_p ())
1420             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421                              "evolution not invariant in loop.\n");
1422           return false;
1423         }
1424
1425       /* FORNOW: We do not transform initial conditions of IVs
1426          which evolution functions are a polynomial of degree >= 2.  */
1427
1428       if (tree_is_chrec (evolution_part))
1429         {
1430           if (dump_enabled_p ())
1431             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1432                              "evolution is chrec.\n");
1433           return false;
1434         }
1435     }
1436
1437   return true;
1438 }
1439
1440
1441 /*   Function vect_update_ivs_after_vectorizer.
1442
1443      "Advance" the induction variables of LOOP to the value they should take
1444      after the execution of LOOP.  This is currently necessary because the
1445      vectorizer does not handle induction variables that are used after the
1446      loop.  Such a situation occurs when the last iterations of LOOP are
1447      peeled, because:
1448      1. We introduced new uses after LOOP for IVs that were not originally used
1449         after LOOP: the IVs of LOOP are now used by an epilog loop.
1450      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1451         times, whereas the loop IVs should be bumped N times.
1452
1453      Input:
1454      - LOOP - a loop that is going to be vectorized. The last few iterations
1455               of LOOP were peeled.
1456      - NITERS - the number of iterations that LOOP executes (before it is
1457                 vectorized). i.e, the number of times the ivs should be bumped.
1458      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1459                   coming out from LOOP on which there are uses of the LOOP ivs
1460                   (this is the path from LOOP->exit to epilog_loop->preheader).
1461
1462                   The new definitions of the ivs are placed in LOOP->exit.
1463                   The phi args associated with the edge UPDATE_E in the bb
1464                   UPDATE_E->dest are updated accordingly.
1465
1466      Assumption 1: Like the rest of the vectorizer, this function assumes
1467      a single loop exit that has a single predecessor.
1468
1469      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1470      organized in the same order.
1471
1472      Assumption 3: The access function of the ivs is simple enough (see
1473      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1474
1475      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1476      coming out of LOOP on which the ivs of LOOP are used (this is the path
1477      that leads to the epilog loop; other paths skip the epilog loop).  This
1478      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1479      needs to have its phis updated.
1480  */
1481
1482 static void
1483 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1484                                   tree niters, edge update_e)
1485 {
1486   gphi_iterator gsi, gsi1;
1487   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1488   basic_block update_bb = update_e->dest;
1489   basic_block exit_bb = single_exit (loop)->dest;
1490
1491   /* Make sure there exists a single-predecessor exit bb:  */
1492   gcc_assert (single_pred_p (exit_bb));
1493   gcc_assert (single_succ_edge (exit_bb) == update_e);
1494
1495   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1496        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1497        gsi_next (&gsi), gsi_next (&gsi1))
1498     {
1499       tree init_expr;
1500       tree step_expr, off;
1501       tree type;
1502       tree var, ni, ni_name;
1503       gimple_stmt_iterator last_gsi;
1504
1505       gphi *phi = gsi.phi ();
1506       gphi *phi1 = gsi1.phi ();
1507       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1508       if (dump_enabled_p ())
1509         {
1510           dump_printf_loc (MSG_NOTE, vect_location,
1511                            "vect_update_ivs_after_vectorizer: phi: ");
1512           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1513         }
1514
1515       /* Skip reduction and virtual phis.  */
1516       if (!iv_phi_p (phi_info))
1517         {
1518           if (dump_enabled_p ())
1519             dump_printf_loc (MSG_NOTE, vect_location,
1520                              "reduc or virtual phi. skip.\n");
1521           continue;
1522         }
1523
1524       type = TREE_TYPE (gimple_phi_result (phi));
1525       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1526       step_expr = unshare_expr (step_expr);
1527
1528       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1529          of degree >= 2 or exponential.  */
1530       gcc_assert (!tree_is_chrec (step_expr));
1531
1532       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1533
1534       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1535                          fold_convert (TREE_TYPE (step_expr), niters),
1536                          step_expr);
1537       if (POINTER_TYPE_P (type))
1538         ni = fold_build_pointer_plus (init_expr, off);
1539       else
1540         ni = fold_build2 (PLUS_EXPR, type,
1541                           init_expr, fold_convert (type, off));
1542
1543       var = create_tmp_var (type, "tmp");
1544
1545       last_gsi = gsi_last_bb (exit_bb);
1546       gimple_seq new_stmts = NULL;
1547       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1548       /* Exit_bb shouldn't be empty.  */
1549       if (!gsi_end_p (last_gsi))
1550         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1551       else
1552         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1553
1554       /* Fix phi expressions in the successor bb.  */
1555       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1556     }
1557 }
1558
1559 /* Return a gimple value containing the misalignment (measured in vector
1560    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1561    it is away from a perfectly aligned address.  Add any new statements
1562    to SEQ.  */
1563
1564 static tree
1565 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1566 {
1567   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1568   stmt_vec_info stmt_info = dr_info->stmt;
1569   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1570
1571   unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1572   gcc_assert (target_align != 0);
1573
1574   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1575                                         size_zero_node) < 0;
1576   tree offset = (negative
1577                  ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1578                  : size_zero_node);
1579   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1580                                                           offset);
1581   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1582   tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1583   HOST_WIDE_INT elem_size
1584     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1585   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1586
1587   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1588   tree int_start_addr = fold_convert (type, start_addr);
1589   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1590                                         target_align_minus_1);
1591
1592   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1593   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1594                                         elem_size_log);
1595
1596   return misalign_in_elems;
1597 }
1598
1599 /* Function vect_gen_prolog_loop_niters
1600
1601    Generate the number of iterations which should be peeled as prolog for the
1602    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1603    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1604    As a result, after the execution of this loop, the data reference DR will
1605    refer to an aligned location.  The following computation is generated:
1606
1607    If the misalignment of DR is known at compile time:
1608      addr_mis = int mis = DR_MISALIGNMENT (dr);
1609    Else, compute address misalignment in bytes:
1610      addr_mis = addr & (target_align - 1)
1611
1612    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1613
1614    (elem_size = element type size; an element is the scalar element whose type
1615    is the inner type of the vectype)
1616
1617    The computations will be emitted at the end of BB.  We also compute and
1618    store upper bound (included) of the result in BOUND.
1619
1620    When the step of the data-ref in the loop is not 1 (as in interleaved data
1621    and SLP), the number of iterations of the prolog must be divided by the step
1622    (which is equal to the size of interleaved group).
1623
1624    The above formulas assume that VF == number of elements in the vector. This
1625    may not hold when there are multiple-types in the loop.
1626    In this case, for some data-references in the loop the VF does not represent
1627    the number of elements that fit in the vector.  Therefore, instead of VF we
1628    use TYPE_VECTOR_SUBPARTS.  */
1629
1630 static tree
1631 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1632                              basic_block bb, int *bound)
1633 {
1634   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1635   tree var;
1636   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1637   gimple_seq stmts = NULL, new_stmts = NULL;
1638   tree iters, iters_name;
1639   stmt_vec_info stmt_info = dr_info->stmt;
1640   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1641   unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1642
1643   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1644     {
1645       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1646
1647       if (dump_enabled_p ())
1648         dump_printf_loc (MSG_NOTE, vect_location,
1649                          "known peeling = %d.\n", npeel);
1650
1651       iters = build_int_cst (niters_type, npeel);
1652       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1653     }
1654   else
1655     {
1656       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1657       tree type = TREE_TYPE (misalign_in_elems);
1658       HOST_WIDE_INT elem_size
1659         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1660       HOST_WIDE_INT align_in_elems = target_align / elem_size;
1661       tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1662       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1663
1664       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1665                                  & (align_in_elems - 1)).  */
1666       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1667                                             size_zero_node) < 0;
1668       if (negative)
1669         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1670                              align_in_elems_tree);
1671       else
1672         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1673                              misalign_in_elems);
1674       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1675       iters = fold_convert (niters_type, iters);
1676       *bound = align_in_elems - 1;
1677     }
1678
1679   if (dump_enabled_p ())
1680     {
1681       dump_printf_loc (MSG_NOTE, vect_location,
1682                        "niters for prolog loop: ");
1683       dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1684       dump_printf (MSG_NOTE, "\n");
1685     }
1686
1687   var = create_tmp_var (niters_type, "prolog_loop_niters");
1688   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1689
1690   if (new_stmts)
1691     gimple_seq_add_seq (&stmts, new_stmts);
1692   if (stmts)
1693     {
1694       gcc_assert (single_succ_p (bb));
1695       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1696       if (gsi_end_p (gsi))
1697         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1698       else
1699         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1700     }
1701   return iters_name;
1702 }
1703
1704
1705 /* Function vect_update_init_of_dr
1706
1707    If CODE is PLUS, the vector loop starts NITERS iterations after the
1708    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1709    iterations before the scalar one (using masking to skip inactive
1710    elements).  This function updates the information recorded in DR to
1711    account for the difference.  Specifically, it updates the OFFSET
1712    field of DR.  */
1713
1714 static void
1715 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1716 {
1717   tree offset = DR_OFFSET (dr);
1718
1719   niters = fold_build2 (MULT_EXPR, sizetype,
1720                         fold_convert (sizetype, niters),
1721                         fold_convert (sizetype, DR_STEP (dr)));
1722   offset = fold_build2 (code, sizetype,
1723                         fold_convert (sizetype, offset), niters);
1724   DR_OFFSET (dr) = offset;
1725 }
1726
1727
1728 /* Function vect_update_inits_of_drs
1729
1730    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1731    CODE and NITERS are as for vect_update_inits_of_dr.  */
1732
1733 static void
1734 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1735                           tree_code code)
1736 {
1737   unsigned int i;
1738   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1739   struct data_reference *dr;
1740
1741   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1742
1743   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1744   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1745     {
1746       gimple_seq seq;
1747       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1748       tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1749
1750       niters = fold_convert (sizetype, niters);
1751       niters = force_gimple_operand (niters, &seq, false, var);
1752       if (seq)
1753         {
1754           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1755           gcc_assert (!new_bb);
1756         }
1757     }
1758
1759   FOR_EACH_VEC_ELT (datarefs, i, dr)
1760     {
1761       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1762       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1763         vect_update_init_of_dr (dr, niters, code);
1764     }
1765 }
1766
1767 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1768    by masking.  This involves calculating the number of iterations to
1769    be peeled and then aligning all memory references appropriately.  */
1770
1771 void
1772 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1773 {
1774   tree misalign_in_elems;
1775   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1776
1777   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1778
1779   /* From the information recorded in LOOP_VINFO get the number of iterations
1780      that need to be skipped via masking.  */
1781   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1782     {
1783       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1784                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1785       misalign_in_elems = build_int_cst (type, misalign);
1786     }
1787   else
1788     {
1789       gimple_seq seq1 = NULL, seq2 = NULL;
1790       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1791       misalign_in_elems = fold_convert (type, misalign_in_elems);
1792       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1793                                                 &seq2, true, NULL_TREE);
1794       gimple_seq_add_seq (&seq1, seq2);
1795       if (seq1)
1796         {
1797           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1798           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1799           gcc_assert (!new_bb);
1800         }
1801     }
1802
1803   if (dump_enabled_p ())
1804     {
1805       dump_printf_loc (MSG_NOTE, vect_location,
1806                        "misalignment for fully-masked loop: ");
1807       dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1808       dump_printf (MSG_NOTE, "\n");
1809     }
1810
1811   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1812
1813   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1814 }
1815
1816 /* This function builds ni_name = number of iterations.  Statements
1817    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1818    it to TRUE if new ssa_var is generated.  */
1819
1820 tree
1821 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1822 {
1823   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1824   if (TREE_CODE (ni) == INTEGER_CST)
1825     return ni;
1826   else
1827     {
1828       tree ni_name, var;
1829       gimple_seq stmts = NULL;
1830       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1831
1832       var = create_tmp_var (TREE_TYPE (ni), "niters");
1833       ni_name = force_gimple_operand (ni, &stmts, false, var);
1834       if (stmts)
1835         {
1836           gsi_insert_seq_on_edge_immediate (pe, stmts);
1837           if (new_var_p != NULL)
1838             *new_var_p = true;
1839         }
1840
1841       return ni_name;
1842     }
1843 }
1844
1845 /* Calculate the number of iterations above which vectorized loop will be
1846    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1847    of prolog loop.  If it's integer const, the integer number is also passed
1848    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1849    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1850    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1851    threshold below which the scalar (rather than vectorized) loop will be
1852    executed.  This function stores the upper bound (inclusive) of the result
1853    in BOUND_SCALAR.  */
1854
1855 static tree
1856 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1857                              int bound_prolog, poly_int64 bound_epilog, int th,
1858                              poly_uint64 *bound_scalar,
1859                              bool check_profitability)
1860 {
1861   tree type = TREE_TYPE (niters_prolog);
1862   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1863                              build_int_cst (type, bound_epilog));
1864
1865   *bound_scalar = bound_prolog + bound_epilog;
1866   if (check_profitability)
1867     {
1868       /* TH indicates the minimum niters of vectorized loop, while we
1869          compute the maximum niters of scalar loop.  */
1870       th--;
1871       /* Peeling for constant times.  */
1872       if (int_niters_prolog >= 0)
1873         {
1874           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1875           return build_int_cst (type, *bound_scalar);
1876         }
1877       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1878          and BOUND_EPILOG are inclusive upper bounds.  */
1879       if (known_ge (th, bound_prolog + bound_epilog))
1880         {
1881           *bound_scalar = th;
1882           return build_int_cst (type, th);
1883         }
1884       /* Need to do runtime comparison.  */
1885       else if (maybe_gt (th, bound_epilog))
1886         {
1887           *bound_scalar = upper_bound (*bound_scalar, th);
1888           return fold_build2 (MAX_EXPR, type,
1889                               build_int_cst (type, th), niters);
1890         }
1891     }
1892   return niters;
1893 }
1894
1895 /* NITERS is the number of times that the original scalar loop executes
1896    after peeling.  Work out the maximum number of iterations N that can
1897    be handled by the vectorized form of the loop and then either:
1898
1899    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1900
1901         niters_vector = N
1902
1903    b) set *STEP_VECTOR_PTR to one and generate:
1904
1905         niters_vector = N / vf
1906
1907    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1908    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1909    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1910
1911 void
1912 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1913                              tree *niters_vector_ptr, tree *step_vector_ptr,
1914                              bool niters_no_overflow)
1915 {
1916   tree ni_minus_gap, var;
1917   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1918   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1919   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1920   tree log_vf = NULL_TREE;
1921
1922   /* If epilogue loop is required because of data accesses with gaps, we
1923      subtract one iteration from the total number of iterations here for
1924      correct calculation of RATIO.  */
1925   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1926     {
1927       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1928                                   build_one_cst (type));
1929       if (!is_gimple_val (ni_minus_gap))
1930         {
1931           var = create_tmp_var (type, "ni_gap");
1932           gimple *stmts = NULL;
1933           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1934                                                true, var);
1935           gsi_insert_seq_on_edge_immediate (pe, stmts);
1936         }
1937     }
1938   else
1939     ni_minus_gap = niters;
1940
1941   unsigned HOST_WIDE_INT const_vf;
1942   if (vf.is_constant (&const_vf)
1943       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1944     {
1945       /* Create: niters >> log2(vf) */
1946       /* If it's known that niters == number of latch executions + 1 doesn't
1947          overflow, we can generate niters >> log2(vf); otherwise we generate
1948          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1949          will be at least one.  */
1950       log_vf = build_int_cst (type, exact_log2 (const_vf));
1951       if (niters_no_overflow)
1952         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1953       else
1954         niters_vector
1955           = fold_build2 (PLUS_EXPR, type,
1956                          fold_build2 (RSHIFT_EXPR, type,
1957                                       fold_build2 (MINUS_EXPR, type,
1958                                                    ni_minus_gap,
1959                                                    build_int_cst (type, vf)),
1960                                       log_vf),
1961                          build_int_cst (type, 1));
1962       step_vector = build_one_cst (type);
1963     }
1964   else
1965     {
1966       niters_vector = ni_minus_gap;
1967       step_vector = build_int_cst (type, vf);
1968     }
1969
1970   if (!is_gimple_val (niters_vector))
1971     {
1972       var = create_tmp_var (type, "bnd");
1973       gimple_seq stmts = NULL;
1974       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1975       gsi_insert_seq_on_edge_immediate (pe, stmts);
1976       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1977          we set range information to make niters analyzer's life easier.  */
1978       if (stmts != NULL && log_vf)
1979         set_range_info (niters_vector, VR_RANGE,
1980                         wi::to_wide (build_int_cst (type, 1)),
1981                         wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1982                                                   TYPE_MAX_VALUE (type),
1983                                                   log_vf)));
1984     }
1985   *niters_vector_ptr = niters_vector;
1986   *step_vector_ptr = step_vector;
1987
1988   return;
1989 }
1990
1991 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1992    loop specified by LOOP_VINFO after vectorization, compute the number
1993    of iterations before vectorization (niters_vector * vf) and store it
1994    to NITERS_VECTOR_MULT_VF_PTR.  */
1995
1996 static void
1997 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1998                                      tree niters_vector,
1999                                      tree *niters_vector_mult_vf_ptr)
2000 {
2001   /* We should be using a step_vector of VF if VF is variable.  */
2002   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2003   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2004   tree type = TREE_TYPE (niters_vector);
2005   tree log_vf = build_int_cst (type, exact_log2 (vf));
2006   basic_block exit_bb = single_exit (loop)->dest;
2007
2008   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2009   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2010                                             niters_vector, log_vf);
2011   if (!is_gimple_val (niters_vector_mult_vf))
2012     {
2013       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2014       gimple_seq stmts = NULL;
2015       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2016                                                     &stmts, true, var);
2017       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2018       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2019     }
2020   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2021 }
2022
2023 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2024    from SECOND/FIRST and puts it at the original loop's preheader/exit
2025    edge, the two loops are arranged as below:
2026
2027        preheader_a:
2028      first_loop:
2029        header_a:
2030          i_1 = PHI<i_0, i_2>;
2031          ...
2032          i_2 = i_1 + 1;
2033          if (cond_a)
2034            goto latch_a;
2035          else
2036            goto between_bb;
2037        latch_a:
2038          goto header_a;
2039
2040        between_bb:
2041          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2042
2043      second_loop:
2044        header_b:
2045          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2046                                  or with i_2 if no LCSSA phi is created
2047                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2048          ...
2049          i_4 = i_3 + 1;
2050          if (cond_b)
2051            goto latch_b;
2052          else
2053            goto exit_bb;
2054        latch_b:
2055          goto header_b;
2056
2057        exit_bb:
2058
2059    This function creates loop closed SSA for the first loop; update the
2060    second loop's PHI nodes by replacing argument on incoming edge with the
2061    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2062    is false, Loop closed ssa phis will only be created for non-iv phis for
2063    the first loop.
2064
2065    This function assumes exit bb of the first loop is preheader bb of the
2066    second loop, i.e, between_bb in the example code.  With PHIs updated,
2067    the second loop will execute rest iterations of the first.  */
2068
2069 static void
2070 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2071                                    struct loop *first, struct loop *second,
2072                                    bool create_lcssa_for_iv_phis)
2073 {
2074   gphi_iterator gsi_update, gsi_orig;
2075   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2076
2077   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2078   edge second_preheader_e = loop_preheader_edge (second);
2079   basic_block between_bb = single_exit (first)->dest;
2080
2081   gcc_assert (between_bb == second_preheader_e->src);
2082   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2083   /* Either the first loop or the second is the loop to be vectorized.  */
2084   gcc_assert (loop == first || loop == second);
2085
2086   for (gsi_orig = gsi_start_phis (first->header),
2087        gsi_update = gsi_start_phis (second->header);
2088        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2089        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2090     {
2091       gphi *orig_phi = gsi_orig.phi ();
2092       gphi *update_phi = gsi_update.phi ();
2093
2094       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2095       /* Generate lcssa PHI node for the first loop.  */
2096       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2097       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2098       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2099         {
2100           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2101           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2102           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2103           arg = new_res;
2104         }
2105
2106       /* Update PHI node in the second loop by replacing arg on the loop's
2107          incoming edge.  */
2108       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2109     }
2110 }
2111
2112 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2113    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2114    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2115    appear like below:
2116
2117        guard_bb:
2118          if (cond)
2119            goto merge_bb;
2120          else
2121            goto skip_loop;
2122
2123      skip_loop:
2124        header_a:
2125          i_1 = PHI<i_0, i_2>;
2126          ...
2127          i_2 = i_1 + 1;
2128          if (cond_a)
2129            goto latch_a;
2130          else
2131            goto exit_a;
2132        latch_a:
2133          goto header_a;
2134
2135        exit_a:
2136          i_5 = PHI<i_2>;
2137
2138        merge_bb:
2139          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2140
2141      update_loop:
2142        header_b:
2143          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2144          ...
2145          i_4 = i_3 + 1;
2146          if (cond_b)
2147            goto latch_b;
2148          else
2149            goto exit_bb;
2150        latch_b:
2151          goto header_b;
2152
2153        exit_bb:
2154
2155    This function creates PHI nodes at merge_bb and replaces the use of i_5
2156    in the update_loop's PHI node with the result of new PHI result.  */
2157
2158 static void
2159 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2160                                     struct loop *update_loop,
2161                                     edge guard_edge, edge merge_edge)
2162 {
2163   source_location merge_loc, guard_loc;
2164   edge orig_e = loop_preheader_edge (skip_loop);
2165   edge update_e = loop_preheader_edge (update_loop);
2166   gphi_iterator gsi_orig, gsi_update;
2167
2168   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2169         gsi_update = gsi_start_phis (update_loop->header));
2170        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2171        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2172     {
2173       gphi *orig_phi = gsi_orig.phi ();
2174       gphi *update_phi = gsi_update.phi ();
2175
2176       /* Generate new phi node at merge bb of the guard.  */
2177       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2178       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2179
2180       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2181          args in NEW_PHI for these edges.  */
2182       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2183       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2184       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2185       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2186       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2187       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2188
2189       /* Update phi in UPDATE_PHI.  */
2190       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2191     }
2192 }
2193
2194 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2195    this function searches for the corresponding lcssa phi node in exit
2196    bb of LOOP.  If it is found, return the phi result; otherwise return
2197    NULL.  */
2198
2199 static tree
2200 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2201                 gphi *lcssa_phi)
2202 {
2203   gphi_iterator gsi;
2204   edge e = single_exit (loop);
2205
2206   gcc_assert (single_pred_p (e->dest));
2207   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2208     {
2209       gphi *phi = gsi.phi ();
2210       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2211                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2212         return PHI_RESULT (phi);
2213     }
2214   return NULL_TREE;
2215 }
2216
2217 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2218    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2219    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2220    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2221    The CFG looks like:
2222
2223      loop:
2224        header_a:
2225          i_1 = PHI<i_0, i_2>;
2226          ...
2227          i_2 = i_1 + 1;
2228          if (cond_a)
2229            goto latch_a;
2230          else
2231            goto exit_a;
2232        latch_a:
2233          goto header_a;
2234
2235        exit_a:
2236
2237        guard_bb:
2238          if (cond)
2239            goto merge_bb;
2240          else
2241            goto epilog_loop;
2242
2243        ;; fall_through_bb
2244
2245      epilog_loop:
2246        header_b:
2247          i_3 = PHI<i_2, i_4>;
2248          ...
2249          i_4 = i_3 + 1;
2250          if (cond_b)
2251            goto latch_b;
2252          else
2253            goto merge_bb;
2254        latch_b:
2255          goto header_b;
2256
2257        merge_bb:
2258          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2259
2260        exit_bb:
2261          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2262
2263    For each name used out side EPILOG (i.e - for each name that has a lcssa
2264    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2265    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2266    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2267    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2268    in exit_bb will also be updated.  */
2269
2270 static void
2271 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2272                                     edge guard_edge, edge merge_edge)
2273 {
2274   gphi_iterator gsi;
2275   basic_block merge_bb = guard_edge->dest;
2276
2277   gcc_assert (single_succ_p (merge_bb));
2278   edge e = single_succ_edge (merge_bb);
2279   basic_block exit_bb = e->dest;
2280   gcc_assert (single_pred_p (exit_bb));
2281   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2282
2283   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2284     {
2285       gphi *update_phi = gsi.phi ();
2286       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2287       /* This loop-closed-phi actually doesn't represent a use out of the
2288          loop - the phi arg is a constant.  */
2289       if (TREE_CODE (old_arg) != SSA_NAME)
2290         continue;
2291
2292       tree merge_arg = get_current_def (old_arg);
2293       if (!merge_arg)
2294         merge_arg = old_arg;
2295
2296       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2297       /* If the var is live after loop but not a reduction, we simply
2298          use the old arg.  */
2299       if (!guard_arg)
2300         guard_arg = old_arg;
2301
2302       /* Create new phi node in MERGE_BB:  */
2303       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2304       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2305
2306       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2307          the two PHI args in merge_phi for these edges.  */
2308       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2309       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2310
2311       /* Update the original phi in exit_bb.  */
2312       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2313     }
2314 }
2315
2316 /* EPILOG loop is duplicated from the original loop for vectorizing,
2317    the arg of its loop closed ssa PHI needs to be updated.  */
2318
2319 static void
2320 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2321 {
2322   gphi_iterator gsi;
2323   basic_block exit_bb = single_exit (epilog)->dest;
2324
2325   gcc_assert (single_pred_p (exit_bb));
2326   edge e = EDGE_PRED (exit_bb, 0);
2327   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2328     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2329 }
2330
2331 /* Function vect_do_peeling.
2332
2333    Input:
2334    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2335
2336        preheader:
2337      LOOP:
2338        header_bb:
2339          loop_body
2340          if (exit_loop_cond) goto exit_bb
2341          else                goto header_bb
2342        exit_bb:
2343
2344    - NITERS: The number of iterations of the loop.
2345    - NITERSM1: The number of iterations of the loop's latch.
2346    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2347    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2348                               CHECK_PROFITABILITY is true.
2349    Output:
2350    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2351      iterate after vectorization; see vect_set_loop_condition for details.
2352    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2353      should be set to the number of scalar iterations handled by the
2354      vector loop.  The SSA name is only used on exit from the loop.
2355
2356    This function peels prolog and epilog from the loop, adds guards skipping
2357    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2358    would look like:
2359
2360        guard_bb_1:
2361          if (prefer_scalar_loop) goto merge_bb_1
2362          else                    goto guard_bb_2
2363
2364        guard_bb_2:
2365          if (skip_prolog) goto merge_bb_2
2366          else             goto prolog_preheader
2367
2368        prolog_preheader:
2369      PROLOG:
2370        prolog_header_bb:
2371          prolog_body
2372          if (exit_prolog_cond) goto prolog_exit_bb
2373          else                  goto prolog_header_bb
2374        prolog_exit_bb:
2375
2376        merge_bb_2:
2377
2378        vector_preheader:
2379      VECTOR LOOP:
2380        vector_header_bb:
2381          vector_body
2382          if (exit_vector_cond) goto vector_exit_bb
2383          else                  goto vector_header_bb
2384        vector_exit_bb:
2385
2386        guard_bb_3:
2387          if (skip_epilog) goto merge_bb_3
2388          else             goto epilog_preheader
2389
2390        merge_bb_1:
2391
2392        epilog_preheader:
2393      EPILOG:
2394        epilog_header_bb:
2395          epilog_body
2396          if (exit_epilog_cond) goto merge_bb_3
2397          else                  goto epilog_header_bb
2398
2399        merge_bb_3:
2400
2401    Note this function peels prolog and epilog only if it's necessary,
2402    as well as guards.
2403    Returns created epilogue or NULL.
2404
2405    TODO: Guard for prefer_scalar_loop should be emitted along with
2406    versioning conditions if loop versioning is needed.  */
2407
2408
2409 struct loop *
2410 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2411                  tree *niters_vector, tree *step_vector,
2412                  tree *niters_vector_mult_vf_var, int th,
2413                  bool check_profitability, bool niters_no_overflow)
2414 {
2415   edge e, guard_e;
2416   tree type = TREE_TYPE (niters), guard_cond;
2417   basic_block guard_bb, guard_to;
2418   profile_probability prob_prolog, prob_vector, prob_epilog;
2419   int estimated_vf;
2420   int prolog_peeling = 0;
2421   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2422     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2423
2424   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2425   poly_uint64 bound_epilog = 0;
2426   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2427       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2428     bound_epilog += vf - 1;
2429   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2430     bound_epilog += 1;
2431   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2432   poly_uint64 bound_scalar = bound_epilog;
2433
2434   if (!prolog_peeling && !epilog_peeling)
2435     return NULL;
2436
2437   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2438   estimated_vf = vect_vf_for_cost (loop_vinfo);
2439   if (estimated_vf == 2)
2440     estimated_vf = 3;
2441   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2442                         .apply_scale (estimated_vf - 1, estimated_vf);
2443
2444   struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2445   struct loop *first_loop = loop;
2446   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2447   create_lcssa_for_virtual_phi (loop);
2448   update_ssa (TODO_update_ssa_only_virtuals);
2449
2450   if (MAY_HAVE_DEBUG_BIND_STMTS)
2451     {
2452       gcc_assert (!adjust_vec.exists ());
2453       adjust_vec.create (32);
2454     }
2455   initialize_original_copy_tables ();
2456
2457   /* Record the anchor bb at which the guard should be placed if the scalar
2458      loop might be preferred.  */
2459   basic_block anchor = loop_preheader_edge (loop)->src;
2460
2461   /* Generate the number of iterations for the prolog loop.  We do this here
2462      so that we can also get the upper bound on the number of iterations.  */
2463   tree niters_prolog;
2464   int bound_prolog = 0;
2465   if (prolog_peeling)
2466     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2467                                                  &bound_prolog);
2468   else
2469     niters_prolog = build_int_cst (type, 0);
2470
2471   /* Prolog loop may be skipped.  */
2472   bool skip_prolog = (prolog_peeling != 0);
2473   /* Skip to epilog if scalar loop may be preferred.  It's only needed
2474      when we peel for epilog loop and when it hasn't been checked with
2475      loop versioning.  */
2476   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2477                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2478                                   bound_prolog + bound_epilog)
2479                       : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2480   /* Epilog loop must be executed if the number of iterations for epilog
2481      loop is known at compile time, otherwise we need to add a check at
2482      the end of vector loop and skip to the end of epilog loop.  */
2483   bool skip_epilog = (prolog_peeling < 0
2484                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2485                       || !vf.is_constant ());
2486   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2487   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2488     skip_epilog = false;
2489
2490   if (skip_vector)
2491     {
2492       split_edge (loop_preheader_edge (loop));
2493
2494       /* Due to the order in which we peel prolog and epilog, we first
2495          propagate probability to the whole loop.  The purpose is to
2496          avoid adjusting probabilities of both prolog and vector loops
2497          separately.  Note in this case, the probability of epilog loop
2498          needs to be scaled back later.  */
2499       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2500       if (prob_vector.initialized_p ())
2501         {
2502           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2503           scale_loop_profile (loop, prob_vector, 0);
2504         }
2505     }
2506
2507   dump_user_location_t loop_loc = find_loop_location (loop);
2508   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2509   if (prolog_peeling)
2510     {
2511       e = loop_preheader_edge (loop);
2512       if (!slpeel_can_duplicate_loop_p (loop, e))
2513         {
2514           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2515                            "loop can't be duplicated to preheader edge.\n");
2516           gcc_unreachable ();
2517         }
2518       /* Peel prolog and put it on preheader edge of loop.  */
2519       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2520       if (!prolog)
2521         {
2522           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2523                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2524           gcc_unreachable ();
2525         }
2526       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2527       first_loop = prolog;
2528       reset_original_copy_tables ();
2529
2530       /* Update the number of iterations for prolog loop.  */
2531       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2532       vect_set_loop_condition (prolog, NULL, niters_prolog,
2533                                step_prolog, NULL_TREE, false);
2534
2535       /* Skip the prolog loop.  */
2536       if (skip_prolog)
2537         {
2538           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2539                                     niters_prolog, build_int_cst (type, 0));
2540           guard_bb = loop_preheader_edge (prolog)->src;
2541           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2542           guard_to = split_edge (loop_preheader_edge (loop));
2543           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2544                                            guard_to, guard_bb,
2545                                            prob_prolog.invert (),
2546                                            irred_flag);
2547           e = EDGE_PRED (guard_to, 0);
2548           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2549           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2550
2551           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2552           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2553         }
2554       /* Update init address of DRs.  */
2555       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2556       /* Update niters for vector loop.  */
2557       LOOP_VINFO_NITERS (loop_vinfo)
2558         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2559       LOOP_VINFO_NITERSM1 (loop_vinfo)
2560         = fold_build2 (MINUS_EXPR, type,
2561                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2562       bool new_var_p = false;
2563       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2564       /* It's guaranteed that vector loop bound before vectorization is at
2565          least VF, so set range information for newly generated var.  */
2566       if (new_var_p)
2567         set_range_info (niters, VR_RANGE,
2568                         wi::to_wide (build_int_cst (type, vf)),
2569                         wi::to_wide (TYPE_MAX_VALUE (type)));
2570
2571       /* Prolog iterates at most bound_prolog times, latch iterates at
2572          most bound_prolog - 1 times.  */
2573       record_niter_bound (prolog, bound_prolog - 1, false, true);
2574       delete_update_ssa ();
2575       adjust_vec_debug_stmts ();
2576       scev_reset ();
2577     }
2578
2579   if (epilog_peeling)
2580     {
2581       e = single_exit (loop);
2582       if (!slpeel_can_duplicate_loop_p (loop, e))
2583         {
2584           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2585                            "loop can't be duplicated to exit edge.\n");
2586           gcc_unreachable ();
2587         }
2588       /* Peel epilog and put it on exit edge of loop.  */
2589       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2590       if (!epilog)
2591         {
2592           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2593                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2594           gcc_unreachable ();
2595         }
2596       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2597
2598       /* Scalar version loop may be preferred.  In this case, add guard
2599          and skip to epilog.  Note this only happens when the number of
2600          iterations of loop is unknown at compile time, otherwise this
2601          won't be vectorized.  */
2602       if (skip_vector)
2603         {
2604           /* Additional epilogue iteration is peeled if gap exists.  */
2605           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2606                                                 bound_prolog, bound_epilog,
2607                                                 th, &bound_scalar,
2608                                                 check_profitability);
2609           /* Build guard against NITERSM1 since NITERS may overflow.  */
2610           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2611           guard_bb = anchor;
2612           guard_to = split_edge (loop_preheader_edge (epilog));
2613           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2614                                            guard_to, guard_bb,
2615                                            prob_vector.invert (),
2616                                            irred_flag);
2617           e = EDGE_PRED (guard_to, 0);
2618           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2619           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2620
2621           /* Simply propagate profile info from guard_bb to guard_to which is
2622              a merge point of control flow.  */
2623           guard_to->count = guard_bb->count;
2624
2625           /* Scale probability of epilog loop back.
2626              FIXME: We should avoid scaling down and back up.  Profile may
2627              get lost if we scale down to 0.  */
2628           basic_block *bbs = get_loop_body (epilog);
2629           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2630             bbs[i]->count = bbs[i]->count.apply_scale
2631                                  (bbs[i]->count,
2632                                   bbs[i]->count.apply_probability
2633                                     (prob_vector));
2634           free (bbs);
2635         }
2636
2637       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2638       tree niters_vector_mult_vf;
2639       /* If loop is peeled for non-zero constant times, now niters refers to
2640          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2641          overflows.  */
2642       niters_no_overflow |= (prolog_peeling > 0);
2643       vect_gen_vector_loop_niters (loop_vinfo, niters,
2644                                    niters_vector, step_vector,
2645                                    niters_no_overflow);
2646       if (!integer_onep (*step_vector))
2647         {
2648           /* On exit from the loop we will have an easy way of calcalating
2649              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2650              until then.  */
2651           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2652           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2653           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2654         }
2655       else
2656         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2657                                              &niters_vector_mult_vf);
2658       /* Update IVs of original loop as if they were advanced by
2659          niters_vector_mult_vf steps.  */
2660       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2661       edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2662       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2663                                         update_e);
2664
2665       if (skip_epilog)
2666         {
2667           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2668                                     niters, niters_vector_mult_vf);
2669           guard_bb = single_exit (loop)->dest;
2670           guard_to = split_edge (single_exit (epilog));
2671           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2672                                            skip_vector ? anchor : guard_bb,
2673                                            prob_epilog.invert (),
2674                                            irred_flag);
2675           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2676                                               single_exit (epilog));
2677           /* Only need to handle basic block before epilog loop if it's not
2678              the guard_bb, which is the case when skip_vector is true.  */
2679           if (guard_bb != bb_before_epilog)
2680             {
2681               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2682
2683               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2684             }
2685           scale_loop_profile (epilog, prob_epilog, 0);
2686         }
2687       else
2688         slpeel_update_phi_nodes_for_lcssa (epilog);
2689
2690       unsigned HOST_WIDE_INT bound;
2691       if (bound_scalar.is_constant (&bound))
2692         {
2693           gcc_assert (bound != 0);
2694           /* -1 to convert loop iterations to latch iterations.  */
2695           record_niter_bound (epilog, bound - 1, false, true);
2696         }
2697
2698       delete_update_ssa ();
2699       adjust_vec_debug_stmts ();
2700       scev_reset ();
2701     }
2702   adjust_vec.release ();
2703   free_original_copy_tables ();
2704
2705   return epilog;
2706 }
2707
2708 /* Function vect_create_cond_for_niters_checks.
2709
2710    Create a conditional expression that represents the run-time checks for
2711    loop's niter.  The loop is guaranteed to terminate if the run-time
2712    checks hold.
2713
2714    Input:
2715    COND_EXPR  - input conditional expression.  New conditions will be chained
2716                 with logical AND operation.  If it is NULL, then the function
2717                 is used to return the number of alias checks.
2718    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2719                 to be checked.
2720
2721    Output:
2722    COND_EXPR - conditional expression.
2723
2724    The returned COND_EXPR is the conditional expression to be used in the
2725    if statement that controls which version of the loop gets executed at
2726    runtime.  */
2727
2728 static void
2729 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2730 {
2731   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2732
2733   if (*cond_expr)
2734     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2735                               *cond_expr, part_cond_expr);
2736   else
2737     *cond_expr = part_cond_expr;
2738 }
2739
2740 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2741    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2742
2743 static void
2744 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2745 {
2746   if (*cond_expr)
2747     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2748                               *cond_expr, part_cond_expr);
2749   else
2750     *cond_expr = part_cond_expr;
2751 }
2752
2753 /* Function vect_create_cond_for_align_checks.
2754
2755    Create a conditional expression that represents the alignment checks for
2756    all of data references (array element references) whose alignment must be
2757    checked at runtime.
2758
2759    Input:
2760    COND_EXPR  - input conditional expression.  New conditions will be chained
2761                 with logical AND operation.
2762    LOOP_VINFO - two fields of the loop information are used.
2763                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2764                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2765
2766    Output:
2767    COND_EXPR_STMT_LIST - statements needed to construct the conditional
2768                          expression.
2769    The returned value is the conditional expression to be used in the if
2770    statement that controls which version of the loop gets executed at runtime.
2771
2772    The algorithm makes two assumptions:
2773      1) The number of bytes "n" in a vector is a power of 2.
2774      2) An address "a" is aligned if a%n is zero and that this
2775         test can be done as a&(n-1) == 0.  For example, for 16
2776         byte vectors the test is a&0xf == 0.  */
2777
2778 static void
2779 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2780                                    tree *cond_expr,
2781                                    gimple_seq *cond_expr_stmt_list)
2782 {
2783   vec<stmt_vec_info> may_misalign_stmts
2784     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2785   stmt_vec_info stmt_info;
2786   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2787   tree mask_cst;
2788   unsigned int i;
2789   tree int_ptrsize_type;
2790   char tmp_name[20];
2791   tree or_tmp_name = NULL_TREE;
2792   tree and_tmp_name;
2793   gimple *and_stmt;
2794   tree ptrsize_zero;
2795   tree part_cond_expr;
2796
2797   /* Check that mask is one less than a power of 2, i.e., mask is
2798      all zeros followed by all ones.  */
2799   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2800
2801   int_ptrsize_type = signed_type_for (ptr_type_node);
2802
2803   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2804      of the first vector of the i'th data reference. */
2805
2806   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2807     {
2808       gimple_seq new_stmt_list = NULL;
2809       tree addr_base;
2810       tree addr_tmp_name;
2811       tree new_or_tmp_name;
2812       gimple *addr_stmt, *or_stmt;
2813       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2814       bool negative = tree_int_cst_compare
2815         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2816       tree offset = negative
2817         ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2818
2819       /* create: addr_tmp = (int)(address_of_first_vector) */
2820       addr_base =
2821         vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2822                                               offset);
2823       if (new_stmt_list != NULL)
2824         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2825
2826       sprintf (tmp_name, "addr2int%d", i);
2827       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2828       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2829       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2830
2831       /* The addresses are OR together.  */
2832
2833       if (or_tmp_name != NULL_TREE)
2834         {
2835           /* create: or_tmp = or_tmp | addr_tmp */
2836           sprintf (tmp_name, "orptrs%d", i);
2837           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2838           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2839                                          or_tmp_name, addr_tmp_name);
2840           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2841           or_tmp_name = new_or_tmp_name;
2842         }
2843       else
2844         or_tmp_name = addr_tmp_name;
2845
2846     } /* end for i */
2847
2848   mask_cst = build_int_cst (int_ptrsize_type, mask);
2849
2850   /* create: and_tmp = or_tmp & mask  */
2851   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2852
2853   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2854                                   or_tmp_name, mask_cst);
2855   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2856
2857   /* Make and_tmp the left operand of the conditional test against zero.
2858      if and_tmp has a nonzero bit then some address is unaligned.  */
2859   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2860   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2861                                 and_tmp_name, ptrsize_zero);
2862   chain_cond_expr (cond_expr, part_cond_expr);
2863 }
2864
2865 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2866    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2867    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2868    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
2869
2870 static void
2871 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2872 {
2873   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2874   unsigned int i;
2875   vec_object_pair *pair;
2876   FOR_EACH_VEC_ELT (pairs, i, pair)
2877     {
2878       tree addr1 = build_fold_addr_expr (pair->first);
2879       tree addr2 = build_fold_addr_expr (pair->second);
2880       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2881                                          addr1, addr2);
2882       chain_cond_expr (cond_expr, part_cond_expr);
2883     }
2884 }
2885
2886 /* Create an expression that is true when all lower-bound conditions for
2887    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
2888
2889 static void
2890 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2891 {
2892   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2893   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2894     {
2895       tree expr = lower_bounds[i].expr;
2896       tree type = unsigned_type_for (TREE_TYPE (expr));
2897       expr = fold_convert (type, expr);
2898       poly_uint64 bound = lower_bounds[i].min_value;
2899       if (!lower_bounds[i].unsigned_p)
2900         {
2901           expr = fold_build2 (PLUS_EXPR, type, expr,
2902                               build_int_cstu (type, bound - 1));
2903           bound += bound - 1;
2904         }
2905       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2906                                          build_int_cstu (type, bound));
2907       chain_cond_expr (cond_expr, part_cond_expr);
2908     }
2909 }
2910
2911 /* Function vect_create_cond_for_alias_checks.
2912
2913    Create a conditional expression that represents the run-time checks for
2914    overlapping of address ranges represented by a list of data references
2915    relations passed as input.
2916
2917    Input:
2918    COND_EXPR  - input conditional expression.  New conditions will be chained
2919                 with logical AND operation.  If it is NULL, then the function
2920                 is used to return the number of alias checks.
2921    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2922                 to be checked.
2923
2924    Output:
2925    COND_EXPR - conditional expression.
2926
2927    The returned COND_EXPR is the conditional expression to be used in the if
2928    statement that controls which version of the loop gets executed at runtime.
2929 */
2930
2931 void
2932 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2933 {
2934   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2935     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2936
2937   if (comp_alias_ddrs.is_empty ())
2938     return;
2939
2940   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2941                                &comp_alias_ddrs, cond_expr);
2942   if (dump_enabled_p ())
2943     dump_printf_loc (MSG_NOTE, vect_location,
2944                      "created %u versioning for alias checks.\n",
2945                      comp_alias_ddrs.length ());
2946 }
2947
2948
2949 /* Function vect_loop_versioning.
2950
2951    If the loop has data references that may or may not be aligned or/and
2952    has data reference relations whose independence was not proven then
2953    two versions of the loop need to be generated, one which is vectorized
2954    and one which isn't.  A test is then generated to control which of the
2955    loops is executed.  The test checks for the alignment of all of the
2956    data references that may or may not be aligned.  An additional
2957    sequence of runtime tests is generated for each pairs of DDRs whose
2958    independence was not proven.  The vectorized version of loop is
2959    executed only if both alias and alignment tests are passed.
2960
2961    The test generated to check which version of loop is executed
2962    is modified to also check for profitability as indicated by the
2963    cost model threshold TH.
2964
2965    The versioning precondition(s) are placed in *COND_EXPR and
2966    *COND_EXPR_STMT_LIST.  */
2967
2968 void
2969 vect_loop_versioning (loop_vec_info loop_vinfo,
2970                       unsigned int th, bool check_profitability,
2971                       poly_uint64 versioning_threshold)
2972 {
2973   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2974   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2975   basic_block condition_bb;
2976   gphi_iterator gsi;
2977   gimple_stmt_iterator cond_exp_gsi;
2978   basic_block merge_bb;
2979   basic_block new_exit_bb;
2980   edge new_exit_e, e;
2981   gphi *orig_phi, *new_phi;
2982   tree cond_expr = NULL_TREE;
2983   gimple_seq cond_expr_stmt_list = NULL;
2984   tree arg;
2985   profile_probability prob = profile_probability::likely ();
2986   gimple_seq gimplify_stmt_list = NULL;
2987   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2988   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2989   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2990   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2991
2992   if (check_profitability)
2993     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2994                              build_int_cst (TREE_TYPE (scalar_loop_iters),
2995                                             th - 1));
2996   if (maybe_ne (versioning_threshold, 0U))
2997     {
2998       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2999                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3000                                               versioning_threshold - 1));
3001       if (cond_expr)
3002         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3003                                  expr, cond_expr);
3004       else
3005         cond_expr = expr;
3006     }
3007
3008   if (version_niter)
3009     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3010
3011   if (cond_expr)
3012     cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3013                                         is_gimple_condexpr, NULL_TREE);
3014
3015   if (version_align)
3016     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3017                                        &cond_expr_stmt_list);
3018
3019   if (version_alias)
3020     {
3021       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3022       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3023       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3024     }
3025
3026   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3027                                       &gimplify_stmt_list,
3028                                       is_gimple_condexpr, NULL_TREE);
3029   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3030
3031   initialize_original_copy_tables ();
3032   if (scalar_loop)
3033     {
3034       edge scalar_e;
3035       basic_block preheader, scalar_preheader;
3036
3037       /* We don't want to scale SCALAR_LOOP's frequencies, we need to
3038          scale LOOP's frequencies instead.  */
3039       nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3040                             prob, prob.invert (), prob, prob.invert (), true);
3041       scale_loop_frequencies (loop, prob);
3042       /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3043          while we need to move it above LOOP's preheader.  */
3044       e = loop_preheader_edge (loop);
3045       scalar_e = loop_preheader_edge (scalar_loop);
3046       /* The vector loop preheader might not be empty, since new
3047          invariants could have been created while analyzing the loop.  */
3048       gcc_assert (single_pred_p (e->src));
3049       gcc_assert (empty_block_p (scalar_e->src)
3050                   && single_pred_p (scalar_e->src));
3051       gcc_assert (single_pred_p (condition_bb));
3052       preheader = e->src;
3053       scalar_preheader = scalar_e->src;
3054       scalar_e = find_edge (condition_bb, scalar_preheader);
3055       e = single_pred_edge (preheader);
3056       redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3057                                       scalar_preheader);
3058       redirect_edge_and_branch_force (scalar_e, preheader);
3059       redirect_edge_and_branch_force (e, condition_bb);
3060       set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3061                                single_pred (condition_bb));
3062       set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3063                                single_pred (scalar_preheader));
3064       set_immediate_dominator (CDI_DOMINATORS, preheader,
3065                                condition_bb);
3066     }
3067   else
3068     nloop = loop_version (loop, cond_expr, &condition_bb,
3069                           prob, prob.invert (), prob, prob.invert (), true);
3070
3071   if (version_niter)
3072     {
3073       /* The versioned loop could be infinite, we need to clear existing
3074          niter information which is copied from the original loop.  */
3075       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3076       vect_free_loop_info_assumptions (nloop);
3077       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3078       loop_constraint_set (loop, LOOP_C_INFINITE);
3079     }
3080
3081   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3082       && dump_enabled_p ())
3083     {
3084       if (version_alias)
3085         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3086                          "loop versioned for vectorization because of "
3087                          "possible aliasing\n");
3088       if (version_align)
3089         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3090                          "loop versioned for vectorization to enhance "
3091                          "alignment\n");
3092
3093     }
3094   free_original_copy_tables ();
3095
3096   /* Loop versioning violates an assumption we try to maintain during
3097      vectorization - that the loop exit block has a single predecessor.
3098      After versioning, the exit block of both loop versions is the same
3099      basic block (i.e. it has two predecessors). Just in order to simplify
3100      following transformations in the vectorizer, we fix this situation
3101      here by adding a new (empty) block on the exit-edge of the loop,
3102      with the proper loop-exit phis to maintain loop-closed-form.
3103      If loop versioning wasn't done from loop, but scalar_loop instead,
3104      merge_bb will have already just a single successor.  */
3105
3106   merge_bb = single_exit (loop)->dest;
3107   if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3108     {
3109       gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3110       new_exit_bb = split_edge (single_exit (loop));
3111       new_exit_e = single_exit (loop);
3112       e = EDGE_SUCC (new_exit_bb, 0);
3113
3114       for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3115         {
3116           tree new_res;
3117           orig_phi = gsi.phi ();
3118           new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3119           new_phi = create_phi_node (new_res, new_exit_bb);
3120           arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3121           add_phi_arg (new_phi, arg, new_exit_e,
3122                        gimple_phi_arg_location_from_edge (orig_phi, e));
3123           adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3124         }
3125     }
3126
3127   /* End loop-exit-fixes after versioning.  */
3128
3129   if (cond_expr_stmt_list)
3130     {
3131       cond_exp_gsi = gsi_last_bb (condition_bb);
3132       gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3133                              GSI_SAME_STMT);
3134     }
3135   update_ssa (TODO_update_ssa);
3136 }