gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53 #include "langhooks.h"
54 #include "tree-vector-builder.h"
55 #include "optabs-tree.h"
57 /*************************************************************************
58 Simple Loop Peeling Utilities
60 Utilities to support loop peeling for vectorization purposes.
61 *************************************************************************/
64 /* Renames the use *OP_P. */
66 static void
67 rename_use_op (use_operand_p op_p)
69 tree new_name;
71 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
72 return;
74 new_name = get_current_def (USE_FROM_PTR (op_p));
76 /* Something defined outside of the loop. */
77 if (!new_name)
78 return;
80 /* An ordinary ssa name defined in the loop. */
82 SET_USE (op_p, new_name);
86 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
87 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
88 true. */
90 static void
91 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
93 gimple *stmt;
94 use_operand_p use_p;
95 ssa_op_iter iter;
96 edge e;
97 edge_iterator ei;
98 class loop *loop = bb->loop_father;
99 class loop *outer_loop = NULL;
101 if (rename_from_outer_loop)
103 gcc_assert (loop);
104 outer_loop = loop_outer (loop);
107 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
108 gsi_next (&gsi))
110 stmt = gsi_stmt (gsi);
111 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
112 rename_use_op (use_p);
115 FOR_EACH_EDGE (e, ei, bb->preds)
117 if (!flow_bb_inside_loop_p (loop, e->src))
119 if (!rename_from_outer_loop)
120 continue;
121 if (e->src != outer_loop->header)
123 if (outer_loop->inner->next)
125 /* If outer_loop has 2 inner loops, allow there to
126 be an extra basic block which uses LOOP_VECTORIZED to
127 decide which of the two loops to use. */
128 if (!single_pred_p (e->src)
129 || single_pred (e->src) != outer_loop->header)
130 continue;
134 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
135 gsi_next (&gsi))
136 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
141 struct adjust_info
143 tree from, to;
144 basic_block bb;
147 /* A stack of values to be adjusted in debug stmts. We have to
148 process them LIFO, so that the closest substitution applies. If we
149 processed them FIFO, without the stack, we might substitute uses
150 with a PHI DEF that would soon become non-dominant, and when we got
151 to the suitable one, it wouldn't have anything to substitute any
152 more. */
153 static vec<adjust_info, va_heap> adjust_vec;
155 /* Adjust any debug stmts that referenced AI->from values to use the
156 loop-closed AI->to, if the references are dominated by AI->bb and
157 not by the definition of AI->from. */
159 static void
160 adjust_debug_stmts_now (adjust_info *ai)
162 basic_block bbphi = ai->bb;
163 tree orig_def = ai->from;
164 tree new_def = ai->to;
165 imm_use_iterator imm_iter;
166 gimple *stmt;
167 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
169 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
171 /* Adjust any debug stmts that held onto non-loop-closed
172 references. */
173 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
175 use_operand_p use_p;
176 basic_block bbuse;
178 if (!is_gimple_debug (stmt))
179 continue;
181 gcc_assert (gimple_debug_bind_p (stmt));
183 bbuse = gimple_bb (stmt);
185 if ((bbuse == bbphi
186 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
187 && !(bbuse == bbdef
188 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
190 if (new_def)
191 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
192 SET_USE (use_p, new_def);
193 else
195 gimple_debug_bind_reset_value (stmt);
196 update_stmt (stmt);
202 /* Adjust debug stmts as scheduled before. */
204 static void
205 adjust_vec_debug_stmts (void)
207 if (!MAY_HAVE_DEBUG_BIND_STMTS)
208 return;
210 gcc_assert (adjust_vec.exists ());
212 while (!adjust_vec.is_empty ())
214 adjust_debug_stmts_now (&adjust_vec.last ());
215 adjust_vec.pop ();
219 /* Adjust any debug stmts that referenced FROM values to use the
220 loop-closed TO, if the references are dominated by BB and not by
221 the definition of FROM. If adjust_vec is non-NULL, adjustments
222 will be postponed until adjust_vec_debug_stmts is called. */
224 static void
225 adjust_debug_stmts (tree from, tree to, basic_block bb)
227 adjust_info ai;
229 if (MAY_HAVE_DEBUG_BIND_STMTS
230 && TREE_CODE (from) == SSA_NAME
231 && ! SSA_NAME_IS_DEFAULT_DEF (from)
232 && ! virtual_operand_p (from))
234 ai.from = from;
235 ai.to = to;
236 ai.bb = bb;
238 if (adjust_vec.exists ())
239 adjust_vec.safe_push (ai);
240 else
241 adjust_debug_stmts_now (&ai);
245 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
246 to adjust any debug stmts that referenced the old phi arg,
247 presumably non-loop-closed references left over from other
248 transformations. */
250 static void
251 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
253 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
255 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
257 if (MAY_HAVE_DEBUG_BIND_STMTS)
258 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
259 gimple_bb (update_phi));
262 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
263 that the control should have during the first iteration and NEXT_CTRL is the
264 value that it should have on subsequent iterations. */
266 static void
267 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
268 tree next_ctrl)
270 gphi *phi = create_phi_node (ctrl, loop->header);
271 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
272 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
275 /* Add SEQ to the end of LOOP's preheader block. */
277 static void
278 add_preheader_seq (class loop *loop, gimple_seq seq)
280 if (seq)
282 edge pe = loop_preheader_edge (loop);
283 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
284 gcc_assert (!new_bb);
288 /* Add SEQ to the beginning of LOOP's header block. */
290 static void
291 add_header_seq (class loop *loop, gimple_seq seq)
293 if (seq)
295 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
296 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
300 /* Return true if the target can interleave elements of two vectors.
301 OFFSET is 0 if the first half of the vectors should be interleaved
302 or 1 if the second half should. When returning true, store the
303 associated permutation in INDICES. */
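/* For instance (illustration only): with 4-element vectors the permutation
   built below is { 0, 4, 1, 5 } for OFFSET 0 and { 2, 6, 3, 7 } for OFFSET 1,
   i.e. the usual low/high interleave of the two input vectors.  */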
305 static bool
306 interleave_supported_p (vec_perm_indices *indices, tree vectype,
307 unsigned int offset)
309 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
310 poly_uint64 base = exact_div (nelts, 2) * offset;
311 vec_perm_builder sel (nelts, 2, 3);
312 for (unsigned int i = 0; i < 3; ++i)
314 sel.quick_push (base + i);
315 sel.quick_push (base + i + nelts);
317 indices->new_vector (sel, 2, nelts);
318 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
319 *indices);
322 /* Try to use permutes to define the masks in DEST_RGM using the masks
323 in SRC_RGM, given that the former has twice as many masks as the
324 latter. Return true on success, adding any new statements to SEQ. */
326 static bool
327 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
328 rgroup_controls *src_rgm)
330 tree src_masktype = src_rgm->type;
331 tree dest_masktype = dest_rgm->type;
332 machine_mode src_mode = TYPE_MODE (src_masktype);
333 insn_code icode1, icode2;
334 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
335 && (icode1 = optab_handler (vec_unpacku_hi_optab,
336 src_mode)) != CODE_FOR_nothing
337 && (icode2 = optab_handler (vec_unpacku_lo_optab,
338 src_mode)) != CODE_FOR_nothing)
340 /* Unpacking the source masks gives at least as many mask bits as
341 we need. We can then VIEW_CONVERT any excess bits away. */
342 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
343 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
344 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
345 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
347 tree src = src_rgm->controls[i / 2];
348 tree dest = dest_rgm->controls[i];
349 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
350 ? VEC_UNPACK_HI_EXPR
351 : VEC_UNPACK_LO_EXPR);
352 gassign *stmt;
353 if (dest_masktype == unpack_masktype)
354 stmt = gimple_build_assign (dest, code, src);
355 else
357 tree temp = make_ssa_name (unpack_masktype);
358 stmt = gimple_build_assign (temp, code, src);
359 gimple_seq_add_stmt (seq, stmt);
360 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
361 build1 (VIEW_CONVERT_EXPR,
362 dest_masktype, temp));
364 gimple_seq_add_stmt (seq, stmt);
366 return true;
368 vec_perm_indices indices[2];
369 if (dest_masktype == src_masktype
370 && interleave_supported_p (&indices[0], src_masktype, 0)
371 && interleave_supported_p (&indices[1], src_masktype, 1))
373 /* The destination requires twice as many mask bits as the source, so
374 we can use interleaving permutes to double up the number of bits. */
375 tree masks[2];
376 for (unsigned int i = 0; i < 2; ++i)
377 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
378 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
380 tree src = src_rgm->controls[i / 2];
381 tree dest = dest_rgm->controls[i];
382 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
383 src, src, masks[i & 1]);
384 gimple_seq_add_stmt (seq, stmt);
386 return true;
388 return false;
391 /* Populate DEST_RGM->controls, given that they should add up to STEP.
393 STEP = MIN_EXPR <ivtmp_34, VF>;
395 First length (MIN (X, VF/N)):
396 loop_len_15 = MIN_EXPR <STEP, VF/N>;
398 Second length:
399 tmp = STEP - loop_len_15;
400 loop_len_16 = MIN (tmp, VF/N);
402 Third length:
403 tmp2 = tmp - loop_len_16;
404 loop_len_17 = MIN (tmp2, VF/N);
406 Last length:
407 loop_len_18 = tmp2 - loop_len_17;
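/* As a worked illustration (numbers chosen for this example only): with
   VF/N = 4 and STEP = 10 the four lengths above become MIN (10, 4) = 4,
   then MIN (10 - 4, 4) = 4, then MIN (6 - 4, 4) = 2, and finally 2 - 2 = 0,
   which sum to STEP as required.  */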
410 static void
411 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
412 rgroup_controls *dest_rgm, tree step)
414 tree ctrl_type = dest_rgm->type;
415 poly_uint64 nitems_per_ctrl
416 = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
417 tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
419 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
421 tree ctrl = dest_rgm->controls[i];
422 if (i == 0)
424 /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
425 gassign *assign
426 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
427 gimple_seq_add_stmt (seq, assign);
429 else if (i == dest_rgm->controls.length () - 1)
431 /* Last iteration: the remainder, capped to the range [0, VF/N]. */
432 gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
433 dest_rgm->controls[i - 1]);
434 gimple_seq_add_stmt (seq, assign);
436 else
438 /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
439 step = gimple_build (seq, MINUS_EXPR, iv_type, step,
440 dest_rgm->controls[i - 1]);
441 gassign *assign
442 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
443 gimple_seq_add_stmt (seq, assign);
448 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
449 for all the rgroup controls in RGC and return a control that is nonzero
450 when the loop needs to iterate. Add any new preheader statements to
451 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
453 RGC belongs to loop LOOP. The loop originally iterated NITERS
454 times and has been vectorized according to LOOP_VINFO.
456 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
457 starts with NITERS_SKIP dummy iterations of the scalar loop before
458 the real work starts. The mask elements for these dummy iterations
459 must be 0, to ensure that the extra iterations do not have an effect.
461 It is known that:
463 NITERS * RGC->max_nscalars_per_iter * RGC->factor
465 does not overflow. However, MIGHT_WRAP_P says whether an induction
466 variable that starts at 0 and has step:
468 VF * RGC->max_nscalars_per_iter * RGC->factor
470 might overflow before hitting a value above:
472 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
474 This means that we cannot guarantee that such an induction variable
475 would ever hit a value that produces a set of all-false masks or zero
476 lengths for RGC.
478 Note: the cost of the code generated by this function is modeled
479 by vect_estimate_min_profitable_iters, so changes here may need
480 corresponding changes there. */
482 static tree
483 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
484 gimple_seq *preheader_seq,
485 gimple_seq *header_seq,
486 gimple_stmt_iterator loop_cond_gsi,
487 rgroup_controls *rgc, tree niters,
488 tree niters_skip, bool might_wrap_p,
489 tree *iv_step, tree *compare_step)
491 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
492 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
493 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
495 tree ctrl_type = rgc->type;
496 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
497 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
498 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
499 tree length_limit = NULL_TREE;
500 /* For lengths, we need length_limit to keep each length in range. */
501 if (!use_masks_p)
502 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
504 /* Calculate the maximum number of item values that the rgroup
505 handles in total, the number that it handles for each iteration
506 of the vector loop, and the number that it should skip during the
507 first iteration of the vector loop. */
508 tree nitems_total = niters;
509 tree nitems_step = build_int_cst (iv_type, vf);
510 tree nitems_skip = niters_skip;
511 if (nitems_per_iter != 1)
513 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
514 these multiplications don't overflow. */
515 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
516 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
517 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
518 nitems_total, compare_factor);
519 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
520 nitems_step, iv_factor);
521 if (nitems_skip)
522 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
523 nitems_skip, compare_factor);
526 /* Create an induction variable that counts the number of items
527 processed. */
528 tree index_before_incr, index_after_incr;
529 gimple_stmt_iterator incr_gsi;
530 bool insert_after;
531 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
532 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
534 /* Create an IV that counts down from niters_total and whose step
535 is the (variable) amount processed in the current iteration:
537 _10 = (unsigned long) count_12(D);
539 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
540 _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
542 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
544 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
546 if (ivtmp_9 > POLY_INT_CST [4, 4])
547 goto <bb 4>; [83.33%]
548 else
549 goto <bb 5>; [16.67%]
551 nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
552 tree step = rgc->controls.length () == 1 ? rgc->controls[0]
553 : make_ssa_name (iv_type);
554 /* Create decrement IV. */
555 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
557 create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
558 insert_after, &index_before_incr, &index_after_incr);
559 tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
560 index_before_incr, nitems_step);
561 gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
563 else
565 create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
566 &incr_gsi, insert_after, &index_before_incr,
567 &index_after_incr);
568 gimple_seq_add_stmt (header_seq,
569 gimple_build_assign (step, MIN_EXPR,
570 index_before_incr,
571 nitems_step));
573 *iv_step = step;
574 *compare_step = nitems_step;
575 return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
576 : index_before_incr;
579 /* Create increment IV. */
580 create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
581 loop, &incr_gsi, insert_after, &index_before_incr,
582 &index_after_incr);
584 tree zero_index = build_int_cst (compare_type, 0);
585 tree test_index, test_limit, first_limit;
586 gimple_stmt_iterator *test_gsi;
587 if (might_wrap_p)
589 /* In principle the loop should stop iterating once the incremented
590 IV reaches a value greater than or equal to:
592 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
594 However, there's no guarantee that this addition doesn't overflow
595 the comparison type, or that the IV hits a value above it before
596 wrapping around. We therefore adjust the limit down by one
597 IV step:
599 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
600 -[infinite-prec] NITEMS_STEP
602 and compare the IV against this limit _before_ incrementing it.
603 Since the comparison type is unsigned, we actually want the
604 subtraction to saturate at zero:
606 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
607 -[sat] NITEMS_STEP
609 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
611 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
613 where the rightmost subtraction can be done directly in
614 COMPARE_TYPE. */
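/* For example (illustrative values only): with NITEMS_TOTAL = 10,
   NITEMS_SKIP = 2 and NITEMS_STEP = 8 the infinite-precision limit is
   (10 + 2) - 8 = 4, and the reassociated saturating form gives the
   same value: 10 -[sat] (8 - 2) = 4.  */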
615 test_index = index_before_incr;
616 tree adjust = gimple_convert (preheader_seq, compare_type,
617 nitems_step);
618 if (nitems_skip)
619 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
620 adjust, nitems_skip);
621 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
622 nitems_total, adjust);
623 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
624 test_limit, adjust);
625 test_gsi = &incr_gsi;
627 /* Get a safe limit for the first iteration. */
628 if (nitems_skip)
630 /* The first vector iteration can handle at most NITEMS_STEP
631 items. NITEMS_STEP <= CONST_LIMIT, and adding
632 NITEMS_SKIP to that cannot overflow. */
633 tree const_limit = build_int_cst (compare_type,
634 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
635 * nitems_per_iter);
636 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
637 nitems_total, const_limit);
638 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
639 first_limit, nitems_skip);
641 else
642 /* For the first iteration it doesn't matter whether the IV hits
643 a value above NITEMS_TOTAL. That only matters for the latch
644 condition. */
645 first_limit = nitems_total;
647 else
649 /* Test the incremented IV, which will always hit a value above
650 the bound before wrapping. */
651 test_index = index_after_incr;
652 test_limit = nitems_total;
653 if (nitems_skip)
654 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
655 test_limit, nitems_skip);
656 test_gsi = &loop_cond_gsi;
658 first_limit = test_limit;
661 /* Convert the IV value to the comparison type (either a no-op or
662 a demotion). */
663 gimple_seq test_seq = NULL;
664 test_index = gimple_convert (&test_seq, compare_type, test_index);
665 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
667 /* Provide a definition of each control in the group. */
668 tree next_ctrl = NULL_TREE;
669 tree ctrl;
670 unsigned int i;
671 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
673 /* Previous controls will cover BIAS items. This control covers the
674 next batch. */
675 poly_uint64 bias = nitems_per_ctrl * i;
676 tree bias_tree = build_int_cst (compare_type, bias);
678 /* See whether the first iteration of the vector loop is known
679 to have a full control. */
680 poly_uint64 const_limit;
681 bool first_iteration_full
682 = (poly_int_tree_p (first_limit, &const_limit)
683 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
685 /* Rather than have a new IV that starts at BIAS and goes up to
686 TEST_LIMIT, prefer to use the same 0-based IV for each control
687 and adjust the bound down by BIAS. */
688 tree this_test_limit = test_limit;
689 if (i != 0)
691 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
692 compare_type, this_test_limit,
693 bias_tree);
694 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
695 compare_type, this_test_limit,
696 bias_tree);
699 /* Create the initial control. First include all items that
700 are within the loop limit. */
701 tree init_ctrl = NULL_TREE;
702 if (!first_iteration_full)
704 tree start, end;
705 if (first_limit == test_limit)
707 /* Use a natural test between zero (the initial IV value)
708 and the loop limit. The "else" block would be valid too,
709 but this choice can avoid the need to load BIAS_TREE into
710 a register. */
711 start = zero_index;
712 end = this_test_limit;
714 else
716 /* FIRST_LIMIT is the maximum number of items handled by the
717 first iteration of the vector loop. Test the portion
718 associated with this control. */
719 start = bias_tree;
720 end = first_limit;
723 if (use_masks_p)
724 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
725 start, end, "max_mask");
726 else
728 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
729 gimple_seq seq = vect_gen_len (init_ctrl, start,
730 end, length_limit);
731 gimple_seq_add_seq (preheader_seq, seq);
735 /* Now AND out the bits that are within the number of skipped
736 items. */
737 poly_uint64 const_skip;
738 if (nitems_skip
739 && !(poly_int_tree_p (nitems_skip, &const_skip)
740 && known_le (const_skip, bias)))
742 gcc_assert (use_masks_p);
743 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
744 bias_tree, nitems_skip);
745 if (init_ctrl)
746 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
747 init_ctrl, unskipped_mask);
748 else
749 init_ctrl = unskipped_mask;
752 if (!init_ctrl)
754 /* First iteration is full. */
755 if (use_masks_p)
756 init_ctrl = build_minus_one_cst (ctrl_type);
757 else
758 init_ctrl = length_limit;
761 /* Get the control value for the next iteration of the loop. */
762 if (use_masks_p)
764 gimple_seq stmts = NULL;
765 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
766 this_test_limit, "next_mask");
767 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
769 else
771 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
772 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
773 length_limit);
774 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
777 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
780 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
781 if (partial_load_bias != 0)
783 tree adjusted_len = rgc->bias_adjusted_ctrl;
784 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
785 rgc->controls[0],
786 build_int_cst
787 (TREE_TYPE (rgc->controls[0]),
788 partial_load_bias));
789 gimple_seq_add_stmt (header_seq, minus);
792 return next_ctrl;
795 /* Set up the iteration condition and rgroup controls for LOOP, given
796 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
797 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
798 the number of iterations of the original scalar loop that should be
799 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
800 for vect_set_loop_condition.
802 Insert the branch-back condition before LOOP_COND_GSI and return the
803 final gcond. */
805 static gcond *
806 vect_set_loop_condition_partial_vectors (class loop *loop,
807 loop_vec_info loop_vinfo, tree niters,
808 tree final_iv, bool niters_maybe_zero,
809 gimple_stmt_iterator loop_cond_gsi)
811 gimple_seq preheader_seq = NULL;
812 gimple_seq header_seq = NULL;
814 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
815 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
816 unsigned int compare_precision = TYPE_PRECISION (compare_type);
817 tree orig_niters = niters;
819 /* Type of the initial value of NITERS. */
820 tree ni_actual_type = TREE_TYPE (niters);
821 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
822 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
823 if (niters_skip)
824 niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
826 /* Convert NITERS to the same size as the compare. */
827 if (compare_precision > ni_actual_precision
828 && niters_maybe_zero)
830 /* We know that there is always at least one iteration, so if the
831 count is zero then it must have wrapped. Cope with this by
832 subtracting 1 before the conversion and adding 1 to the result. */
833 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
834 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
835 niters, build_minus_one_cst (ni_actual_type));
836 niters = gimple_convert (&preheader_seq, compare_type, niters);
837 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
838 niters, build_one_cst (compare_type));
840 else
841 niters = gimple_convert (&preheader_seq, compare_type, niters);
843 /* Iterate over all the rgroups and fill in their controls. We could use
844 the first control from any rgroup for the loop condition; here we
845 arbitrarily pick the last. */
846 tree test_ctrl = NULL_TREE;
847 tree iv_step = NULL_TREE;
848 tree compare_step = NULL_TREE;
849 rgroup_controls *rgc;
850 rgroup_controls *iv_rgc = nullptr;
851 unsigned int i;
852 auto_vec<rgroup_controls> *controls = use_masks_p
853 ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
854 : &LOOP_VINFO_LENS (loop_vinfo);
855 FOR_EACH_VEC_ELT (*controls, i, rgc)
856 if (!rgc->controls.is_empty ())
858 /* First try using permutes. This adds a single vector
859 instruction to the loop for each mask, but needs no extra
860 loop invariants or IVs. */
861 unsigned int nmasks = i + 1;
862 if (use_masks_p && (nmasks & 1) == 0)
864 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
865 if (!half_rgc->controls.is_empty ()
866 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
867 continue;
870 if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
871 || !iv_rgc
872 || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
873 != rgc->max_nscalars_per_iter * rgc->factor))
875 /* See whether a zero-based IV would ever generate all-false masks
876 or zero length before wrapping around. */
877 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
879 /* Set up all controls for this group. */
880 test_ctrl
881 = vect_set_loop_controls_directly (loop, loop_vinfo,
882 &preheader_seq, &header_seq,
883 loop_cond_gsi, rgc, niters,
884 niters_skip, might_wrap_p,
885 &iv_step, &compare_step);
887 iv_rgc = rgc;
890 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
891 && rgc->controls.length () > 1)
893 /* vect_set_loop_controls_directly creates an IV whose step
894 is equal to the expected sum of RGC->controls. Use that
895 information to populate RGC->controls. */
896 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
897 gcc_assert (iv_step);
898 vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
902 /* Emit all accumulated statements. */
903 add_preheader_seq (loop, preheader_seq);
904 add_header_seq (loop, header_seq);
906 /* Get a boolean result that tells us whether to iterate. */
907 edge exit_edge = single_exit (loop);
908 gcond *cond_stmt;
909 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
910 && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
912 gcc_assert (compare_step);
913 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
914 cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
915 NULL_TREE);
917 else
919 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
920 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
921 cond_stmt
922 = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
924 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
926 /* The loop iterates (NITERS - 1) / VF + 1 times.
927 Subtract one from this to get the latch count. */
928 tree step = build_int_cst (compare_type,
929 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
930 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
931 build_minus_one_cst (compare_type));
932 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
933 niters_minus_one, step);
935 if (final_iv)
937 gassign *assign = gimple_build_assign (final_iv, orig_niters);
938 gsi_insert_on_edge_immediate (single_exit (loop), assign);
941 return cond_stmt;
944 /* Set up the iteration condition and rgroup controls for LOOP in AVX512
945 style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
946 vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
947 the number of iterations of the original scalar loop that should be
948 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
949 for vect_set_loop_condition.
951 Insert the branch-back condition before LOOP_COND_GSI and return the
952 final gcond. */
954 static gcond *
955 vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
956 loop_vec_info loop_vinfo, tree niters,
957 tree final_iv,
958 bool niters_maybe_zero,
959 gimple_stmt_iterator loop_cond_gsi)
961 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
962 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
963 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
964 tree orig_niters = niters;
965 gimple_seq preheader_seq = NULL;
967 /* Create an IV that counts down from niters and whose step
968 is the number of iterations processed in the current iteration.
969 Produce the controls with compares like the following.
971 # iv_2 = PHI <niters, iv_3>
972 rem_4 = MIN <iv_2, VF>;
973 remv_6 = { rem_4, rem_4, rem_4, ... }
974 mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6;
975 iv_3 = iv_2 - VF;
976 if (iv_2 > VF)
977 continue;
979 Where the constant is built with elements at most VF - 1 and
980 repetitions according to max_nscalars_per_iter which is guaranteed
981 to be the same within a group. */
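/* As a concrete illustration (values chosen for this example only): with
   VF = 4 and max_nscalars_per_iter = 2 the constant is
   { 0, 0, 1, 1, 2, 2, 3, 3 }, so if 3 scalar iterations remain the
   compare yields the mask { 1, 1, 1, 1, 1, 1, 0, 0 }.  */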
983 /* Convert NITERS to the determined IV type. */
984 if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
985 && niters_maybe_zero)
987 /* We know that there is always at least one iteration, so if the
988 count is zero then it must have wrapped. Cope with this by
989 subtracting 1 before the conversion and adding 1 to the result. */
990 gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
991 niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
992 niters, build_minus_one_cst (TREE_TYPE (niters)));
993 niters = gimple_convert (&preheader_seq, iv_type, niters);
994 niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
995 niters, build_one_cst (iv_type));
997 else
998 niters = gimple_convert (&preheader_seq, iv_type, niters);
1000 /* Bias the initial value of the IV in case we need to skip iterations
1001 at the beginning. */
1002 tree niters_adj = niters;
1003 if (niters_skip)
1005 tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1006 niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1007 iv_type, niters, skip);
1010 /* The iteration step is the vectorization factor. */
1011 tree iv_step = build_int_cst (iv_type, vf);
1013 /* Create the decrement IV. */
1014 tree index_before_incr, index_after_incr;
1015 gimple_stmt_iterator incr_gsi;
1016 bool insert_after;
1017 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1018 create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1019 &incr_gsi, insert_after, &index_before_incr,
1020 &index_after_incr);
1022 /* Iterate over all the rgroups and fill in their controls. */
1023 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1025 if (rgc.controls.is_empty ())
1026 continue;
1028 tree ctrl_type = rgc.type;
1029 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1031 tree vectype = rgc.compare_type;
1033 /* index_after_incr is the IV specifying the remaining iterations in
1034 the next iteration. */
1035 tree rem = index_after_incr;
1036 /* When the data type for the compare to produce the mask is
1037 smaller than the IV type we need to saturate. Saturate to
1038 the smallest possible value (IV_TYPE) so we only have to
1039 saturate once (CSE will catch redundant ones we add). */
1040 if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1041 rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1042 UNKNOWN_LOCATION,
1043 MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1044 rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1045 UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1047 /* Build a data vector composed of the remaining iterations. */
1048 rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1049 UNKNOWN_LOCATION, vectype, rem);
1051 /* Provide a definition of each vector in the control group. */
1052 tree next_ctrl = NULL_TREE;
1053 tree first_rem = NULL_TREE;
1054 tree ctrl;
1055 unsigned int i;
1056 FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1058 /* Previous controls will cover BIAS items. This control covers the
1059 next batch. */
1060 poly_uint64 bias = nitems_per_ctrl * i;
1062 /* Build the constant to compare the remaining iters against,
1063 this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1064 split into pieces. */
1065 unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1066 tree_vector_builder builder (vectype, n, 1);
1067 for (unsigned i = 0; i < n; ++i)
1069 unsigned HOST_WIDE_INT val
1070 = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1071 gcc_assert (val < vf.to_constant ());
1072 builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1074 tree cmp_series = builder.build ();
1076 /* Create the initial control. First include all items that
1077 are within the loop limit. */
1078 tree init_ctrl = NULL_TREE;
1079 poly_uint64 const_limit;
1080 /* See whether the first iteration of the vector loop is known
1081 to have a full control. */
1082 if (poly_int_tree_p (niters, &const_limit)
1083 && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1084 init_ctrl = build_minus_one_cst (ctrl_type);
1085 else
1087 /* The remaining work items initially are niters. Saturate,
1088 splat and compare. */
1089 if (!first_rem)
1091 first_rem = niters;
1092 if (TYPE_PRECISION (TREE_TYPE (vectype))
1093 < TYPE_PRECISION (iv_type))
1094 first_rem = gimple_build (&preheader_seq,
1095 MIN_EXPR, TREE_TYPE (first_rem),
1096 first_rem, iv_step);
1097 first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1098 first_rem);
1099 first_rem = gimple_build_vector_from_val (&preheader_seq,
1100 vectype, first_rem);
1102 init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1103 cmp_series, first_rem);
1106 /* Now AND out the bits that are within the number of skipped
1107 items. */
1108 poly_uint64 const_skip;
1109 if (niters_skip
1110 && !(poly_int_tree_p (niters_skip, &const_skip)
1111 && known_le (const_skip, bias)))
1113 /* For integer mode masks it's cheaper to shift out the bits
1114 since that avoids loading a constant. */
1115 gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1116 init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1117 lang_hooks.types.type_for_mode
1118 (TYPE_MODE (ctrl_type), 1),
1119 init_ctrl);
1120 /* ??? But when the shift amount isn't constant this requires
1121 a round-trip to GPRs. We could apply the bias to either
1122 side of the compare instead. */
1123 tree shift = gimple_build (&preheader_seq, MULT_EXPR,
1124 TREE_TYPE (niters_skip), niters_skip,
1125 build_int_cst (TREE_TYPE (niters_skip),
1126 rgc.max_nscalars_per_iter));
1127 init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1128 TREE_TYPE (init_ctrl),
1129 init_ctrl, shift);
1130 init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1131 ctrl_type, init_ctrl);
1134 /* Get the control value for the next iteration of the loop. */
1135 next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1136 UNKNOWN_LOCATION,
1137 LT_EXPR, ctrl_type, cmp_series, rem);
1139 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1143 /* Emit all accumulated statements. */
1144 add_preheader_seq (loop, preheader_seq);
1146 /* Adjust the exit test using the decrementing IV. */
1147 edge exit_edge = single_exit (loop);
1148 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1149 /* When we peel for alignment with niter_skip != 0 this can
1150 cause niter + niter_skip to wrap and, since we are comparing the
1151 value before the decrement here, we would get a false early exit.
1152 We can't compare the value after the decrement either because that
1153 decrement could wrap as well, as we're not doing a saturating
1154 decrement. To avoid this situation we force a larger
1155 iv_type. */
1156 gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1157 NULL_TREE, NULL_TREE);
1158 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1160 /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1161 Subtract one from this to get the latch count. */
1162 tree niters_minus_one
1163 = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1164 build_minus_one_cst (TREE_TYPE (orig_niters)));
1165 tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1166 if (niters_skip)
1167 niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1168 fold_convert (iv_type, niters_skip));
1169 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1170 niters_adj2, iv_step);
1172 if (final_iv)
1174 gassign *assign = gimple_build_assign (final_iv, orig_niters);
1175 gsi_insert_on_edge_immediate (single_exit (loop), assign);
1178 return cond_stmt;
1182 /* Like vect_set_loop_condition, but handle the case in which the vector
1183 loop handles exactly VF scalars per iteration. */
1185 static gcond *
1186 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
1187 tree final_iv, bool niters_maybe_zero,
1188 gimple_stmt_iterator loop_cond_gsi)
1190 tree indx_before_incr, indx_after_incr;
1191 gcond *cond_stmt;
1192 gcond *orig_cond;
1193 edge pe = loop_preheader_edge (loop);
1194 edge exit_edge = single_exit (loop);
1195 gimple_stmt_iterator incr_gsi;
1196 bool insert_after;
1197 enum tree_code code;
1198 tree niters_type = TREE_TYPE (niters);
1200 orig_cond = get_loop_exit_condition (loop);
1201 gcc_assert (orig_cond);
1202 loop_cond_gsi = gsi_for_stmt (orig_cond);
1204 tree init, limit;
1205 if (!niters_maybe_zero && integer_onep (step))
1207 /* In this case we can use a simple 0-based IV:
1210 x = 0;
1214 x += 1;
1216 while (x < NITERS); */
1217 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1218 init = build_zero_cst (niters_type);
1219 limit = niters;
1221 else
1223 /* The following works for all values of NITERS except 0:
1226 x = 0;
1230 x += STEP;
1232 while (x <= NITERS - STEP);
1234 so that the loop continues to iterate if x + STEP - 1 < NITERS
1235 but stops if x + STEP - 1 >= NITERS.
1237 However, if NITERS is zero, x never hits a value above NITERS - STEP
1238 before wrapping around. There are two obvious ways of dealing with
1239 this:
1241 - start at STEP - 1 and compare x before incrementing it
1242 - start at -1 and compare x after incrementing it
1244 The latter is simpler and is what we use. The loop in this case
1245 looks like:
1248 x = -1;
1252 x += STEP;
1254 while (x < NITERS - STEP);
1256 In both cases the loop limit is NITERS - STEP. */
1257 gimple_seq seq = NULL;
1258 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1259 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1260 if (seq)
1262 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1263 gcc_assert (!new_bb);
1265 if (niters_maybe_zero)
1267 /* Case C. */
1268 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1269 init = build_all_ones_cst (niters_type);
1271 else
1273 /* Case B. */
1274 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1275 init = build_zero_cst (niters_type);
1279 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1280 create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1281 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1282 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1283 true, NULL_TREE, true,
1284 GSI_SAME_STMT);
1285 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1286 true, GSI_SAME_STMT);
1288 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1289 NULL_TREE);
1291 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1293 /* Record the number of latch iterations. */
1294 if (limit == niters)
1295 /* Case A: the loop iterates NITERS times. Subtract one to get the
1296 latch count. */
1297 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1298 build_int_cst (niters_type, 1));
1299 else
1300 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1301 Subtract one from this to get the latch count. */
1302 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1303 limit, step);
1305 if (final_iv)
1307 gassign *assign;
1308 edge exit = single_exit (loop);
1309 gcc_assert (single_pred_p (exit->dest));
1310 tree phi_dest
1311 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1312 /* Make sure to maintain LC SSA form here and elide the subtraction
1313 if the value is zero. */
1314 gphi *phi = create_phi_node (phi_dest, exit->dest);
1315 add_phi_arg (phi, indx_after_incr, exit, UNKNOWN_LOCATION);
1316 if (!integer_zerop (init))
1318 assign = gimple_build_assign (final_iv, MINUS_EXPR,
1319 phi_dest, init);
1320 gimple_stmt_iterator gsi = gsi_after_labels (exit->dest);
1321 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1325 return cond_stmt;
1328 /* If we're using fully-masked loops, make LOOP iterate:
1330 N == (NITERS - 1) / STEP + 1
1332 times. When NITERS is zero, this is equivalent to making the loop
1333 execute (1 << M) / STEP times, where M is the precision of NITERS.
1334 NITERS_MAYBE_ZERO is true if this last case might occur.
1336 If we're not using fully-masked loops, make LOOP iterate:
1338 N == (NITERS - STEP) / STEP + 1
1340 times, where NITERS is known to be outside the range [1, STEP - 1].
1341 This is equivalent to making the loop execute NITERS / STEP times
1342 when NITERS is nonzero and (1 << M) / STEP times otherwise.
1343 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1345 If FINAL_IV is nonnull, it is an SSA name that should be set to
1346 N * STEP on exit from the loop.
1348 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
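/* For example (values for illustration only): with STEP = 4, a fully-masked
   loop with NITERS = 10 runs (10 - 1) / 4 + 1 = 3 times, and a non-masked
   loop with NITERS = 12 runs (12 - 4) / 4 + 1 = 3 times.  */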
1350 void
1351 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
1352 tree niters, tree step, tree final_iv,
1353 bool niters_maybe_zero)
1355 gcond *cond_stmt;
1356 gcond *orig_cond = get_loop_exit_condition (loop);
1357 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1359 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1361 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1362 cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_vinfo,
1363 niters, final_iv,
1364 niters_maybe_zero,
1365 loop_cond_gsi);
1366 else
1367 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
1368 niters, final_iv,
1369 niters_maybe_zero,
1370 loop_cond_gsi);
1372 else
1373 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
1374 niters_maybe_zero,
1375 loop_cond_gsi);
1377 /* Remove old loop exit test. */
1378 stmt_vec_info orig_cond_info;
1379 if (loop_vinfo
1380 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1381 loop_vinfo->remove_stmt (orig_cond_info);
1382 else
1383 gsi_remove (&loop_cond_gsi, true);
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1387 (gimple *) cond_stmt);
1390 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
1391 For all PHI arguments in FROM->dest and TO->dest from those
1392 edges, ensure that the TO->dest PHI arguments have their current_def
1393 set to that of the corresponding argument in FROM. */
1395 static void
1396 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1398 gimple_stmt_iterator gsi_from, gsi_to;
1400 for (gsi_from = gsi_start_phis (from->dest),
1401 gsi_to = gsi_start_phis (to->dest);
1402 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1404 gimple *from_phi = gsi_stmt (gsi_from);
1405 gimple *to_phi = gsi_stmt (gsi_to);
1406 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1407 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1408 if (virtual_operand_p (from_arg))
1410 gsi_next (&gsi_from);
1411 continue;
1413 if (virtual_operand_p (to_arg))
1415 gsi_next (&gsi_to);
1416 continue;
1418 if (TREE_CODE (from_arg) != SSA_NAME)
1419 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1420 else if (TREE_CODE (to_arg) == SSA_NAME
1421 && from_arg != to_arg)
1423 if (get_current_def (to_arg) == NULL_TREE)
1425 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1426 TREE_TYPE (get_current_def
1427 (from_arg))));
1428 set_current_def (to_arg, get_current_def (from_arg));
1431 gsi_next (&gsi_from);
1432 gsi_next (&gsi_to);
1435 gphi *from_phi = get_virtual_phi (from->dest);
1436 gphi *to_phi = get_virtual_phi (to->dest);
1437 if (from_phi)
1438 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1439 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1443 /* Given LOOP this function generates a new copy of it and puts it
1444 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1445 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1446 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1447 entry or exit of LOOP. */
1449 class loop *
1450 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1451 class loop *scalar_loop, edge e)
1453 class loop *new_loop;
1454 basic_block *new_bbs, *bbs, *pbbs;
1455 bool at_exit;
1456 bool was_imm_dom;
1457 basic_block exit_dest;
1458 edge exit, new_exit;
1459 bool duplicate_outer_loop = false;
1461 exit = single_exit (loop);
1462 at_exit = (e == exit);
1463 if (!at_exit && e != loop_preheader_edge (loop))
1464 return NULL;
1466 if (scalar_loop == NULL)
1467 scalar_loop = loop;
1469 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1470 pbbs = bbs + 1;
1471 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1472 /* Allow duplication of outer loops. */
1473 if (scalar_loop->inner)
1474 duplicate_outer_loop = true;
1476 /* Generate new loop structure. */
1477 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1478 duplicate_subloops (scalar_loop, new_loop);
1480 exit_dest = exit->dest;
1481 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1482 exit_dest) == loop->header ?
1483 true : false);
1485 /* Also copy the pre-header; this avoids jumping through hoops to
1486 duplicate the loop entry PHI arguments. Create an empty
1487 pre-header unconditionally for this. */
1488 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1489 edge entry_e = single_pred_edge (preheader);
1490 bbs[0] = preheader;
1491 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1493 exit = single_exit (scalar_loop);
1494 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1495 &exit, 1, &new_exit, NULL,
1496 at_exit ? loop->latch : e->src, true);
1497 exit = single_exit (loop);
1498 basic_block new_preheader = new_bbs[0];
1500 /* Before installing PHI arguments make sure that the edges
1501 into them match that of the scalar loop we analyzed. This
1502 makes sure the SLP tree matches up between the main vectorized
1503 loop and the epilogue vectorized copies. */
1504 if (single_succ_edge (preheader)->dest_idx
1505 != single_succ_edge (new_bbs[0])->dest_idx)
1507 basic_block swap_bb = new_bbs[1];
1508 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1509 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1510 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1511 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1513 if (duplicate_outer_loop)
1515 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1516 if (loop_preheader_edge (scalar_loop)->dest_idx
1517 != loop_preheader_edge (new_inner_loop)->dest_idx)
1519 basic_block swap_bb = new_inner_loop->header;
1520 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1521 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1522 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1523 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1527 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1529 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1530 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1531 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1533 if (scalar_loop != loop)
1535 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1536 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1537 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1538 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1539 header) to have current_def set, so copy them over. */
1540 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1541 exit);
1542 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1544 EDGE_SUCC (loop->latch, 0));
1547 if (at_exit) /* Add the loop copy at exit. */
1549 if (scalar_loop != loop)
1551 gphi_iterator gsi;
1552 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1554 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1555 gsi_next (&gsi))
1557 gphi *phi = gsi.phi ();
1558 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1559 location_t orig_locus
1560 = gimple_phi_arg_location_from_edge (phi, e);
1562 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1565 redirect_edge_and_branch_force (e, new_preheader);
1566 flush_pending_stmts (e);
1567 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1568 if (was_imm_dom || duplicate_outer_loop)
1569 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1571 /* And remove the unnecessary forwarder again. Keep the other
1572 one so we have a proper pre-header for the loop at the exit edge. */
1573 redirect_edge_pred (single_succ_edge (preheader),
1574 single_pred (preheader));
1575 delete_basic_block (preheader);
1576 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1577 loop_preheader_edge (scalar_loop)->src);
1579 else /* Add the copy at entry. */
1581 if (scalar_loop != loop)
1583 /* Remove the unnecessary forwarder of scalar_loop again. */
1584 redirect_edge_pred (single_succ_edge (preheader),
1585 single_pred (preheader));
1586 delete_basic_block (preheader);
1587 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1588 loop_preheader_edge (scalar_loop)->src);
1589 preheader = split_edge (loop_preheader_edge (loop));
1590 entry_e = single_pred_edge (preheader);
1593 redirect_edge_and_branch_force (entry_e, new_preheader);
1594 flush_pending_stmts (entry_e);
1595 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1597 redirect_edge_and_branch_force (new_exit, preheader);
1598 flush_pending_stmts (new_exit);
1599 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1601 /* And remove the unnecessary forwarder again. Keep the other
1602 one so we have a proper pre-header for the loop at the exit edge. */
1603 redirect_edge_pred (single_succ_edge (new_preheader),
1604 single_pred (new_preheader));
1605 delete_basic_block (new_preheader);
1606 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1607 loop_preheader_edge (new_loop)->src);
1610 if (scalar_loop != loop)
1612 /* Update new_loop->header PHIs, so that on the preheader
1613 edge they are the ones from loop rather than scalar_loop. */
1614 gphi_iterator gsi_orig, gsi_new;
1615 edge orig_e = loop_preheader_edge (loop);
1616 edge new_e = loop_preheader_edge (new_loop);
1618 for (gsi_orig = gsi_start_phis (loop->header),
1619 gsi_new = gsi_start_phis (new_loop->header);
1620 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1621 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1623 gphi *orig_phi = gsi_orig.phi ();
1624 gphi *new_phi = gsi_new.phi ();
1625 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1626 location_t orig_locus
1627 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1629 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1633 free (new_bbs);
1634 free (bbs);
1636 checking_verify_dominators (CDI_DOMINATORS);
1638 return new_loop;
1642 /* Given the condition expression COND, put it as the last statement of
1643 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1644 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1645 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1646 new edge as irreducible if IRREDUCIBLE_P is true. */
1648 static edge
1649 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1650 basic_block guard_to, basic_block dom_bb,
1651 profile_probability probability, bool irreducible_p)
1653 gimple_stmt_iterator gsi;
1654 edge new_e, enter_e;
1655 gcond *cond_stmt;
1656 gimple_seq gimplify_stmt_list = NULL;
1658 enter_e = EDGE_SUCC (guard_bb, 0);
1659 enter_e->flags &= ~EDGE_FALLTHRU;
1660 enter_e->flags |= EDGE_FALSE_VALUE;
1661 gsi = gsi_last_bb (guard_bb);
1663 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1664 is_gimple_condexpr_for_cond, NULL_TREE);
1665 if (gimplify_stmt_list)
1666 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1668 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1669 gsi = gsi_last_bb (guard_bb);
1670 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1672 /* Add new edge to connect guard block to the merge/loop-exit block. */
1673 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1675 new_e->probability = probability;
1676 if (irreducible_p)
1677 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1679 enter_e->probability = probability.invert ();
1680 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1682 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1683 if (enter_e->dest->loop_father->header == enter_e->dest)
1684 split_edge (enter_e);
1686 return new_e;
1690 /* This function verifies that the following restrictions apply to LOOP:
1691 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1692 for an innermost loop, and of 5 basic blocks for an outer loop.
1693 (2) it is single entry, single exit
1694 (3) its exit condition is the last stmt in the header
1695 (4) E is the entry/exit edge of LOOP.
1698 bool
1699 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1701 edge exit_e = single_exit (loop);
1702 edge entry_e = loop_preheader_edge (loop);
1703 gcond *orig_cond = get_loop_exit_condition (loop);
1704 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1705 unsigned int num_bb = loop->inner? 5 : 2;
1707 /* All loops have an outer scope; the only case in which loop->outer is
1708 NULL is for the function itself. */
1709 if (!loop_outer (loop)
1710 || loop->num_nodes != num_bb
1711 || !empty_block_p (loop->latch)
1712 || !single_exit (loop)
1713 /* Verify that new loop exit condition can be trivially modified. */
1714 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1715 || (e != exit_e && e != entry_e))
1716 return false;
1718 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1719 get_loop_body_with_size (loop, bbs, loop->num_nodes);
1720 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1721 free (bbs);
1722 return ret;
1725 /* Function vect_get_loop_location.
1727 Extract the location of the loop in the source code.
1728 If the loop is not well formed for vectorization, an estimated
1729 location is calculated.
1730 Return the loop location on success and NULL otherwise. */
1732 dump_user_location_t
1733 find_loop_location (class loop *loop)
1735 gimple *stmt = NULL;
1736 basic_block bb;
1737 gimple_stmt_iterator si;
1739 if (!loop)
1740 return dump_user_location_t ();
1742 stmt = get_loop_exit_condition (loop);
1744 if (stmt
1745 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1746 return stmt;
1748 /* If we got here the loop is probably not "well formed",
1749 try to estimate the loop location. */
1751 if (!loop->header)
1752 return dump_user_location_t ();
1754 bb = loop->header;
1756 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1758 stmt = gsi_stmt (si);
1759 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1760 return stmt;
1763 return dump_user_location_t ();
1766 /* Return true if the phi described by STMT_INFO defines an IV of the
1767 loop to be vectorized. */
1769 static bool
1770 iv_phi_p (stmt_vec_info stmt_info)
1772 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1773 if (virtual_operand_p (PHI_RESULT (phi)))
1774 return false;
1776 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1777 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1778 return false;
1780 return true;
1783 /* Return true if vectorizer can peel for nonlinear iv. */
1784 static bool
1785 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
1786 enum vect_induction_op_type induction_type)
1788 tree niters_skip;
1789 /* Init_expr will be updated by vect_update_ivs_after_vectorizer,
1790 if niters or vf is unknown:
1791 For shift, when the shift count >= precision, the behavior is undefined.
1792 For mult, we don't know how to generate
1793 init_expr * pow (step, niters) for variable niters.
1794 For neg, it should be ok, since niters of the vectorized main loop
1795 will always be a multiple of 2. */
1796 if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1797 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
1798 && induction_type != vect_step_op_neg)
1800 if (dump_enabled_p ())
1801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1802 "Peeling for epilogue is not supported"
1803 " for nonlinear induction except neg"
1804 " when iteration count is unknown.\n");
1805 return false;
1808 /* Also doesn't support peeling for neg when niter is variable.
1809 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
1810 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
1811 if ((niters_skip != NULL_TREE
1812 && TREE_CODE (niters_skip) != INTEGER_CST)
1813 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
1814 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "Peeling for alignement is not supported"
1819 " for nonlinear induction when niters_skip"
1820 " is not constant.\n");
1821 return false;
1824 return true;
1827 /* Function vect_can_advance_ivs_p
1829 In case the number of iterations that LOOP iterates is unknown at compile
1830 time, an epilog loop will be generated, and the loop induction variables
1831 (IVs) will be "advanced" to the value they are supposed to take just before
1832 the epilog loop. Here we check that the access function of the loop IVs
1833 and the expression that represents the loop bound are simple enough.
1834 These restrictions will be relaxed in the future. */
1836 bool
1837 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1839 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1840 basic_block bb = loop->header;
1841 gphi_iterator gsi;
1843 /* Analyze phi functions of the loop header. */
1845 if (dump_enabled_p ())
1846 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1847 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1849 tree evolution_part;
1850 enum vect_induction_op_type induction_type;
1852 gphi *phi = gsi.phi ();
1853 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1856 phi_info->stmt);
1858 /* Skip virtual phi's. The data dependences that are associated with
1859 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1861 Skip reduction phis. */
1862 if (!iv_phi_p (phi_info))
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "reduc or virtual phi. skip.\n");
1867 continue;
1870 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
1871 if (induction_type != vect_step_op_add)
1873 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
1874 return false;
1876 continue;
1879 /* Analyze the evolution function. */
1881 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1882 if (evolution_part == NULL_TREE)
1884 if (dump_enabled_p ())
1885 dump_printf (MSG_MISSED_OPTIMIZATION,
1886 "No access function or evolution.\n");
1887 return false;
1890 /* FORNOW: We do not transform initial conditions of IVs
1891 which evolution functions are not invariants in the loop. */
1893 if (!expr_invariant_in_loop_p (loop, evolution_part))
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897 "evolution not invariant in loop.\n");
1898 return false;
1901 /* FORNOW: We do not transform initial conditions of IVs
1902 which evolution functions are a polynomial of degree >= 2. */
1904 if (tree_is_chrec (evolution_part))
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "evolution is chrec.\n");
1909 return false;
1913 return true;
1917 /* Function vect_update_ivs_after_vectorizer.
1919 "Advance" the induction variables of LOOP to the value they should take
1920 after the execution of LOOP. This is currently necessary because the
1921 vectorizer does not handle induction variables that are used after the
1922 loop. Such a situation occurs when the last iterations of LOOP are
1923 peeled, because:
1924 1. We introduced new uses after LOOP for IVs that were not originally used
1925 after LOOP: the IVs of LOOP are now used by an epilog loop.
1926 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1927 times, whereas the loop IVs should be bumped N times.
1929 Input:
1930 - LOOP - a loop that is going to be vectorized. The last few iterations
1931 of LOOP were peeled.
1932 - NITERS - the number of iterations that LOOP executes (before it is
1933 vectorized). i.e, the number of times the ivs should be bumped.
1934 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1935 coming out from LOOP on which there are uses of the LOOP ivs
1936 (this is the path from LOOP->exit to epilog_loop->preheader).
1938 The new definitions of the ivs are placed in LOOP->exit.
1939 The phi args associated with the edge UPDATE_E in the bb
1940 UPDATE_E->dest are updated accordingly.
1942 Assumption 1: Like the rest of the vectorizer, this function assumes
1943 a single loop exit that has a single predecessor.
1945 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1946 organized in the same order.
1948 Assumption 3: The access function of the ivs is simple enough (see
1949 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1951 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1952 coming out of LOOP on which the ivs of LOOP are used (this is the path
1953 that leads to the epilog loop; other paths skip the epilog loop). This
1954 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1955 needs to have its phis updated.
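/* A minimal illustration (not part of the original comment; the IV below is
   hypothetical): for a simple additive IV

     i_1 = PHI <0 (preheader), i_2 (latch)>;  i_2 = i_1 + 4;

   and NITERS = n, the code below computes ni = init_expr + n * step
   = 0 + n * 4 and installs it on UPDATE_E, so the epilog loop's PHI for
   this IV starts exactly where the scalar loop would have left it.  */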
1958 static void
1959 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1960 tree niters, edge update_e)
1962 gphi_iterator gsi, gsi1;
1963 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1964 basic_block update_bb = update_e->dest;
1965 basic_block exit_bb = single_exit (loop)->dest;
1967 /* Make sure there exists a single-predecessor exit bb: */
1968 gcc_assert (single_pred_p (exit_bb));
1969 gcc_assert (single_succ_edge (exit_bb) == update_e);
1971 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1972 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1973 gsi_next (&gsi), gsi_next (&gsi1))
1975 tree init_expr;
1976 tree step_expr, off;
1977 tree type;
1978 tree var, ni, ni_name;
1979 gimple_stmt_iterator last_gsi;
1981 gphi *phi = gsi.phi ();
1982 gphi *phi1 = gsi1.phi ();
1983 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1984 if (dump_enabled_p ())
1985 dump_printf_loc (MSG_NOTE, vect_location,
1986 "vect_update_ivs_after_vectorizer: phi: %G",
1987 (gimple *) phi);
1989 /* Skip reduction and virtual phis. */
1990 if (!iv_phi_p (phi_info))
1992 if (dump_enabled_p ())
1993 dump_printf_loc (MSG_NOTE, vect_location,
1994 "reduc or virtual phi. skip.\n");
1995 continue;
1998 type = TREE_TYPE (gimple_phi_result (phi));
1999 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2000 step_expr = unshare_expr (step_expr);
2002 /* FORNOW: We do not support IVs whose evolution function is a polynomial
2003 of degree >= 2 or exponential. */
2004 gcc_assert (!tree_is_chrec (step_expr));
2006 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2007 gimple_seq stmts = NULL;
2008 enum vect_induction_op_type induction_type
2009 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2011 if (induction_type == vect_step_op_add)
2013 tree stype = TREE_TYPE (step_expr);
2014 off = fold_build2 (MULT_EXPR, stype,
2015 fold_convert (stype, niters), step_expr);
2016 if (POINTER_TYPE_P (type))
2017 ni = fold_build_pointer_plus (init_expr, off);
2018 else
2019 ni = fold_convert (type,
2020 fold_build2 (PLUS_EXPR, stype,
2021 fold_convert (stype, init_expr),
2022 off));
2024 /* Don't bother calling vect_peel_nonlinear_iv_init. */
2025 else if (induction_type == vect_step_op_neg)
2026 ni = init_expr;
2027 else
2028 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2029 niters, step_expr,
2030 induction_type);
2032 var = create_tmp_var (type, "tmp");
2034 last_gsi = gsi_last_bb (exit_bb);
2035 gimple_seq new_stmts = NULL;
2036 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2037 /* Exit_bb shouldn't be empty. */
2038 if (!gsi_end_p (last_gsi))
2040 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2041 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2043 else
2045 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2046 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2049 /* Fix phi expressions in the successor bb. */
2050 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2054 /* Return a gimple value containing the misalignment (measured in vector
2055 elements) for the loop described by LOOP_VINFO, i.e. how many elements
2056 it is away from a perfectly aligned address. Add any new statements
2057 to SEQ. */
2059 static tree
2060 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2062 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2063 stmt_vec_info stmt_info = dr_info->stmt;
2064 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2066 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2067 unsigned HOST_WIDE_INT target_align_c;
2068 tree target_align_minus_1;
2070 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2071 size_zero_node) < 0;
2072 tree offset = (negative
2073 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2074 * TREE_INT_CST_LOW
2075 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2076 : size_zero_node);
2077 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2078 stmt_info, seq,
2079 offset);
2080 tree type = unsigned_type_for (TREE_TYPE (start_addr));
2081 if (target_align.is_constant (&target_align_c))
2082 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2083 else
2085 tree vla = build_int_cst (type, target_align);
2086 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2087 fold_build2 (MINUS_EXPR, type,
2088 build_int_cst (type, 0), vla));
2089 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2090 build_int_cst (type, 1));
2093 HOST_WIDE_INT elem_size
2094 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2095 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2097 /* Create: misalign_in_bytes = addr & (target_align - 1). */
2098 tree int_start_addr = fold_convert (type, start_addr);
2099 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2100 target_align_minus_1);
2102 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2103 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2104 elem_size_log);
2106 return misalign_in_elems;
2109 /* Function vect_gen_prolog_loop_niters
2111 Generate the number of iterations which should be peeled as prolog for the
2112 loop represented by LOOP_VINFO. It is calculated as the misalignment of
2113 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2114 As a result, after the execution of this loop, the data reference DR will
2115 refer to an aligned location. The following computation is generated:
2117 If the misalignment of DR is known at compile time:
2118 addr_mis = int mis = DR_MISALIGNMENT (dr);
2119 Else, compute address misalignment in bytes:
2120 addr_mis = addr & (target_align - 1)
2122 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2124 (elem_size = element type size; an element is the scalar element whose type
2125 is the inner type of the vectype)
2127 The computations will be emitted at the end of BB. We also compute and
2128 store the upper bound (inclusive) of the result in BOUND.
2130 When the step of the data-ref in the loop is not 1 (as in interleaved data
2131 and SLP), the number of iterations of the prolog must be divided by the step
2132 (which is equal to the size of interleaved group).
2134 The above formulas assume that VF == number of elements in the vector. This
2135 may not hold when there are multiple-types in the loop.
2136 In this case, for some data-references in the loop the VF does not represent
2137 the number of elements that fit in the vector. Therefore, instead of VF we
2138 use TYPE_VECTOR_SUBPARTS. */
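/* A worked instance of the formula above (illustrative values only):
   with elem_size = 4, target_align = 16 (i.e. VF = 4 elements), step = 1
   and addr & (target_align - 1) = 8 we get addr_mis / elem_size = 2, so

     prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2

   and peeling two scalar iterations brings DR to a 16-byte aligned
   address.  */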
2140 static tree
2141 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2142 basic_block bb, int *bound)
2144 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2145 tree var;
2146 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2147 gimple_seq stmts = NULL, new_stmts = NULL;
2148 tree iters, iters_name;
2149 stmt_vec_info stmt_info = dr_info->stmt;
2150 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2151 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2153 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2155 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location,
2159 "known peeling = %d.\n", npeel);
2161 iters = build_int_cst (niters_type, npeel);
2162 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2164 else
2166 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2167 tree type = TREE_TYPE (misalign_in_elems);
2168 HOST_WIDE_INT elem_size
2169 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2170 /* We only do prolog peeling if the target alignment is known at compile
2171 time. */
2172 poly_uint64 align_in_elems =
2173 exact_div (target_align, elem_size);
2174 tree align_in_elems_minus_1 =
2175 build_int_cst (type, align_in_elems - 1);
2176 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2178 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2179 & (align_in_elems - 1)). */
2180 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2181 size_zero_node) < 0;
2182 if (negative)
2183 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2184 align_in_elems_tree);
2185 else
2186 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2187 misalign_in_elems);
2188 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2189 iters = fold_convert (niters_type, iters);
2190 unsigned HOST_WIDE_INT align_in_elems_c;
2191 if (align_in_elems.is_constant (&align_in_elems_c))
2192 *bound = align_in_elems_c - 1;
2193 else
2194 *bound = -1;
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_NOTE, vect_location,
2199 "niters for prolog loop: %T\n", iters);
2201 var = create_tmp_var (niters_type, "prolog_loop_niters");
2202 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2204 if (new_stmts)
2205 gimple_seq_add_seq (&stmts, new_stmts);
2206 if (stmts)
2208 gcc_assert (single_succ_p (bb));
2209 gimple_stmt_iterator gsi = gsi_last_bb (bb);
2210 if (gsi_end_p (gsi))
2211 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2212 else
2213 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2215 return iters_name;
2219 /* Function vect_update_init_of_dr
2221 If CODE is PLUS, the vector loop starts NITERS iterations after the
2222 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2223 iterations before the scalar one (using masking to skip inactive
2224 elements). This function updates the information recorded in DR to
2225 account for the difference. Specifically, it updates the OFFSET
2226 field of DR_INFO. */
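/* For illustration (hypothetical values): with DR_STEP (dr) = 4 bytes,
   NITERS = n and CODE = PLUS_EXPR, the update below amounts to

     dr_info->offset = offset + n * 4

   so the first access of the vector loop starts right after the last
   access of the peeled prolog iterations.  */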
2228 static void
2229 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2231 struct data_reference *dr = dr_info->dr;
2232 tree offset = dr_info->offset;
2233 if (!offset)
2234 offset = build_zero_cst (sizetype);
2236 niters = fold_build2 (MULT_EXPR, sizetype,
2237 fold_convert (sizetype, niters),
2238 fold_convert (sizetype, DR_STEP (dr)));
2239 offset = fold_build2 (code, sizetype,
2240 fold_convert (sizetype, offset), niters);
2241 dr_info->offset = offset;
2245 /* Function vect_update_inits_of_drs
2247 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
2248 CODE and NITERS are as for vect_update_init_of_dr. */
2250 void
2251 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2252 tree_code code)
2254 unsigned int i;
2255 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2256 struct data_reference *dr;
2258 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2260 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2261 here, but since we might use these niters to update the epilogue's niters
2262 and data references we can't insert them here as this definition might not
2263 always dominate its uses. */
2264 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2265 niters = fold_convert (sizetype, niters);
2267 FOR_EACH_VEC_ELT (datarefs, i, dr)
2269 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2270 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2271 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2272 vect_update_init_of_dr (dr_info, niters, code);
2276 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2277 by masking. This involves calculating the number of iterations to
2278 be peeled and then aligning all memory references appropriately. */
2280 void
2281 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2283 tree misalign_in_elems;
2284 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2286 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2288 /* From the information recorded in LOOP_VINFO get the number of iterations
2289 that need to be skipped via masking. */
2290 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2292 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2293 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2294 misalign_in_elems = build_int_cst (type, misalign);
2296 else
2298 gimple_seq seq1 = NULL, seq2 = NULL;
2299 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2300 misalign_in_elems = fold_convert (type, misalign_in_elems);
2301 misalign_in_elems = force_gimple_operand (misalign_in_elems,
2302 &seq2, true, NULL_TREE);
2303 gimple_seq_add_seq (&seq1, seq2);
2304 if (seq1)
2306 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2307 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2308 gcc_assert (!new_bb);
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "misalignment for fully-masked loop: %T\n",
2315 misalign_in_elems);
2317 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2319 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2322 /* This function builds ni_name = number of iterations. Statements
2323 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2324 it to TRUE if a new ssa_var is generated. */
2326 tree
2327 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2329 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2330 if (TREE_CODE (ni) == INTEGER_CST)
2331 return ni;
2332 else
2334 tree ni_name, var;
2335 gimple_seq stmts = NULL;
2336 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2338 var = create_tmp_var (TREE_TYPE (ni), "niters");
2339 ni_name = force_gimple_operand (ni, &stmts, false, var);
2340 if (stmts)
2342 gsi_insert_seq_on_edge_immediate (pe, stmts);
2343 if (new_var_p != NULL)
2344 *new_var_p = true;
2347 return ni_name;
2351 /* Calculate the number of iterations above which the vectorized loop will
2352 be preferred over the scalar loop. NITERS_PROLOG is the number of iterations
2353 of prolog loop. If it's integer const, the integer number is also passed
2354 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2355 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2356 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2357 threshold below which the scalar (rather than vectorized) loop will be
2358 executed. This function stores the upper bound (inclusive) of the result
2359 in BOUND_SCALAR. */
2361 static tree
2362 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2363 int bound_prolog, poly_int64 bound_epilog, int th,
2364 poly_uint64 *bound_scalar,
2365 bool check_profitability)
2367 tree type = TREE_TYPE (niters_prolog);
2368 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2369 build_int_cst (type, bound_epilog));
2371 *bound_scalar = bound_prolog + bound_epilog;
2372 if (check_profitability)
2374 /* TH indicates the minimum niters of vectorized loop, while we
2375 compute the maximum niters of scalar loop. */
2376 th--;
2377 /* Peeling a constant number of times. */
2378 if (int_niters_prolog >= 0)
2380 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2381 return build_int_cst (type, *bound_scalar);
2383 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2384 and BOUND_EPILOG are inclusive upper bounds. */
2385 if (known_ge (th, bound_prolog + bound_epilog))
2387 *bound_scalar = th;
2388 return build_int_cst (type, th);
2390 /* Need to do runtime comparison. */
2391 else if (maybe_gt (th, bound_epilog))
2393 *bound_scalar = upper_bound (*bound_scalar, th);
2394 return fold_build2 (MAX_EXPR, type,
2395 build_int_cst (type, th), niters);
2398 return niters;
2401 /* NITERS is the number of times that the original scalar loop executes
2402 after peeling. Work out the maximum number of iterations N that can
2403 be handled by the vectorized form of the loop and then either:
2405 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2407 niters_vector = N
2409 b) set *STEP_VECTOR_PTR to one and generate:
2411 niters_vector = N / vf
2413 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2414 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2415 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
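/* Worked example for case (b) (illustrative, assuming NITERS_NO_OVERFLOW
   and no partial vectors): with NITERS = 103 and vf = 4 we generate

     niters_vector = 103 >> log2 (4) = 25,  step_vector = 1

   so the vector loop executes 25 times and the remaining
   103 - 25 * 4 = 3 iterations are left for the epilog.  */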
2417 void
2418 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2419 tree *niters_vector_ptr, tree *step_vector_ptr,
2420 bool niters_no_overflow)
2422 tree ni_minus_gap, var;
2423 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2424 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2425 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2426 tree log_vf = NULL_TREE;
2428 /* If epilogue loop is required because of data accesses with gaps, we
2429 subtract one iteration from the total number of iterations here for
2430 correct calculation of RATIO. */
2431 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2433 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2434 build_one_cst (type));
2435 if (!is_gimple_val (ni_minus_gap))
2437 var = create_tmp_var (type, "ni_gap");
2438 gimple *stmts = NULL;
2439 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2440 true, var);
2441 gsi_insert_seq_on_edge_immediate (pe, stmts);
2444 else
2445 ni_minus_gap = niters;
2447 /* To silence some unexpected warnings, simply initialize to 0. */
2448 unsigned HOST_WIDE_INT const_vf = 0;
2449 if (vf.is_constant (&const_vf)
2450 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2452 /* Create: niters >> log2(vf) */
2453 /* If it's known that niters == number of latch executions + 1 doesn't
2454 overflow, we can generate niters >> log2(vf); otherwise we generate
2455 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2456 will be at least one. */
2457 log_vf = build_int_cst (type, exact_log2 (const_vf));
2458 if (niters_no_overflow)
2459 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2460 else
2461 niters_vector
2462 = fold_build2 (PLUS_EXPR, type,
2463 fold_build2 (RSHIFT_EXPR, type,
2464 fold_build2 (MINUS_EXPR, type,
2465 ni_minus_gap,
2466 build_int_cst (type, vf)),
2467 log_vf),
2468 build_int_cst (type, 1));
2469 step_vector = build_one_cst (type);
2471 else
2473 niters_vector = ni_minus_gap;
2474 step_vector = build_int_cst (type, vf);
2477 if (!is_gimple_val (niters_vector))
2479 var = create_tmp_var (type, "bnd");
2480 gimple_seq stmts = NULL;
2481 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2482 gsi_insert_seq_on_edge_immediate (pe, stmts);
2483 /* The peeling algorithm guarantees that the vector loop bound is at least
2484 ONE, so we set range information to make the niters analyzer's life easier.
2485 Note the latch iteration count can be TYPE_MAX_VALUE, so we have to
2486 represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2487 if (stmts != NULL && log_vf)
2489 if (niters_no_overflow)
2491 value_range vr (type,
2492 wi::one (TYPE_PRECISION (type)),
2493 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2494 TYPE_SIGN (type)),
2495 exact_log2 (const_vf),
2496 TYPE_SIGN (type)));
2497 set_range_info (niters_vector, vr);
2499 /* For VF == 1 the vector IV might also overflow so we cannot
2500 assert a minimum value of 1. */
2501 else if (const_vf > 1)
2503 value_range vr (type,
2504 wi::one (TYPE_PRECISION (type)),
2505 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2506 TYPE_SIGN (type))
2507 - (const_vf - 1),
2508 exact_log2 (const_vf), TYPE_SIGN (type))
2509 + 1);
2510 set_range_info (niters_vector, vr);
2514 *niters_vector_ptr = niters_vector;
2515 *step_vector_ptr = step_vector;
2517 return;
2520 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2521 loop specified by LOOP_VINFO after vectorization, compute the number
2522 of iterations before vectorization (niters_vector * vf) and store it
2523 to NITERS_VECTOR_MULT_VF_PTR. */
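/* For example (illustrative values): with niters_vector = 25 and vf = 4
   this computes niters_vector_mult_vf = 25 << log2 (4) = 100, the number
   of scalar iterations covered by the vector loop.  */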
2525 static void
2526 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2527 tree niters_vector,
2528 tree *niters_vector_mult_vf_ptr)
2530 /* We should be using a step_vector of VF if VF is variable. */
2531 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2532 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2533 tree type = TREE_TYPE (niters_vector);
2534 tree log_vf = build_int_cst (type, exact_log2 (vf));
2535 basic_block exit_bb = single_exit (loop)->dest;
2537 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2538 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2539 niters_vector, log_vf);
2540 if (!is_gimple_val (niters_vector_mult_vf))
2542 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2543 gimple_seq stmts = NULL;
2544 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2545 &stmts, true, var);
2546 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2547 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2549 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2552 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2553 this function searches for the corresponding lcssa phi node in exit
2554 bb of LOOP. If it is found, return the phi result; otherwise return
2555 NULL. */
2557 static tree
2558 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2559 gphi *lcssa_phi)
2561 gphi_iterator gsi;
2562 edge e = single_exit (loop);
2564 gcc_assert (single_pred_p (e->dest));
2565 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2567 gphi *phi = gsi.phi ();
2568 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2569 PHI_ARG_DEF (lcssa_phi, 0), 0))
2570 return PHI_RESULT (phi);
2572 return NULL_TREE;
2575 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2576 from SECOND/FIRST and puts it at the original loop's preheader/exit
2577 edge, the two loops are arranged as below:
2579 preheader_a:
2580 first_loop:
2581 header_a:
2582 i_1 = PHI<i_0, i_2>;
2584 i_2 = i_1 + 1;
2585 if (cond_a)
2586 goto latch_a;
2587 else
2588 goto between_bb;
2589 latch_a:
2590 goto header_a;
2592 between_bb:
2593 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2595 second_loop:
2596 header_b:
2597 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2598 or with i_2 if no LCSSA phi is created
2599 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2601 i_4 = i_3 + 1;
2602 if (cond_b)
2603 goto latch_b;
2604 else
2605 goto exit_bb;
2606 latch_b:
2607 goto header_b;
2609 exit_bb:
2611 This function creates loop closed SSA for the first loop; it updates the
2612 second loop's PHI nodes by replacing the argument on the incoming edge with
2613 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2614 is false, loop closed SSA phis will only be created for non-iv phis of
2615 the first loop.
2617 This function assumes exit bb of the first loop is preheader bb of the
2618 second loop, i.e, between_bb in the example code. With PHIs updated,
2619 the second loop will execute the remaining iterations of the first. */
2621 static void
2622 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2623 class loop *first, class loop *second,
2624 bool create_lcssa_for_iv_phis)
2626 gphi_iterator gsi_update, gsi_orig;
2627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2629 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2630 edge second_preheader_e = loop_preheader_edge (second);
2631 basic_block between_bb = single_exit (first)->dest;
2633 gcc_assert (between_bb == second_preheader_e->src);
2634 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2635 /* Either the first loop or the second is the loop to be vectorized. */
2636 gcc_assert (loop == first || loop == second);
2638 for (gsi_orig = gsi_start_phis (first->header),
2639 gsi_update = gsi_start_phis (second->header);
2640 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2641 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2643 gphi *orig_phi = gsi_orig.phi ();
2644 gphi *update_phi = gsi_update.phi ();
2646 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2647 /* Generate lcssa PHI node for the first loop. */
2648 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2649 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2650 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2652 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2653 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2654 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2655 arg = new_res;
2658 /* Update PHI node in the second loop by replacing arg on the loop's
2659 incoming edge. */
2660 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2663 /* For epilogue peeling we have to make sure to copy all LC PHIs
2664 for correct vectorization of live stmts. */
2665 if (loop == first)
2667 basic_block orig_exit = single_exit (second)->dest;
2668 for (gsi_orig = gsi_start_phis (orig_exit);
2669 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2671 gphi *orig_phi = gsi_orig.phi ();
2672 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2673 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2674 continue;
2676 /* Already created in the above loop. */
2677 if (find_guard_arg (first, second, orig_phi))
2678 continue;
2680 tree new_res = copy_ssa_name (orig_arg);
2681 gphi *lcphi = create_phi_node (new_res, between_bb);
2682 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2687 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2688 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2689 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2690 appear like below:
2692 guard_bb:
2693 if (cond)
2694 goto merge_bb;
2695 else
2696 goto skip_loop;
2698 skip_loop:
2699 header_a:
2700 i_1 = PHI<i_0, i_2>;
2702 i_2 = i_1 + 1;
2703 if (cond_a)
2704 goto latch_a;
2705 else
2706 goto exit_a;
2707 latch_a:
2708 goto header_a;
2710 exit_a:
2711 i_5 = PHI<i_2>;
2713 merge_bb:
2714 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2716 update_loop:
2717 header_b:
2718 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2720 i_4 = i_3 + 1;
2721 if (cond_b)
2722 goto latch_b;
2723 else
2724 goto exit_bb;
2725 latch_b:
2726 goto header_b;
2728 exit_bb:
2730 This function creates PHI nodes at merge_bb and replaces the use of i_5
2731 in the update_loop's PHI node with the result of the new PHI. */
2733 static void
2734 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2735 class loop *update_loop,
2736 edge guard_edge, edge merge_edge)
2738 location_t merge_loc, guard_loc;
2739 edge orig_e = loop_preheader_edge (skip_loop);
2740 edge update_e = loop_preheader_edge (update_loop);
2741 gphi_iterator gsi_orig, gsi_update;
2743 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2744 gsi_update = gsi_start_phis (update_loop->header));
2745 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2746 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2748 gphi *orig_phi = gsi_orig.phi ();
2749 gphi *update_phi = gsi_update.phi ();
2751 /* Generate new phi node at merge bb of the guard. */
2752 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2753 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2755 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2756 args in NEW_PHI for these edges. */
2757 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2758 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2759 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2760 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2761 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2762 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2764 /* Update phi in UPDATE_PHI. */
2765 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2769 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2770 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2771 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2772 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2773 The CFG looks like:
2775 loop:
2776 header_a:
2777 i_1 = PHI<i_0, i_2>;
2779 i_2 = i_1 + 1;
2780 if (cond_a)
2781 goto latch_a;
2782 else
2783 goto exit_a;
2784 latch_a:
2785 goto header_a;
2787 exit_a:
2789 guard_bb:
2790 if (cond)
2791 goto merge_bb;
2792 else
2793 goto epilog_loop;
2795 ;; fall_through_bb
2797 epilog_loop:
2798 header_b:
2799 i_3 = PHI<i_2, i_4>;
2801 i_4 = i_3 + 1;
2802 if (cond_b)
2803 goto latch_b;
2804 else
2805 goto merge_bb;
2806 latch_b:
2807 goto header_b;
2809 merge_bb:
2810 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2812 exit_bb:
2813 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2815 For each name used outside EPILOG (i.e. for each name that has an lcssa
2816 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2817 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2818 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2819 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2820 in exit_bb will also be updated. */
2822 static void
2823 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2824 edge guard_edge, edge merge_edge)
2826 gphi_iterator gsi;
2827 basic_block merge_bb = guard_edge->dest;
2829 gcc_assert (single_succ_p (merge_bb));
2830 edge e = single_succ_edge (merge_bb);
2831 basic_block exit_bb = e->dest;
2832 gcc_assert (single_pred_p (exit_bb));
2833 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2835 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2837 gphi *update_phi = gsi.phi ();
2838 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2840 tree merge_arg = NULL_TREE;
2842 /* If the old argument is a SSA_NAME use its current_def. */
2843 if (TREE_CODE (old_arg) == SSA_NAME)
2844 merge_arg = get_current_def (old_arg);
2845 /* If it's a constant or doesn't have a current_def, just use the old
2846 argument. */
2847 if (!merge_arg)
2848 merge_arg = old_arg;
2850 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2851 /* If the var is live after loop but not a reduction, we simply
2852 use the old arg. */
2853 if (!guard_arg)
2854 guard_arg = old_arg;
2856 /* Create new phi node in MERGE_BB: */
2857 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2858 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2860 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2861 the two PHI args in merge_phi for these edges. */
2862 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2863 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2865 /* Update the original phi in exit_bb. */
2866 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2870 /* EPILOG loop is duplicated from the original loop for vectorizing;
2871 the args of its loop closed SSA PHIs need to be updated. */
2873 static void
2874 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2876 gphi_iterator gsi;
2877 basic_block exit_bb = single_exit (epilog)->dest;
2879 gcc_assert (single_pred_p (exit_bb));
2880 edge e = EDGE_PRED (exit_bb, 0);
2881 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2882 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2885 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2886 iterate exactly CONST_NITERS times. Make a final decision about
2887 whether the epilogue loop should be used, returning true if so. */
2889 static bool
2890 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2891 unsigned HOST_WIDE_INT const_niters)
2893 /* Avoid wrap-around when computing const_niters - 1. Also reject
2894 using an epilogue loop for a single scalar iteration, even if
2895 we could in principle implement that using partial vectors. */
2896 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2897 if (const_niters <= gap_niters + 1)
2898 return false;
2900 /* Install the number of iterations. */
2901 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2902 tree niters_tree = build_int_cst (niters_type, const_niters);
2903 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2905 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2906 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2908 /* Decide what to do if the number of epilogue iterations is not
2909 a multiple of the epilogue loop's vectorization factor. */
2910 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2913 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2914 Return a value that equals:
2916 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2917 - SKIP_VALUE when the main loop is skipped. */
2919 tree
2920 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2921 tree skip_value)
2923 gcc_assert (loop_vinfo->main_loop_edge);
2925 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2926 basic_block bb = loop_vinfo->main_loop_edge->dest;
2927 gphi *new_phi = create_phi_node (phi_result, bb);
2928 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2929 UNKNOWN_LOCATION);
2930 add_phi_arg (new_phi, skip_value,
2931 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2932 return phi_result;
2935 /* Function vect_do_peeling.
2937 Input:
2938 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2940 preheader:
2941 LOOP:
2942 header_bb:
2943 loop_body
2944 if (exit_loop_cond) goto exit_bb
2945 else goto header_bb
2946 exit_bb:
2948 - NITERS: The number of iterations of the loop.
2949 - NITERSM1: The number of iterations of the loop's latch.
2950 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2951 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2952 CHECK_PROFITABILITY is true.
2953 Output:
2954 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2955 iterate after vectorization; see vect_set_loop_condition for details.
2956 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2957 should be set to the number of scalar iterations handled by the
2958 vector loop. The SSA name is only used on exit from the loop.
2960 This function peels prolog and epilog from the loop, adds guards skipping
2961 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2962 would look like:
2964 guard_bb_1:
2965 if (prefer_scalar_loop) goto merge_bb_1
2966 else goto guard_bb_2
2968 guard_bb_2:
2969 if (skip_prolog) goto merge_bb_2
2970 else goto prolog_preheader
2972 prolog_preheader:
2973 PROLOG:
2974 prolog_header_bb:
2975 prolog_body
2976 if (exit_prolog_cond) goto prolog_exit_bb
2977 else goto prolog_header_bb
2978 prolog_exit_bb:
2980 merge_bb_2:
2982 vector_preheader:
2983 VECTOR LOOP:
2984 vector_header_bb:
2985 vector_body
2986 if (exit_vector_cond) goto vector_exit_bb
2987 else goto vector_header_bb
2988 vector_exit_bb:
2990 guard_bb_3:
2991 if (skip_epilog) goto merge_bb_3
2992 else goto epilog_preheader
2994 merge_bb_1:
2996 epilog_preheader:
2997 EPILOG:
2998 epilog_header_bb:
2999 epilog_body
3000 if (exit_epilog_cond) goto merge_bb_3
3001 else goto epilog_header_bb
3003 merge_bb_3:
3005 Note this function peels the prolog and epilog, and adds the guards,
3006 only when necessary.
3007 This function returns the epilogue loop if a decision was made to vectorize
3008 it, otherwise NULL.
3010 The analysis resulting in this epilogue loop's loop_vec_info was performed
3011 in the same vect_analyze_loop call as the main loop's. At that time
3012 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3013 vectorization factors than the main loop. This list is stored in the main
3014 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
3015 vectorize the epilogue loop for a lower vectorization factor, the
3016 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
3017 updated and linked to the epilogue loop. This is later used to vectorize
3018 the epilogue. The reason the loop_vec_info needs updating is that it was
3019 constructed based on the original main loop, and the epilogue loop is a
3020 copy of this loop, so all links pointing to statements in the original loop
3021 need updating. Furthermore, these loop_vec_infos share the
3022 data_reference's records, which will also need to be updated.
3024 TODO: Guard for prefer_scalar_loop should be emitted along with
3025 versioning conditions if loop versioning is needed. */
3028 class loop *
3029 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3030 tree *niters_vector, tree *step_vector,
3031 tree *niters_vector_mult_vf_var, int th,
3032 bool check_profitability, bool niters_no_overflow,
3033 tree *advance)
3035 edge e, guard_e;
3036 tree type = TREE_TYPE (niters), guard_cond;
3037 basic_block guard_bb, guard_to;
3038 profile_probability prob_prolog, prob_vector, prob_epilog;
3039 int estimated_vf;
3040 int prolog_peeling = 0;
3041 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
3042 bool vect_epilogues_updated_niters = false;
3043 /* We currently do not support prolog peeling if the target alignment is not
3044 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
3045 target alignment being constant. */
3046 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3047 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
3048 return NULL;
3050 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3051 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3053 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3054 poly_uint64 bound_epilog = 0;
3055 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3056 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3057 bound_epilog += vf - 1;
3058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3059 bound_epilog += 1;
3060 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3061 poly_uint64 bound_scalar = bound_epilog;
3063 if (!prolog_peeling && !epilog_peeling)
3064 return NULL;
3066 /* Before doing any peeling make sure to reset debug binds outside of
3067 the loop referring to defs not in LC SSA. */
3068 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3069 for (unsigned i = 0; i < loop->num_nodes; ++i)
3071 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3072 imm_use_iterator ui;
3073 gimple *use_stmt;
3074 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3075 gsi_next (&gsi))
3077 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3078 if (gimple_debug_bind_p (use_stmt)
3079 && loop != gimple_bb (use_stmt)->loop_father
3080 && !flow_loop_nested_p (loop,
3081 gimple_bb (use_stmt)->loop_father))
3083 gimple_debug_bind_reset_value (use_stmt);
3084 update_stmt (use_stmt);
3087 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3088 gsi_next (&gsi))
3090 ssa_op_iter op_iter;
3091 def_operand_p def_p;
3092 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3093 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3094 if (gimple_debug_bind_p (use_stmt)
3095 && loop != gimple_bb (use_stmt)->loop_father
3096 && !flow_loop_nested_p (loop,
3097 gimple_bb (use_stmt)->loop_father))
3099 gimple_debug_bind_reset_value (use_stmt);
3100 update_stmt (use_stmt);
3105 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3106 estimated_vf = vect_vf_for_cost (loop_vinfo);
3107 if (estimated_vf == 2)
3108 estimated_vf = 3;
3109 prob_prolog = prob_epilog = profile_probability::guessed_always ()
3110 .apply_scale (estimated_vf - 1, estimated_vf);
3112 class loop *prolog, *epilog = NULL;
3113 class loop *first_loop = loop;
3114 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3116 /* SSA form needs to be up-to-date since we are going to manually
3117 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3118 update SSA state after that, so we have to make sure to not lose any
3119 pending update needs. */
3120 gcc_assert (!need_ssa_update_p (cfun));
3122 /* If we're vectorizing an epilogue loop, we have ensured that the
3123 virtual operand is in SSA form throughout the vectorized main loop.
3124 Normally it is possible to trace the updated
3125 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3126 back to scalar-stmt vuses, meaning that the effect of the SSA update
3127 remains local to the main loop. However, there are rare cases in
3128 which the vectorized loop should have vdefs even when the original scalar
3129 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3130 introduces clobbers of the temporary vector array, which in turn
3131 needs new vdefs. If the scalar loop doesn't write to memory, these
3132 new vdefs will be the only ones in the vector loop.
3133 We are currently deferring the update of virtual SSA form and the creation
3134 of a virtual PHI for this case so we do not have to make sure the
3135 newly introduced virtual def is in LCSSA form. */
3137 if (MAY_HAVE_DEBUG_BIND_STMTS)
3139 gcc_assert (!adjust_vec.exists ());
3140 adjust_vec.create (32);
3142 initialize_original_copy_tables ();
3144 /* Record the anchor bb at which the guard should be placed if the scalar
3145 loop might be preferred. */
3146 basic_block anchor = loop_preheader_edge (loop)->src;
3148 /* Generate the number of iterations for the prolog loop. We do this here
3149 so that we can also get the upper bound on the number of iterations. */
3150 tree niters_prolog;
3151 int bound_prolog = 0;
3152 if (prolog_peeling)
3153 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3154 &bound_prolog);
3155 else
3156 niters_prolog = build_int_cst (type, 0);
3158 loop_vec_info epilogue_vinfo = NULL;
3159 if (vect_epilogues)
3161 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3162 loop_vinfo->epilogue_vinfos.ordered_remove (0);
3165 tree niters_vector_mult_vf = NULL_TREE;
3166 /* Saving NITERs before the loop, as this may be changed by prologue. */
3167 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3168 edge update_e = NULL, skip_e = NULL;
3169 unsigned int lowest_vf = constant_lower_bound (vf);
3170 /* If we know the number of scalar iterations for the main loop we should
3171 check whether after the main loop there are enough iterations left over
3172 for the epilogue. */
3173 if (vect_epilogues
3174 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3175 && prolog_peeling >= 0
3176 && known_eq (vf, lowest_vf))
3178 unsigned HOST_WIDE_INT eiters
3179 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
3180 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
3182 eiters -= prolog_peeling;
3183 eiters
3184 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
3186 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
3188 delete epilogue_vinfo;
3189 epilogue_vinfo = NULL;
3190 if (loop_vinfo->epilogue_vinfos.length () == 0)
3192 vect_epilogues = false;
3193 break;
3195 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3196 loop_vinfo->epilogue_vinfos.ordered_remove (0);
3198 vect_epilogues_updated_niters = true;
3200 /* Prolog loop may be skipped. */
3201 bool skip_prolog = (prolog_peeling != 0);
3202 /* Skip this loop and go to the epilog when there are not enough iterations
3203 to enter the vectorized loop. If true we should perform runtime checks on
3204 the NITERS to decide whether we should skip the current vectorized loop.
3205 If we know the number of scalar iterations, we only add such a runtime
3206 check when that number may be smaller than the number of iterations
3207 required to enter this loop; for this we use the upper bounds on the
3208 prolog and epilog peeling. When we don't know the number of iterations
3209 and don't require versioning, the skip is not necessary because we have
3210 asserted that there are enough scalar iterations to enter the main loop.
3211 When we are versioning we only add such a skip if we have chosen to
3212 vectorize the epilogue. */
3214 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3215 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3216 bound_prolog + bound_epilog)
3217 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3218 || vect_epilogues));
3219 /* Epilog loop must be executed if the number of iterations for epilog
3220 loop is known at compile time, otherwise we need to add a check at
3221 the end of vector loop and skip to the end of epilog loop. */
3222 bool skip_epilog = (prolog_peeling < 0
3223 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3224 || !vf.is_constant ());
3225 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
3226 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3227 skip_epilog = false;
3229 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3230 auto_vec<profile_count> original_counts;
3231 basic_block *original_bbs = NULL;
3233 if (skip_vector)
3235 split_edge (loop_preheader_edge (loop));
3237 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3239 original_bbs = get_loop_body (loop);
3240 for (unsigned int i = 0; i < loop->num_nodes; i++)
3241 original_counts.safe_push(original_bbs[i]->count);
3244 /* Due to the order in which we peel prolog and epilog, we first
3245 propagate probability to the whole loop. The purpose is to
3246 avoid adjusting probabilities of both prolog and vector loops
3247 separately. Note in this case, the probability of epilog loop
3248 needs to be scaled back later. */
3249 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3250 if (prob_vector.initialized_p ())
3252 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3253 scale_loop_profile (loop, prob_vector, 0);
3257 if (vect_epilogues)
3258 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3259 use the original scalar loop as remaining epilogue if necessary. */
3260 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3261 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3263 if (prolog_peeling)
3265 e = loop_preheader_edge (loop);
3266 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
3268 /* Peel prolog and put it on preheader edge of loop. */
3269 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
3270 gcc_assert (prolog);
3271 prolog->force_vectorize = false;
3272 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
3273 first_loop = prolog;
3274 reset_original_copy_tables ();
3276 /* Update the number of iterations for prolog loop. */
3277 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3278 vect_set_loop_condition (prolog, NULL, niters_prolog,
3279 step_prolog, NULL_TREE, false);
3281 /* Skip the prolog loop. */
3282 if (skip_prolog)
3284 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3285 niters_prolog, build_int_cst (type, 0));
3286 guard_bb = loop_preheader_edge (prolog)->src;
3287 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3288 guard_to = split_edge (loop_preheader_edge (loop));
3289 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3290 guard_to, guard_bb,
3291 prob_prolog.invert (),
3292 irred_flag);
3293 e = EDGE_PRED (guard_to, 0);
3294 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3295 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3297 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3298 scale_loop_profile (prolog, prob_prolog, bound_prolog);
3301 /* Update init address of DRs. */
3302 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3303 /* Update niters for vector loop. */
3304 LOOP_VINFO_NITERS (loop_vinfo)
3305 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3306 LOOP_VINFO_NITERSM1 (loop_vinfo)
3307 = fold_build2 (MINUS_EXPR, type,
3308 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3309 bool new_var_p = false;
3310 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3311 /* It's guaranteed that vector loop bound before vectorization is at
3312 least VF, so set range information for newly generated var. */
3313 if (new_var_p)
3315 value_range vr (type,
3316 wi::to_wide (build_int_cst (type, lowest_vf)),
3317 wi::to_wide (TYPE_MAX_VALUE (type)));
3318 set_range_info (niters, vr);
3321       /* The prolog iterates at most bound_prolog times, so its latch iterates
3322 	 at most bound_prolog - 1 times.  */
3323 record_niter_bound (prolog, bound_prolog - 1, false, true);
3324 delete_update_ssa ();
3325 adjust_vec_debug_stmts ();
3326 scev_reset ();
3329 if (epilog_peeling)
3331 e = single_exit (loop);
3332 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e));
3334 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3335 said epilog then we should use a copy of the main loop as a starting
3336 point. This loop may have already had some preliminary transformations
3337 to allow for more optimal vectorization, for example if-conversion.
3338 If we are not vectorizing the epilog then we should use the scalar loop
3339 	 as the transformations mentioned above make little or no sense when not
3340 	 vectorizing.  */
3341 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3342 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
3343 gcc_assert (epilog);
3345 epilog->force_vectorize = false;
3346 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
3348       /* The scalar version of the loop may be preferred.  In this case, add
3349 	 a guard and skip to the epilog.  Note this only happens when the
3350 	 number of iterations of the loop is unknown at compile time;
3351 	 otherwise this won't be vectorized.  */
3352 if (skip_vector)
3354 	  /* An additional epilogue iteration is peeled if a gap exists.  */
3355 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3356 bound_prolog, bound_epilog,
3357 th, &bound_scalar,
3358 check_profitability);
3359 /* Build guard against NITERSM1 since NITERS may overflow. */
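	  /* (NITERS equals NITERSM1 + 1, so it can wrap to zero when the loop
	     runs for the maximum count representable in its type; NITERSM1
	     itself cannot.)  */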
3360 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3361 guard_bb = anchor;
3362 guard_to = split_edge (loop_preheader_edge (epilog));
3363 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3364 guard_to, guard_bb,
3365 prob_vector.invert (),
3366 irred_flag);
3367 skip_e = guard_e;
3368 e = EDGE_PRED (guard_to, 0);
3369 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3370 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3372 /* Simply propagate profile info from guard_bb to guard_to which is
3373 a merge point of control flow. */
3374 guard_to->count = guard_bb->count;
3376 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3377 if (vect_epilogues || scalar_loop == NULL)
3379 	      gcc_assert (epilog->num_nodes == loop->num_nodes);
3380 basic_block *bbs = get_loop_body (epilog);
3381 for (unsigned int i = 0; i < epilog->num_nodes; i++)
3383 		  gcc_assert (get_bb_original (bbs[i]) == original_bbs[i]);
3384 bbs[i]->count = original_counts[i];
3386 free (bbs);
3387 free (original_bbs);
3391 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
3392       /* If the loop is peeled a non-zero constant number of times, niters now
3393 	 refers to orig_niters - prolog_peeling; it won't overflow even if
3394 	 orig_niters overflows.  */
3395 niters_no_overflow |= (prolog_peeling > 0);
3396 vect_gen_vector_loop_niters (loop_vinfo, niters,
3397 niters_vector, step_vector,
3398 niters_no_overflow);
3399 if (!integer_onep (*step_vector))
3401 	  /* On exit from the loop we will have an easy way of calculating
3402 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3403 until then. */
3404 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3405 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3406 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3408 else
3409 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3410 &niters_vector_mult_vf);
3411 /* Update IVs of original loop as if they were advanced by
3412 niters_vector_mult_vf steps. */
3413 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3414 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3415 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3416 update_e);
3418 if (skip_epilog)
3420 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3421 niters, niters_vector_mult_vf);
3422 guard_bb = single_exit (loop)->dest;
3423 guard_to = split_edge (single_exit (epilog));
3424 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3425 skip_vector ? anchor : guard_bb,
3426 prob_epilog.invert (),
3427 irred_flag);
3428 if (vect_epilogues)
3429 epilogue_vinfo->skip_this_loop_edge = guard_e;
3430 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3431 single_exit (epilog));
3432 /* Only need to handle basic block before epilog loop if it's not
3433 the guard_bb, which is the case when skip_vector is true. */
3434 if (guard_bb != bb_before_epilog)
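	    /* A rough reading of the combined probability below: when the
	       vector loop can be skipped, the epilog runs either after the
	       vector loop (probability prob_vector * prob_epilog) or directly
	       when the vector loop is skipped (probability
	       prob_vector.invert ()).  */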
3436 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3438 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3440 scale_loop_profile (epilog, prob_epilog, 0);
3442 else
3443 slpeel_update_phi_nodes_for_lcssa (epilog);
3445 unsigned HOST_WIDE_INT bound;
3446 if (bound_scalar.is_constant (&bound))
3448 gcc_assert (bound != 0);
3449 /* -1 to convert loop iterations to latch iterations. */
3450 record_niter_bound (epilog, bound - 1, false, true);
3453 delete_update_ssa ();
3454 adjust_vec_debug_stmts ();
3455 scev_reset ();
3458 if (vect_epilogues)
3460 epilog->aux = epilogue_vinfo;
3461 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3463 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3465       /* We now must calculate NITERS, the number of iterations performed by the
3466 	 previous loop, and EPILOGUE_NITERS, those to be performed by the epilogue.  */
3467 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3468 niters_prolog, niters_vector_mult_vf);
3470       /* If skip_vector we may skip the previous loop; insert a phi-node to
3471 	 determine whether we are coming from the previous vectorized loop
3472 	 using the update_e edge or from the skip_vector basic block using
3473 	 the skip_e edge.  */
3474 if (skip_vector)
3476 gcc_assert (update_e != NULL
3477 && skip_e != NULL
3478 && !vect_epilogues_updated_niters);
3479 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3480 update_e->dest);
3481 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3482 gimple *stmt = gimple_build_assign (new_ssa, niters);
3483 gimple_stmt_iterator gsi;
3484 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3485 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3487 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3488 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3490 else
3492 gsi = gsi_last_bb (update_e->src);
3493 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3496 niters = new_ssa;
3497 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
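	  /* Coming in via SKIP_E the vectorized path (prolog and vector loop)
	     was bypassed, so zero iterations are accounted for there.  */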
3498 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3499 UNKNOWN_LOCATION);
3500 niters = PHI_RESULT (new_phi);
3501 epilogue_vinfo->main_loop_edge = update_e;
3502 epilogue_vinfo->skip_main_loop_edge = skip_e;
3505 /* Set ADVANCE to the number of iterations performed by the previous
3506 loop and its prologue. */
3507 *advance = niters;
3509 if (!vect_epilogues_updated_niters)
3511 /* Subtract the number of iterations performed by the vectorized loop
3512 from the number of total iterations. */
3513 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3514 before_loop_niters,
3515 niters);
3517 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3518 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3519 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3520 epilogue_niters,
3521 build_one_cst (TREE_TYPE (epilogue_niters)));
3523 /* Decide what to do if the number of epilogue iterations is not
3524 a multiple of the epilogue loop's vectorization factor.
3525 We should have rejected the loop during the analysis phase
3526 if this fails. */
3527 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3528 true))
3529 gcc_unreachable ();
3533 adjust_vec.release ();
3534 free_original_copy_tables ();
3536 return vect_epilogues ? epilog : NULL;
3539 /* Function vect_create_cond_for_niters_checks.
3541 Create a conditional expression that represents the run-time checks for
3542 loop's niter. The loop is guaranteed to terminate if the run-time
3543 checks hold.
3545    Input:
3546    COND_EXPR  - input conditional expression.  New conditions will be chained
3547                 with logical AND operation.  If it is NULL, it is set to the
3548                 new condition.
3549    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3550                 assumptions to be checked.
3552 Output:
3553 COND_EXPR - conditional expression.
3555 The returned COND_EXPR is the conditional expression to be used in the
3556 if statement that controls which version of the loop gets executed at
3557 runtime. */
3559 static void
3560 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3562 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3564 if (*cond_expr)
3565 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3566 *cond_expr, part_cond_expr);
3567 else
3568 *cond_expr = part_cond_expr;
3571 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3572 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3574 static void
3575 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3577 if (*cond_expr)
3578 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3579 *cond_expr, part_cond_expr);
3580 else
3581 *cond_expr = part_cond_expr;
3584 /* Function vect_create_cond_for_align_checks.
3586 Create a conditional expression that represents the alignment checks for
3587 all of data references (array element references) whose alignment must be
3588 checked at runtime.
3590 Input:
3591 COND_EXPR - input conditional expression. New conditions will be chained
3592 with logical AND operation.
3593 LOOP_VINFO - two fields of the loop information are used.
3594 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3595 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3597 Output:
3598 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3599 expression.
3600 The returned value is the conditional expression to be used in the if
3601 statement that controls which version of the loop gets executed at runtime.
3603 The algorithm makes two assumptions:
3604 1) The number of bytes "n" in a vector is a power of 2.
3605    2) An address "a" is aligned if a%n is zero, and this
3606       test can be done as a&(n-1) == 0.  For example, for 16
3607       byte vectors the test is a&0xf == 0.  */
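   /* As an illustrative sketch (not emitted verbatim), for two data
      references DR_1 and DR_2 and 16-byte vectors the generated sequence is
      roughly:

        addr2int0 = (signed ptrsize) &DR_1_first_vector;
        addr2int1 = (signed ptrsize) &DR_2_first_vector;
        orptrs1   = addr2int0 | addr2int1;
        andmask   = orptrs1 & 0xf;
        cond      = (andmask == 0);

      so a single AND against the mask tests all references at once.  */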
3609 static void
3610 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3611 tree *cond_expr,
3612 gimple_seq *cond_expr_stmt_list)
3614 const vec<stmt_vec_info> &may_misalign_stmts
3615 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3616 stmt_vec_info stmt_info;
3617 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3618 tree mask_cst;
3619 unsigned int i;
3620 tree int_ptrsize_type;
3621 char tmp_name[20];
3622 tree or_tmp_name = NULL_TREE;
3623 tree and_tmp_name;
3624 gimple *and_stmt;
3625 tree ptrsize_zero;
3626 tree part_cond_expr;
3628 /* Check that mask is one less than a power of 2, i.e., mask is
3629 all zeros followed by all ones. */
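   /* For example, mask == 0xf qualifies since 0xf & 0x10 == 0, whereas
      mask == 0xe does not since 0xe & 0xf == 0xe.  */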
3630 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3632 int_ptrsize_type = signed_type_for (ptr_type_node);
3634   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3635      of the first vector of the i'th data reference.  */
3637 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3639 gimple_seq new_stmt_list = NULL;
3640 tree addr_base;
3641 tree addr_tmp_name;
3642 tree new_or_tmp_name;
3643 gimple *addr_stmt, *or_stmt;
3644 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3645 bool negative = tree_int_cst_compare
3646 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3647 tree offset = negative
3648 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3649 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3650 : size_zero_node;
3652 /* create: addr_tmp = (int)(address_of_first_vector) */
3653 addr_base =
3654 vect_create_addr_base_for_vector_ref (loop_vinfo,
3655 stmt_info, &new_stmt_list,
3656 offset);
3657 if (new_stmt_list != NULL)
3658 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3660 sprintf (tmp_name, "addr2int%d", i);
3661 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3662 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3663 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3665       /* The addresses are ORed together.  */
3667 if (or_tmp_name != NULL_TREE)
3669 /* create: or_tmp = or_tmp | addr_tmp */
3670 sprintf (tmp_name, "orptrs%d", i);
3671 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3672 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3673 or_tmp_name, addr_tmp_name);
3674 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3675 or_tmp_name = new_or_tmp_name;
3677 else
3678 or_tmp_name = addr_tmp_name;
3680 } /* end for i */
3682 mask_cst = build_int_cst (int_ptrsize_type, mask);
3684 /* create: and_tmp = or_tmp & mask */
3685 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3687 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3688 or_tmp_name, mask_cst);
3689 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3691   /* Make and_tmp the left operand of the conditional test against zero.
3692      If and_tmp has a nonzero bit then some address is unaligned.  */
3693 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3694 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3695 and_tmp_name, ptrsize_zero);
3696 chain_cond_expr (cond_expr, part_cond_expr);
3699 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3700 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3701 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3702 and this new condition are true. Treat a null *COND_EXPR as "true". */
3704 static void
3705 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3707 const vec<vec_object_pair> &pairs
3708 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3709 unsigned int i;
3710 vec_object_pair *pair;
3711 FOR_EACH_VEC_ELT (pairs, i, pair)
3713 tree addr1 = build_fold_addr_expr (pair->first);
3714 tree addr2 = build_fold_addr_expr (pair->second);
3715 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3716 addr1, addr2);
3717 chain_cond_expr (cond_expr, part_cond_expr);
3721 /* Create an expression that is true when all lower-bound conditions for
3722 the vectorized loop are met. Chain this condition with *COND_EXPR. */
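   /* Sketching the arithmetic used below: for a possibly-signed EXPR the
      requirement |EXPR| >= BOUND is, in effect, checked as the unsigned
      comparison EXPR + (BOUND - 1) >= 2 * BOUND - 1, which (ignoring wrap at
      the extremes of the type) holds when EXPR >= BOUND or EXPR <= -BOUND.  */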
3724 static void
3725 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3727 const vec<vec_lower_bound> &lower_bounds
3728 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3729 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3731 tree expr = lower_bounds[i].expr;
3732 tree type = unsigned_type_for (TREE_TYPE (expr));
3733 expr = fold_convert (type, expr);
3734 poly_uint64 bound = lower_bounds[i].min_value;
3735 if (!lower_bounds[i].unsigned_p)
3737 expr = fold_build2 (PLUS_EXPR, type, expr,
3738 build_int_cstu (type, bound - 1));
3739 bound += bound - 1;
3741 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3742 build_int_cstu (type, bound));
3743 chain_cond_expr (cond_expr, part_cond_expr);
3747 /* Function vect_create_cond_for_alias_checks.
3749 Create a conditional expression that represents the run-time checks for
3750 overlapping of address ranges represented by a list of data references
3751 relations passed as input.
3753 Input:
3754 COND_EXPR - input conditional expression. New conditions will be chained
3755 with logical AND operation. If it is NULL, then the function
3756 is used to return the number of alias checks.
3757 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3758 to be checked.
3760 Output:
3761 COND_EXPR - conditional expression.
3763 The returned COND_EXPR is the conditional expression to be used in the if
3764 statement that controls which version of the loop gets executed at runtime.
3767 void
3768 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3770 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3771 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3773 if (comp_alias_ddrs.is_empty ())
3774 return;
3776 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3777 &comp_alias_ddrs, cond_expr);
3778 if (dump_enabled_p ())
3779 dump_printf_loc (MSG_NOTE, vect_location,
3780 "created %u versioning for alias checks.\n",
3781 comp_alias_ddrs.length ());
3785 /* Function vect_loop_versioning.
3787    If the loop has data references that may or may not be aligned and/or
3788 has data reference relations whose independence was not proven then
3789 two versions of the loop need to be generated, one which is vectorized
3790 and one which isn't. A test is then generated to control which of the
3791 loops is executed. The test checks for the alignment of all of the
3792 data references that may or may not be aligned. An additional
3793    sequence of runtime tests is generated for each pair of DDRs whose
3794 independence was not proven. The vectorized version of loop is
3795 executed only if both alias and alignment tests are passed.
3797 The test generated to check which version of loop is executed
3798 is modified to also check for profitability as indicated by the
3799 cost model threshold TH.
3801 The versioning precondition(s) are placed in *COND_EXPR and
3802 *COND_EXPR_STMT_LIST. */
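   /* As a rough sketch, the precondition built below is a conjunction of
      (up to) the iteration-count/profitability threshold check, the niter
      assumptions, the alignment checks, the unequal-address, lower-bound and
      alias checks, and the simd-if condition; each conjunct is present only
      when the corresponding kind of versioning is required.  */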
3804 class loop *
3805 vect_loop_versioning (loop_vec_info loop_vinfo,
3806 gimple *loop_vectorized_call)
3808 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3809 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3810 basic_block condition_bb;
3811 gphi_iterator gsi;
3812 gimple_stmt_iterator cond_exp_gsi;
3813 basic_block merge_bb;
3814 basic_block new_exit_bb;
3815 edge new_exit_e, e;
3816 gphi *orig_phi, *new_phi;
3817 tree cond_expr = NULL_TREE;
3818 gimple_seq cond_expr_stmt_list = NULL;
3819 tree arg;
3820 profile_probability prob = profile_probability::likely ();
3821 gimple_seq gimplify_stmt_list = NULL;
3822 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3823 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3824 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3825 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3826 poly_uint64 versioning_threshold
3827 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3828 tree version_simd_if_cond
3829 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3830 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
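  /* Note scalar_loop_iters is LOOP_VINFO_NITERSM1, i.e. the latch iteration
     count, so the comparison against TH - 1 below in effect checks
     niters >= TH.  */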
3832 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3833 && !ordered_p (th, versioning_threshold))
3834 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3835 build_int_cst (TREE_TYPE (scalar_loop_iters),
3836 th - 1));
3837 if (maybe_ne (versioning_threshold, 0U))
3839 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3840 build_int_cst (TREE_TYPE (scalar_loop_iters),
3841 versioning_threshold - 1));
3842 if (cond_expr)
3843 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3844 expr, cond_expr);
3845 else
3846 cond_expr = expr;
3849 tree cost_name = NULL_TREE;
3850 profile_probability prob2 = profile_probability::uninitialized ();
3851 if (cond_expr
3852 && EXPR_P (cond_expr)
3853 && (version_niter
3854 || version_align
3855 || version_alias
3856 || version_simd_if_cond))
3858 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3859 &cond_expr_stmt_list,
3860 is_gimple_val, NULL_TREE);
3861      /* Split prob into two so that the overall probability of passing
3862 	 both the cost-model and versioning checks is the original prob.  */
3863 prob2 = prob.split (prob);
3866 if (version_niter)
3867 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3869 if (cond_expr)
3871 gimple_seq tem = NULL;
3872 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3873 &tem, is_gimple_condexpr_for_cond,
3874 NULL_TREE);
3875 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3878 if (version_align)
3879 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3880 &cond_expr_stmt_list);
3882 if (version_alias)
3884 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3885 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3886 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3889 if (version_simd_if_cond)
3891 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3892 if (flag_checking)
3893 if (basic_block bb
3894 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3895 gcc_assert (bb != loop->header
3896 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3897 && (scalar_loop == NULL
3898 || (bb != scalar_loop->header
3899 && dominated_by_p (CDI_DOMINATORS,
3900 scalar_loop->header, bb))));
3901 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3902 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3903 version_simd_if_cond, zero);
3904 if (cond_expr)
3905 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3906 c, cond_expr);
3907 else
3908 cond_expr = c;
3909 if (dump_enabled_p ())
3910 dump_printf_loc (MSG_NOTE, vect_location,
3911 "created versioning for simd if condition check.\n");
3914 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3915 &gimplify_stmt_list,
3916 is_gimple_condexpr_for_cond, NULL_TREE);
3917 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3919 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3920 invariant in. */
3921 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3922 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3923 !gsi_end_p (gsi); gsi_next (&gsi))
3925 gimple *stmt = gsi_stmt (gsi);
3926 update_stmt (stmt);
3927 ssa_op_iter iter;
3928 use_operand_p use_p;
3929 basic_block def_bb;
3930 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3931 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3932 && flow_bb_inside_loop_p (outermost, def_bb))
3933 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3936 /* Search for the outermost loop we can version. Avoid versioning of
3937 non-perfect nests but allow if-conversion versioned loops inside. */
3938 class loop *loop_to_version = loop;
3939 if (flow_loop_nested_p (outermost, loop))
3941 if (dump_enabled_p ())
3942 dump_printf_loc (MSG_NOTE, vect_location,
3943 "trying to apply versioning to outer loop %d\n",
3944 outermost->num);
3945 if (outermost->num == 0)
3946 outermost = superloop_at_depth (loop, 1);
3947 /* And avoid applying versioning on non-perfect nests. */
3948 while (loop_to_version != outermost
3949 && (e = single_exit (loop_outer (loop_to_version)))
3950 && !(e->flags & EDGE_COMPLEX)
3951 && (!loop_outer (loop_to_version)->inner->next
3952 || vect_loop_vectorized_call (loop_to_version))
3953 && (!loop_outer (loop_to_version)->inner->next
3954 || !loop_outer (loop_to_version)->inner->next->next))
3955 loop_to_version = loop_outer (loop_to_version);
3958 /* Apply versioning. If there is already a scalar version created by
3959 if-conversion re-use that. Note we cannot re-use the copy of
3960 an if-converted outer-loop when vectorizing the inner loop only. */
3961 gcond *cond;
3962 if ((!loop_to_version->inner || loop == loop_to_version)
3963 && loop_vectorized_call)
3965 gcc_assert (scalar_loop);
3966 condition_bb = gimple_bb (loop_vectorized_call);
3967 cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
3968 gimple_cond_set_condition_from_tree (cond, cond_expr);
3969 update_stmt (cond);
3971 if (cond_expr_stmt_list)
3973 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3974 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3975 GSI_SAME_STMT);
3978       /* If-conversion uses profile_probability::always () for both paths,
3979 	 so reset the paths' probabilities appropriately.  */
3980 edge te, fe;
3981 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3982 te->probability = prob;
3983 fe->probability = prob.invert ();
3984       /* We can scale the loop counts immediately but have to postpone
3985 	 scaling the scalar loop because we re-use it during peeling.  */
3986 scale_loop_frequencies (loop_to_version, te->probability);
3987 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3989 nloop = scalar_loop;
3990 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_NOTE, vect_location,
3992 "reusing %sloop version created by if conversion\n",
3993 loop_to_version != loop ? "outer " : "");
3995 else
3997 if (loop_to_version != loop
3998 && dump_enabled_p ())
3999 dump_printf_loc (MSG_NOTE, vect_location,
4000 "applying loop versioning to outer loop %d\n",
4001 loop_to_version->num);
4003 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4005 initialize_original_copy_tables ();
4006 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4007 prob, prob.invert (), prob, prob.invert (), true);
4008 gcc_assert (nloop);
4009 nloop = get_loop_copy (loop);
4011 /* For cycle vectorization with SLP we rely on the PHI arguments
4012 appearing in the same order as the SLP node operands which for the
4013 loop PHI nodes means the preheader edge dest index needs to remain
4014 the same for the analyzed loop which also becomes the vectorized one.
4015 	 Make it so in case the state after versioning differs, by redirecting
4016 	 the first edge into the header to the same destination, which moves
4017 	 it last.  */
4018 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4020 edge e = EDGE_PRED (loop->header, 0);
4021 ssa_redirect_edge (e, e->dest);
4022 flush_pending_stmts (e);
4024 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4026       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy; nobody will
4027 	 reap those otherwise, and they also refer to the original
4028 	 loops.  */
4029 class loop *l = loop;
4030 while (gimple *call = vect_loop_vectorized_call (l))
4032 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4033 fold_loop_internal_call (call, boolean_false_node);
4034 l = loop_outer (l);
4036 free_original_copy_tables ();
4038 if (cond_expr_stmt_list)
4040 cond_exp_gsi = gsi_last_bb (condition_bb);
4041 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4042 GSI_SAME_STMT);
4045 /* Loop versioning violates an assumption we try to maintain during
4046 vectorization - that the loop exit block has a single predecessor.
4047 After versioning, the exit block of both loop versions is the same
4048 basic block (i.e. it has two predecessors). Just in order to simplify
4049 following transformations in the vectorizer, we fix this situation
4050 here by adding a new (empty) block on the exit-edge of the loop,
4051 with the proper loop-exit phis to maintain loop-closed-form.
4052      If loop versioning wasn't done from loop, but scalar_loop instead,
4053      merge_bb will already have just a single predecessor.  */
4055 merge_bb = single_exit (loop_to_version)->dest;
4056 if (EDGE_COUNT (merge_bb->preds) >= 2)
4058 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4059 new_exit_bb = split_edge (single_exit (loop_to_version));
4060 new_exit_e = single_exit (loop_to_version);
4061 e = EDGE_SUCC (new_exit_bb, 0);
4063 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4064 gsi_next (&gsi))
4066 tree new_res;
4067 orig_phi = gsi.phi ();
4068 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4069 new_phi = create_phi_node (new_res, new_exit_bb);
4070 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4071 add_phi_arg (new_phi, arg, new_exit_e,
4072 gimple_phi_arg_location_from_edge (orig_phi, e));
4073 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4077 update_ssa (TODO_update_ssa_no_phi);
4080 /* Split the cost model check off to a separate BB. Costing assumes
4081 this is the only thing we perform when we enter the scalar loop
4082 from a failed cost decision. */
4083 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4085 gimple *def = SSA_NAME_DEF_STMT (cost_name);
4086 gcc_assert (gimple_bb (def) == condition_bb);
4087 /* All uses of the cost check are 'true' after the check we
4088 are going to insert. */
4089 replace_uses_by (cost_name, boolean_true_node);
4090 /* And we're going to build the new single use of it. */
4091 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4092 NULL_TREE, NULL_TREE);
4093 edge e = split_block (gimple_bb (def), def);
4094 gimple_stmt_iterator gsi = gsi_for_stmt (def);
4095 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4096 edge true_e, false_e;
4097 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4098 e->flags &= ~EDGE_FALLTHRU;
4099 e->flags |= EDGE_TRUE_VALUE;
4100 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4101 e->probability = prob2;
4102 e2->probability = prob2.invert ();
4103 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4104 auto_vec<basic_block, 3> adj;
4105 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4106 son;
4107 son = next_dom_son (CDI_DOMINATORS, son))
4108 if (EDGE_COUNT (son->preds) > 1)
4109 adj.safe_push (son);
4110 for (auto son : adj)
4111 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4114 if (version_niter)
4116       /* The versioned loop could be infinite; we need to clear existing
4117 	 niter information, which is copied from the original loop.  */
4118 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4119 vect_free_loop_info_assumptions (nloop);
4122 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4123 && dump_enabled_p ())
4125 if (version_alias)
4126 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4127 vect_location,
4128 "loop versioned for vectorization because of "
4129 "possible aliasing\n");
4130 if (version_align)
4131 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4132 vect_location,
4133 "loop versioned for vectorization to enhance "
4134 "alignment\n");
4138 return nloop;