gcc/tree-vect-loop-manip.cc
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53 #include "langhooks.h"
54 #include "tree-vector-builder.h"
55 #include "optabs-tree.h"
57 /*************************************************************************
58 Simple Loop Peeling Utilities
60 Utilities to support loop peeling for vectorization purposes.
61 *************************************************************************/
64 /* Renames the use *OP_P. */
66 static void
67 rename_use_op (use_operand_p op_p)
69 tree new_name;
71 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
72 return;
74 new_name = get_current_def (USE_FROM_PTR (op_p));
76 /* Something defined outside of the loop. */
77 if (!new_name)
78 return;
80 /* An ordinary ssa name defined in the loop. */
82 SET_USE (op_p, new_name);
86 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
87 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
88 true. */
90 static void
91 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
93 gimple *stmt;
94 use_operand_p use_p;
95 ssa_op_iter iter;
96 edge e;
97 edge_iterator ei;
98 class loop *loop = bb->loop_father;
99 class loop *outer_loop = NULL;
101 if (rename_from_outer_loop)
103 gcc_assert (loop);
104 outer_loop = loop_outer (loop);
107 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
108 gsi_next (&gsi))
110 stmt = gsi_stmt (gsi);
111 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
112 rename_use_op (use_p);
115 FOR_EACH_EDGE (e, ei, bb->preds)
117 if (!flow_bb_inside_loop_p (loop, e->src))
119 if (!rename_from_outer_loop)
120 continue;
121 if (e->src != outer_loop->header)
123 if (outer_loop->inner->next)
125 /* If outer_loop has 2 inner loops, allow there to
126 be an extra basic block which decides which of the
127 two loops to use using LOOP_VECTORIZED. */
128 if (!single_pred_p (e->src)
129 || single_pred (e->src) != outer_loop->header)
130 continue;
134 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
135 gsi_next (&gsi))
136 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
141 struct adjust_info
143 tree from, to;
144 basic_block bb;
147 /* A stack of values to be adjusted in debug stmts. We have to
148 process them LIFO, so that the closest substitution applies. If we
149 processed them FIFO, without the stack, we might substitute uses
150 with a PHI DEF that would soon become non-dominant, and when we got
151 to the suitable one, it wouldn't have anything to substitute any
152 more. */
153 static vec<adjust_info, va_heap> adjust_vec;
155 /* Adjust any debug stmts that referenced AI->from values to use the
156 loop-closed AI->to, if the references are dominated by AI->bb and
157 not by the definition of AI->from. */
159 static void
160 adjust_debug_stmts_now (adjust_info *ai)
162 basic_block bbphi = ai->bb;
163 tree orig_def = ai->from;
164 tree new_def = ai->to;
165 imm_use_iterator imm_iter;
166 gimple *stmt;
167 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
169 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
171 /* Adjust any debug stmts that held onto non-loop-closed
172 references. */
173 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
175 use_operand_p use_p;
176 basic_block bbuse;
178 if (!is_gimple_debug (stmt))
179 continue;
181 gcc_assert (gimple_debug_bind_p (stmt));
183 bbuse = gimple_bb (stmt);
185 if ((bbuse == bbphi
186 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
187 && !(bbuse == bbdef
188 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
190 if (new_def)
191 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
192 SET_USE (use_p, new_def);
193 else
195 gimple_debug_bind_reset_value (stmt);
196 update_stmt (stmt);
202 /* Adjust debug stmts as scheduled before. */
204 static void
205 adjust_vec_debug_stmts (void)
207 if (!MAY_HAVE_DEBUG_BIND_STMTS)
208 return;
210 gcc_assert (adjust_vec.exists ());
212 while (!adjust_vec.is_empty ())
214 adjust_debug_stmts_now (&adjust_vec.last ());
215 adjust_vec.pop ();
219 /* Adjust any debug stmts that referenced FROM values to use the
220 loop-closed TO, if the references are dominated by BB and not by
221 the definition of FROM. If adjust_vec is non-NULL, adjustments
222 will be postponed until adjust_vec_debug_stmts is called. */
224 static void
225 adjust_debug_stmts (tree from, tree to, basic_block bb)
227 adjust_info ai;
229 if (MAY_HAVE_DEBUG_BIND_STMTS
230 && TREE_CODE (from) == SSA_NAME
231 && ! SSA_NAME_IS_DEFAULT_DEF (from)
232 && ! virtual_operand_p (from))
234 ai.from = from;
235 ai.to = to;
236 ai.bb = bb;
238 if (adjust_vec.exists ())
239 adjust_vec.safe_push (ai);
240 else
241 adjust_debug_stmts_now (&ai);
245 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
246 to adjust any debug stmts that referenced the old phi arg,
247 presumably non-loop-closed references left over from other
248 transformations. */
250 static void
251 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
253 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
255 gcc_assert (TREE_CODE (orig_def) != SSA_NAME
256 || orig_def != new_def);
258 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
260 if (MAY_HAVE_DEBUG_BIND_STMTS)
261 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
262 gimple_bb (update_phi));
265 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
266 that the control should have during the first iteration and NEXT_CTRL is the
267 value that it should have on subsequent iterations. */
269 static void
270 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
271 tree next_ctrl)
273 gphi *phi = create_phi_node (ctrl, loop->header);
274 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
275 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
278 /* Add SEQ to the end of LOOP's preheader block. */
280 static void
281 add_preheader_seq (class loop *loop, gimple_seq seq)
283 if (seq)
285 edge pe = loop_preheader_edge (loop);
286 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
287 gcc_assert (!new_bb);
291 /* Add SEQ to the beginning of LOOP's header block. */
293 static void
294 add_header_seq (class loop *loop, gimple_seq seq)
296 if (seq)
298 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
299 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
303 /* Return true if the target can interleave elements of two vectors.
304 OFFSET is 0 if the first half of the vectors should be interleaved
305 or 1 if the second half should. When returning true, store the
306 associated permutation in INDICES. */
308 static bool
309 interleave_supported_p (vec_perm_indices *indices, tree vectype,
310 unsigned int offset)
312 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
313 poly_uint64 base = exact_div (nelts, 2) * offset;
314 vec_perm_builder sel (nelts, 2, 3);
315 for (unsigned int i = 0; i < 3; ++i)
317 sel.quick_push (base + i);
318 sel.quick_push (base + i + nelts);
320 indices->new_vector (sel, 2, nelts);
321 return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
322 *indices);
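/* Illustrative sketch (standalone C++, not part of this file): the shape of
   the interleave selector built above.  For a fixed NELTS and OFFSET, the
   permutation picks alternating elements from the chosen half of the two
   input vectors; e.g. NELTS == 8, OFFSET == 1 gives { 4, 12, 5, 13, 6, 14,
   7, 15 }.  The helper name below is hypothetical.  */
#if 0
#include <cstdio>
#include <vector>

static std::vector<unsigned>
interleave_indices (unsigned nelts, unsigned offset)
{
  unsigned base = nelts / 2 * offset;
  std::vector<unsigned> sel;
  for (unsigned i = 0; i < nelts / 2; ++i)
    {
      sel.push_back (base + i);           /* lane from the first input vector */
      sel.push_back (base + i + nelts);   /* matching lane from the second */
    }
  return sel;
}

int
main ()
{
  for (unsigned idx : interleave_indices (8, 1))
    std::printf ("%u ", idx);   /* prints: 4 12 5 13 6 14 7 15 */
  std::printf ("\n");
}
#endif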
325 /* Try to use permutes to define the masks in DEST_RGM using the masks
326 in SRC_RGM, given that the former has twice as many masks as the
327 latter. Return true on success, adding any new statements to SEQ. */
329 static bool
330 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
331 rgroup_controls *src_rgm)
333 tree src_masktype = src_rgm->type;
334 tree dest_masktype = dest_rgm->type;
335 machine_mode src_mode = TYPE_MODE (src_masktype);
336 insn_code icode1, icode2;
337 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
338 && (icode1 = optab_handler (vec_unpacku_hi_optab,
339 src_mode)) != CODE_FOR_nothing
340 && (icode2 = optab_handler (vec_unpacku_lo_optab,
341 src_mode)) != CODE_FOR_nothing)
343 /* Unpacking the source masks gives at least as many mask bits as
344 we need. We can then VIEW_CONVERT any excess bits away. */
345 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
346 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
347 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
348 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
350 tree src = src_rgm->controls[i / 2];
351 tree dest = dest_rgm->controls[i];
352 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
353 ? VEC_UNPACK_HI_EXPR
354 : VEC_UNPACK_LO_EXPR);
355 gassign *stmt;
356 if (dest_masktype == unpack_masktype)
357 stmt = gimple_build_assign (dest, code, src);
358 else
360 tree temp = make_ssa_name (unpack_masktype);
361 stmt = gimple_build_assign (temp, code, src);
362 gimple_seq_add_stmt (seq, stmt);
363 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
364 build1 (VIEW_CONVERT_EXPR,
365 dest_masktype, temp));
367 gimple_seq_add_stmt (seq, stmt);
369 return true;
371 vec_perm_indices indices[2];
372 if (dest_masktype == src_masktype
373 && interleave_supported_p (&indices[0], src_masktype, 0)
374 && interleave_supported_p (&indices[1], src_masktype, 1))
376 /* The destination requires twice as many mask bits as the source, so
377 we can use interleaving permutes to double up the number of bits. */
378 tree masks[2];
379 for (unsigned int i = 0; i < 2; ++i)
380 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
381 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
383 tree src = src_rgm->controls[i / 2];
384 tree dest = dest_rgm->controls[i];
385 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
386 src, src, masks[i & 1]);
387 gimple_seq_add_stmt (seq, stmt);
389 return true;
391 return false;
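/* Illustrative sketch (standalone C++, not part of this file): scalar model
   of the VEC_PERM fallback above.  Interleaving a mask with itself makes
   every source lane drive two adjacent destination lanes, so the two
   destination masks (offsets 0 and 1) together cover twice as many scalars
   as the source mask.  The helper name below is hypothetical.  */
#if 0
#include <vector>

static std::vector<bool>
interleave_mask_with_self (const std::vector<bool> &src, unsigned offset)
{
  unsigned n = src.size ();
  unsigned base = n / 2 * offset;
  std::vector<bool> dest (n);
  for (unsigned j = 0; j < n; ++j)
    /* Both VEC_PERM operands are SRC, so selector elements BASE + J/2 and
       BASE + J/2 + N read the same lane.  */
    dest[j] = src[base + j / 2];
  return dest;
}
/* e.g. src = { 1, 1, 1, 0, 0, 0, 0, 0 } gives
   offset 0: { 1, 1, 1, 1, 1, 1, 0, 0 }   offset 1: { 0, 0, 0, 0, 0, 0, 0, 0 }.  */
#endif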
394 /* Populate DEST_RGM->controls, given that they should add up to STEP.
396 STEP = MIN_EXPR <ivtmp_34, VF>;
398 First length (MIN (X, VF/N)):
399 loop_len_15 = MIN_EXPR <STEP, VF/N>;
401 Second length:
402 tmp = STEP - loop_len_15;
403 loop_len_16 = MIN (tmp, VF/N);
405 Third length:
406 tmp2 = tmp - loop_len_16;
407 loop_len_17 = MIN (tmp2, VF/N);
409 Last length:
410 loop_len_18 = tmp2 - loop_len_17;
413 static void
414 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
415 rgroup_controls *dest_rgm, tree step)
417 tree ctrl_type = dest_rgm->type;
418 poly_uint64 nitems_per_ctrl
419 = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
420 tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
422 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
424 tree ctrl = dest_rgm->controls[i];
425 if (i == 0)
427 /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
428 gassign *assign
429 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
430 gimple_seq_add_stmt (seq, assign);
432 else if (i == dest_rgm->controls.length () - 1)
434 /* Last iteration: Remain capped to the range [0, VF/N]. */
435 gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
436 dest_rgm->controls[i - 1]);
437 gimple_seq_add_stmt (seq, assign);
439 else
441 /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
442 step = gimple_build (seq, MINUS_EXPR, iv_type, step,
443 dest_rgm->controls[i - 1]);
444 gassign *assign
445 = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
446 gimple_seq_add_stmt (seq, assign);
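/* Illustrative sketch (standalone C++, not part of this file): scalar
   semantics of the length splitting above.  STEP items remain for the
   current iteration and each of the N controls may cover at most LIMIT
   (VF/N) items; every control but the last takes MIN (remaining, LIMIT)
   and the last takes whatever is left.  The helper name below is
   hypothetical.  */
#if 0
#include <algorithm>
#include <vector>

static std::vector<unsigned>
split_step_into_lens (unsigned step, unsigned limit, unsigned ncontrols)
{
  std::vector<unsigned> lens (ncontrols);
  unsigned remaining = step;
  for (unsigned i = 0; i + 1 < ncontrols; ++i)
    {
      lens[i] = std::min (remaining, limit);  /* loop_len_i = MIN (remaining, VF/N) */
      remaining -= lens[i];
    }
  lens[ncontrols - 1] = remaining;            /* last length: the leftover items */
  return lens;
}
/* e.g. split_step_into_lens (10, 4, 3) yields { 4, 4, 2 }.  */
#endif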
451 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
452 for all the rgroup controls in RGC and return a control that is nonzero
453 when the loop needs to iterate. Add any new preheader statements to
454 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
456 RGC belongs to loop LOOP. The loop originally iterated NITERS
457 times and has been vectorized according to LOOP_VINFO.
459 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
460 starts with NITERS_SKIP dummy iterations of the scalar loop before
461 the real work starts. The mask elements for these dummy iterations
462 must be 0, to ensure that the extra iterations do not have an effect.
464 It is known that:
466 NITERS * RGC->max_nscalars_per_iter * RGC->factor
468 does not overflow. However, MIGHT_WRAP_P says whether an induction
469 variable that starts at 0 and has step:
471 VF * RGC->max_nscalars_per_iter * RGC->factor
473 might overflow before hitting a value above:
475 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
477 This means that we cannot guarantee that such an induction variable
478 would ever hit a value that produces a set of all-false masks or zero
479 lengths for RGC.
481 Note: the cost of the code generated by this function is modeled
482 by vect_estimate_min_profitable_iters, so changes here may need
483 corresponding changes there. */
485 static tree
486 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
487 gimple_seq *preheader_seq,
488 gimple_seq *header_seq,
489 gimple_stmt_iterator loop_cond_gsi,
490 rgroup_controls *rgc, tree niters,
491 tree niters_skip, bool might_wrap_p,
492 tree *iv_step, tree *compare_step)
494 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
495 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
496 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
498 tree ctrl_type = rgc->type;
499 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
500 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
501 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
502 tree length_limit = NULL_TREE;
503 /* For lengths, we need LENGTH_LIMIT to ensure each length stays in range. */
504 if (!use_masks_p)
505 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
507 /* Calculate the maximum number of item values that the rgroup
508 handles in total, the number that it handles for each iteration
509 of the vector loop, and the number that it should skip during the
510 first iteration of the vector loop. */
511 tree nitems_total = niters;
512 tree nitems_step = build_int_cst (iv_type, vf);
513 tree nitems_skip = niters_skip;
514 if (nitems_per_iter != 1)
516 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
517 these multiplications don't overflow. */
518 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
519 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
520 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
521 nitems_total, compare_factor);
522 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
523 nitems_step, iv_factor);
524 if (nitems_skip)
525 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
526 nitems_skip, compare_factor);
529 /* Create an induction variable that counts the number of items
530 processed. */
531 tree index_before_incr, index_after_incr;
532 gimple_stmt_iterator incr_gsi;
533 bool insert_after;
534 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
535 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
537 /* Create an IV that counts down from niters_total and whose step
538 is the (variable) amount processed in the current iteration:
540 _10 = (unsigned long) count_12(D);
542 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
543 _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
545 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
547 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
549 if (ivtmp_9 > POLY_INT_CST [4, 4])
550 goto <bb 4>; [83.33%]
551 else
552 goto <bb 5>; [16.67%]
554 nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
555 tree step = rgc->controls.length () == 1 ? rgc->controls[0]
556 : make_ssa_name (iv_type);
557 /* Create decrement IV. */
558 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
560 create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
561 insert_after, &index_before_incr, &index_after_incr);
562 tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
563 index_before_incr, nitems_step);
564 gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
566 else
568 create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
569 &incr_gsi, insert_after, &index_before_incr,
570 &index_after_incr);
571 gimple_seq_add_stmt (header_seq,
572 gimple_build_assign (step, MIN_EXPR,
573 index_before_incr,
574 nitems_step));
576 *iv_step = step;
577 *compare_step = nitems_step;
578 return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
579 : index_before_incr;
582 /* Create increment IV. */
583 create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
584 loop, &incr_gsi, insert_after, &index_before_incr,
585 &index_after_incr);
587 tree zero_index = build_int_cst (compare_type, 0);
588 tree test_index, test_limit, first_limit;
589 gimple_stmt_iterator *test_gsi;
590 if (might_wrap_p)
592 /* In principle the loop should stop iterating once the incremented
593 IV reaches a value greater than or equal to:
595 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
597 However, there's no guarantee that this addition doesn't overflow
598 the comparison type, or that the IV hits a value above it before
599 wrapping around. We therefore adjust the limit down by one
600 IV step:
602 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
603 -[infinite-prec] NITEMS_STEP
605 and compare the IV against this limit _before_ incrementing it.
606 Since the comparison type is unsigned, we actually want the
607 subtraction to saturate at zero:
609 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
610 -[sat] NITEMS_STEP
612 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
614 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
616 where the rightmost subtraction can be done directly in
617 COMPARE_TYPE. */
618 test_index = index_before_incr;
619 tree adjust = gimple_convert (preheader_seq, compare_type,
620 nitems_step);
621 if (nitems_skip)
622 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
623 adjust, nitems_skip);
624 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
625 nitems_total, adjust);
626 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
627 test_limit, adjust);
628 test_gsi = &incr_gsi;
630 /* Get a safe limit for the first iteration. */
631 if (nitems_skip)
633 /* The first vector iteration can handle at most NITEMS_STEP
634 items. NITEMS_STEP <= CONST_LIMIT, and adding
635 NITEMS_SKIP to that cannot overflow. */
636 tree const_limit = build_int_cst (compare_type,
637 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
638 * nitems_per_iter);
639 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
640 nitems_total, const_limit);
641 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
642 first_limit, nitems_skip);
644 else
645 /* For the first iteration it doesn't matter whether the IV hits
646 a value above NITEMS_TOTAL. That only matters for the latch
647 condition. */
648 first_limit = nitems_total;
650 else
652 /* Test the incremented IV, which will always hit a value above
653 the bound before wrapping. */
654 test_index = index_after_incr;
655 test_limit = nitems_total;
656 if (nitems_skip)
657 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
658 test_limit, nitems_skip);
659 test_gsi = &loop_cond_gsi;
661 first_limit = test_limit;
664 /* Convert the IV value to the comparison type (either a no-op or
665 a demotion). */
666 gimple_seq test_seq = NULL;
667 test_index = gimple_convert (&test_seq, compare_type, test_index);
668 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
670 /* Provide a definition of each control in the group. */
671 tree next_ctrl = NULL_TREE;
672 tree ctrl;
673 unsigned int i;
674 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
676 /* Previous controls will cover BIAS items. This control covers the
677 next batch. */
678 poly_uint64 bias = nitems_per_ctrl * i;
679 tree bias_tree = build_int_cst (compare_type, bias);
681 /* See whether the first iteration of the vector loop is known
682 to have a full control. */
683 poly_uint64 const_limit;
684 bool first_iteration_full
685 = (poly_int_tree_p (first_limit, &const_limit)
686 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
688 /* Rather than have a new IV that starts at BIAS and goes up to
689 TEST_LIMIT, prefer to use the same 0-based IV for each control
690 and adjust the bound down by BIAS. */
691 tree this_test_limit = test_limit;
692 if (i != 0)
694 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
695 compare_type, this_test_limit,
696 bias_tree);
697 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
698 compare_type, this_test_limit,
699 bias_tree);
702 /* Create the initial control. First include all items that
703 are within the loop limit. */
704 tree init_ctrl = NULL_TREE;
705 if (!first_iteration_full)
707 tree start, end;
708 if (first_limit == test_limit)
710 /* Use a natural test between zero (the initial IV value)
711 and the loop limit. The "else" block would be valid too,
712 but this choice can avoid the need to load BIAS_TREE into
713 a register. */
714 start = zero_index;
715 end = this_test_limit;
717 else
719 /* FIRST_LIMIT is the maximum number of items handled by the
720 first iteration of the vector loop. Test the portion
721 associated with this control. */
722 start = bias_tree;
723 end = first_limit;
726 if (use_masks_p)
727 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
728 start, end, "max_mask");
729 else
731 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
732 gimple_seq seq = vect_gen_len (init_ctrl, start,
733 end, length_limit);
734 gimple_seq_add_seq (preheader_seq, seq);
738 /* Now AND out the bits that are within the number of skipped
739 items. */
740 poly_uint64 const_skip;
741 if (nitems_skip
742 && !(poly_int_tree_p (nitems_skip, &const_skip)
743 && known_le (const_skip, bias)))
745 gcc_assert (use_masks_p);
746 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
747 bias_tree, nitems_skip);
748 if (init_ctrl)
749 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
750 init_ctrl, unskipped_mask);
751 else
752 init_ctrl = unskipped_mask;
755 if (!init_ctrl)
757 /* First iteration is full. */
758 if (use_masks_p)
759 init_ctrl = build_minus_one_cst (ctrl_type);
760 else
761 init_ctrl = length_limit;
764 /* Get the control value for the next iteration of the loop. */
765 if (use_masks_p)
767 gimple_seq stmts = NULL;
768 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
769 this_test_limit, "next_mask");
770 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
772 else
774 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
775 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
776 length_limit);
777 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
780 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
783 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
784 if (partial_load_bias != 0)
786 tree adjusted_len = rgc->bias_adjusted_ctrl;
787 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
788 rgc->controls[0],
789 build_int_cst
790 (TREE_TYPE (rgc->controls[0]),
791 partial_load_bias));
792 gimple_seq_add_stmt (header_seq, minus);
795 return next_ctrl;
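/* Illustrative sketch (standalone C++, not part of this file): the
   MAX_EXPR/MINUS_EXPR pair used for MIGHT_WRAP_P above is an unsigned
   saturating subtraction, i.e. the "NITEMS_TOTAL -[sat] (NITEMS_STEP -
   NITEMS_SKIP)" limit from the comment, computed without risk of wrapping.
   The helper name below is hypothetical.  */
#if 0
#include <algorithm>
#include <cassert>

static unsigned
saturating_sub (unsigned total, unsigned adjust)
{
  /* If TOTAL < ADJUST, the MAX clamps the minuend to ADJUST, so the
     subtraction yields 0 instead of a huge wrapped value.  */
  return std::max (total, adjust) - adjust;
}

int
main ()
{
  assert (saturating_sub (100, 16) == 84);  /* normal case */
  assert (saturating_sub (7, 16) == 0);     /* would wrap; saturates to 0 */
}
#endif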
798 /* Set up the iteration condition and rgroup controls for LOOP, given
799 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
800 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
801 the number of iterations of the original scalar loop that should be
802 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
803 for vect_set_loop_condition.
805 Insert the branch-back condition before LOOP_COND_GSI and return the
806 final gcond. */
808 static gcond *
809 vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
810 loop_vec_info loop_vinfo, tree niters,
811 tree final_iv, bool niters_maybe_zero,
812 gimple_stmt_iterator loop_cond_gsi)
814 gimple_seq preheader_seq = NULL;
815 gimple_seq header_seq = NULL;
817 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
818 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
819 unsigned int compare_precision = TYPE_PRECISION (compare_type);
820 tree orig_niters = niters;
822 /* Type of the initial value of NITERS. */
823 tree ni_actual_type = TREE_TYPE (niters);
824 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
825 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
826 if (niters_skip)
827 niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
829 /* Convert NITERS to the same size as the compare. */
830 if (compare_precision > ni_actual_precision
831 && niters_maybe_zero)
833 /* We know that there is always at least one iteration, so if the
834 count is zero then it must have wrapped. Cope with this by
835 subtracting 1 before the conversion and adding 1 to the result. */
836 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
837 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
838 niters, build_minus_one_cst (ni_actual_type));
839 niters = gimple_convert (&preheader_seq, compare_type, niters);
840 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
841 niters, build_one_cst (compare_type));
843 else
844 niters = gimple_convert (&preheader_seq, compare_type, niters);
846 /* Iterate over all the rgroups and fill in their controls. We could use
847 the first control from any rgroup for the loop condition; here we
848 arbitrarily pick the last. */
849 tree test_ctrl = NULL_TREE;
850 tree iv_step = NULL_TREE;
851 tree compare_step = NULL_TREE;
852 rgroup_controls *rgc;
853 rgroup_controls *iv_rgc = nullptr;
854 unsigned int i;
855 auto_vec<rgroup_controls> *controls = use_masks_p
856 ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
857 : &LOOP_VINFO_LENS (loop_vinfo);
858 FOR_EACH_VEC_ELT (*controls, i, rgc)
859 if (!rgc->controls.is_empty ())
861 /* First try using permutes. This adds a single vector
862 instruction to the loop for each mask, but needs no extra
863 loop invariants or IVs. */
864 unsigned int nmasks = i + 1;
865 if (use_masks_p && (nmasks & 1) == 0)
867 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
868 if (!half_rgc->controls.is_empty ()
869 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
870 continue;
873 if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
874 || !iv_rgc
875 || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
876 != rgc->max_nscalars_per_iter * rgc->factor))
878 /* See whether zero-based IV would ever generate all-false masks
879 or zero length before wrapping around. */
880 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
882 /* Set up all controls for this group. */
883 test_ctrl
884 = vect_set_loop_controls_directly (loop, loop_vinfo,
885 &preheader_seq, &header_seq,
886 loop_cond_gsi, rgc, niters,
887 niters_skip, might_wrap_p,
888 &iv_step, &compare_step);
890 iv_rgc = rgc;
893 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
894 && rgc->controls.length () > 1)
896 /* vect_set_loop_controls_directly creates an IV whose step
897 is equal to the expected sum of RGC->controls. Use that
898 information to populate RGC->controls. */
899 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
900 gcc_assert (iv_step);
901 vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
905 /* Emit all accumulated statements. */
906 add_preheader_seq (loop, preheader_seq);
907 add_header_seq (loop, header_seq);
909 /* Get a boolean result that tells us whether to iterate. */
910 gcond *cond_stmt;
911 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
912 && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
914 gcc_assert (compare_step);
915 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
916 cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
917 NULL_TREE);
919 else
921 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
922 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
923 cond_stmt
924 = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
926 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
928 /* The loop iterates (NITERS - 1) / VF + 1 times.
929 Subtract one from this to get the latch count. */
930 tree step = build_int_cst (compare_type,
931 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
932 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
933 build_minus_one_cst (compare_type));
934 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
935 niters_minus_one, step);
937 if (final_iv)
939 gassign *assign = gimple_build_assign (final_iv, orig_niters);
940 gsi_insert_on_edge_immediate (exit_edge, assign);
943 return cond_stmt;
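/* Illustrative sketch (standalone C++, not part of this file): the widening
   trick used above when NITERS_MAYBE_ZERO.  A zero count in the narrow
   unsigned type really means 2^precision iterations; subtracting one before
   the conversion and adding one after it recovers that value in the wider
   compare type.  The helper name below is hypothetical.  */
#if 0
#include <cassert>
#include <cstdint>

static uint64_t
widen_niters (uint32_t niters)
{
  /* (NITERS - 1) wraps to 0xffffffff when NITERS == 0; widening and
     adding 1 then gives 0x100000000, the true iteration count.  */
  return (uint64_t) (uint32_t) (niters - 1) + 1;
}

int
main ()
{
  assert (widen_niters (5) == 5);
  assert (widen_niters (0) == (uint64_t) 1 << 32);
}
#endif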
946 /* Set up the iteration condition and rgroup controls for LOOP in AVX512
947 style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
948 vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
949 the number of iterations of the original scalar loop that should be
950 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
951 for vect_set_loop_condition.
953 Insert the branch-back condition before LOOP_COND_GSI and return the
954 final gcond. */
956 static gcond *
957 vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
958 edge exit_edge,
959 loop_vec_info loop_vinfo, tree niters,
960 tree final_iv,
961 bool niters_maybe_zero,
962 gimple_stmt_iterator loop_cond_gsi)
964 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
965 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
966 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
967 tree orig_niters = niters;
968 gimple_seq preheader_seq = NULL;
970 /* Create an IV that counts down from niters and whose step
971 is the number of iterations processed in the current iteration.
972 Produce the controls with compares like the following.
974 # iv_2 = PHI <niters, iv_3>
975 rem_4 = MIN <iv_2, VF>;
976 remv_6 = { rem_4, rem_4, rem_4, ... }
977 mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
978 iv_3 = iv_2 - VF;
979 if (iv_2 > VF)
980 continue;
982 Where the constant is built with elements at most VF - 1 and
983 repetitions according to max_nscalars_per_iter which is guaranteed
984 to be the same within a group. */
986 /* Convert NITERS to the determined IV type. */
987 if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
988 && niters_maybe_zero)
990 /* We know that there is always at least one iteration, so if the
991 count is zero then it must have wrapped. Cope with this by
992 subtracting 1 before the conversion and adding 1 to the result. */
993 gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
994 niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
995 niters, build_minus_one_cst (TREE_TYPE (niters)));
996 niters = gimple_convert (&preheader_seq, iv_type, niters);
997 niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
998 niters, build_one_cst (iv_type));
1000 else
1001 niters = gimple_convert (&preheader_seq, iv_type, niters);
1003 /* Bias the initial value of the IV in case we need to skip iterations
1004 at the beginning. */
1005 tree niters_adj = niters;
1006 if (niters_skip)
1008 tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1009 niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1010 iv_type, niters, skip);
1013 /* The iteration step is the vectorization factor. */
1014 tree iv_step = build_int_cst (iv_type, vf);
1016 /* Create the decrement IV. */
1017 tree index_before_incr, index_after_incr;
1018 gimple_stmt_iterator incr_gsi;
1019 bool insert_after;
1020 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1021 create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1022 &incr_gsi, insert_after, &index_before_incr,
1023 &index_after_incr);
1025 /* Iterate over all the rgroups and fill in their controls. */
1026 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1028 if (rgc.controls.is_empty ())
1029 continue;
1031 tree ctrl_type = rgc.type;
1032 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1034 tree vectype = rgc.compare_type;
1036 /* index_after_incr is the IV specifying the remaining iterations in
1037 the next iteration. */
1038 tree rem = index_after_incr;
1039 /* When the data type for the compare to produce the mask is
1040 smaller than the IV type we need to saturate. Saturate to
1041 the smallest possible value (IV_TYPE) so we only have to
1042 saturate once (CSE will catch redundant ones we add). */
1043 if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1044 rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1045 UNKNOWN_LOCATION,
1046 MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1047 rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1048 UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1050 /* Build a data vector composed of the remaining iterations. */
1051 rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1052 UNKNOWN_LOCATION, vectype, rem);
1054 /* Provide a definition of each vector in the control group. */
1055 tree next_ctrl = NULL_TREE;
1056 tree first_rem = NULL_TREE;
1057 tree ctrl;
1058 unsigned int i;
1059 FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1061 /* Previous controls will cover BIAS items. This control covers the
1062 next batch. */
1063 poly_uint64 bias = nitems_per_ctrl * i;
1065 /* Build the constant to compare the remaining iters against;
1066 this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1067 split into pieces. */
1068 unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1069 tree_vector_builder builder (vectype, n, 1);
1070 for (unsigned i = 0; i < n; ++i)
1072 unsigned HOST_WIDE_INT val
1073 = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1074 gcc_assert (val < vf.to_constant ());
1075 builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1077 tree cmp_series = builder.build ();
1079 /* Create the initial control. First include all items that
1080 are within the loop limit. */
1081 tree init_ctrl = NULL_TREE;
1082 poly_uint64 const_limit;
1083 /* See whether the first iteration of the vector loop is known
1084 to have a full control. */
1085 if (poly_int_tree_p (niters, &const_limit)
1086 && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1087 init_ctrl = build_minus_one_cst (ctrl_type);
1088 else
1090 /* The remaining work items initially are niters. Saturate,
1091 splat and compare. */
1092 if (!first_rem)
1094 first_rem = niters;
1095 if (TYPE_PRECISION (TREE_TYPE (vectype))
1096 < TYPE_PRECISION (iv_type))
1097 first_rem = gimple_build (&preheader_seq,
1098 MIN_EXPR, TREE_TYPE (first_rem),
1099 first_rem, iv_step);
1100 first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1101 first_rem);
1102 first_rem = gimple_build_vector_from_val (&preheader_seq,
1103 vectype, first_rem);
1105 init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1106 cmp_series, first_rem);
1109 /* Now AND out the bits that are within the number of skipped
1110 items. */
1111 poly_uint64 const_skip;
1112 if (niters_skip
1113 && !(poly_int_tree_p (niters_skip, &const_skip)
1114 && known_le (const_skip, bias)))
1116 /* For integer mode masks it's cheaper to shift out the bits
1117 since that avoids loading a constant. */
1118 gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1119 init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1120 lang_hooks.types.type_for_mode
1121 (TYPE_MODE (ctrl_type), 1),
1122 init_ctrl);
1123 /* ??? But when the shift amount isn't constant this requires
1124 a round-trip to GPRs. We could apply the bias to either
1125 side of the compare instead. */
1126 tree shift = gimple_build (&preheader_seq, MULT_EXPR,
1127 TREE_TYPE (niters_skip), niters_skip,
1128 build_int_cst (TREE_TYPE (niters_skip),
1129 rgc.max_nscalars_per_iter));
1130 init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1131 TREE_TYPE (init_ctrl),
1132 init_ctrl, shift);
1133 init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1134 ctrl_type, init_ctrl);
1137 /* Get the control value for the next iteration of the loop. */
1138 next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1139 UNKNOWN_LOCATION,
1140 LT_EXPR, ctrl_type, cmp_series, rem);
1142 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1146 /* Emit all accumulated statements. */
1147 add_preheader_seq (loop, preheader_seq);
1149 /* Adjust the exit test using the decrementing IV. */
1150 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1151 /* When we peel for alignment with niter_skip != 0 this can
1152 cause niter + niter_skip to wrap, and since we are comparing the
1153 value before the decrement here we get a false early exit.
1154 We can't compare the value after the decrement either, because that
1155 decrement could wrap as well since we're not doing a saturating
1156 decrement. To avoid this situation we force a larger
1157 iv_type. */
1158 gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1159 NULL_TREE, NULL_TREE);
1160 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1162 /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1163 Subtract one from this to get the latch count. */
1164 tree niters_minus_one
1165 = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1166 build_minus_one_cst (TREE_TYPE (orig_niters)));
1167 tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1168 if (niters_skip)
1169 niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1170 fold_convert (iv_type, niters_skip));
1171 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1172 niters_adj2, iv_step);
1174 if (final_iv)
1176 gassign *assign = gimple_build_assign (final_iv, orig_niters);
1177 gsi_insert_on_edge_immediate (single_exit (loop), assign);
1180 return cond_stmt;
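/* Illustrative sketch (standalone C++, not part of this file): scalar model
   of the AVX512-style mask generation above.  The remaining scalar count is
   clamped to VF, splat across the vector and compared against the series
   { 0, 0, 1, 1, 2, 2, ... } (here two scalars per iteration), so a partial
   final iteration enables only the leading lanes.  The helper name below is
   hypothetical.  */
#if 0
#include <algorithm>
#include <vector>

static std::vector<bool>
avx512_style_mask (unsigned remaining, unsigned vf, unsigned nscalars_per_iter)
{
  unsigned rem = std::min (remaining, vf);      /* rem_4 = MIN <iv_2, VF> */
  std::vector<bool> mask (vf * nscalars_per_iter);
  for (unsigned i = 0; i < mask.size (); ++i)
    mask[i] = i / nscalars_per_iter < rem;      /* mask_5 = series < remv_6 */
  return mask;
}
/* e.g. avx512_style_mask (3, 4, 2) yields { 1, 1, 1, 1, 1, 1, 0, 0 }.  */
#endif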
1184 /* Like vect_set_loop_condition, but handle the case in which the vector
1185 loop handles exactly VF scalars per iteration. */
1187 static gcond *
1188 vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1189 class loop *loop, tree niters, tree step,
1190 tree final_iv, bool niters_maybe_zero,
1191 gimple_stmt_iterator loop_cond_gsi)
1193 tree indx_before_incr, indx_after_incr;
1194 gcond *cond_stmt;
1195 gcond *orig_cond;
1196 edge pe = loop_preheader_edge (loop);
1197 gimple_stmt_iterator incr_gsi;
1198 bool insert_after;
1199 enum tree_code code;
1200 tree niters_type = TREE_TYPE (niters);
1202 orig_cond = get_loop_exit_condition (exit_edge);
1203 gcc_assert (orig_cond);
1204 loop_cond_gsi = gsi_for_stmt (orig_cond);
1206 tree init, limit;
1207 if (!niters_maybe_zero && integer_onep (step))
1209 /* In this case we can use a simple 0-based IV:
1212 x = 0;
1216 x += 1;
1218 while (x < NITERS); */
1219 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1220 init = build_zero_cst (niters_type);
1221 limit = niters;
1223 else
1225 /* The following works for all values of NITERS except 0:
1228 x = 0;
1232 x += STEP;
1234 while (x <= NITERS - STEP);
1236 so that the loop continues to iterate if x + STEP - 1 < NITERS
1237 but stops if x + STEP - 1 >= NITERS.
1239 However, if NITERS is zero, x never hits a value above NITERS - STEP
1240 before wrapping around. There are two obvious ways of dealing with
1241 this:
1243 - start at STEP - 1 and compare x before incrementing it
1244 - start at -1 and compare x after incrementing it
1246 The latter is simpler and is what we use. The loop in this case
1247 looks like:
1250 x = -1;
1254 x += STEP;
1256 while (x < NITERS - STEP);
1258 In both cases the loop limit is NITERS - STEP. */
1259 gimple_seq seq = NULL;
1260 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1261 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1262 if (seq)
1264 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1265 gcc_assert (!new_bb);
1267 if (niters_maybe_zero)
1269 /* Case C. */
1270 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1271 init = build_all_ones_cst (niters_type);
1273 else
1275 /* Case B. */
1276 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1277 init = build_zero_cst (niters_type);
1281 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1282 create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1283 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1284 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1285 true, NULL_TREE, true,
1286 GSI_SAME_STMT);
1287 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1288 true, GSI_SAME_STMT);
1290 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1291 NULL_TREE);
1293 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1295 /* Record the number of latch iterations. */
1296 if (limit == niters)
1297 /* Case A: the loop iterates NITERS times. Subtract one to get the
1298 latch count. */
1299 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1300 build_int_cst (niters_type, 1));
1301 else
1302 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1303 Subtract one from this to get the latch count. */
1304 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1305 limit, step);
1307 if (final_iv)
1309 gassign *assign;
1310 gcc_assert (single_pred_p (exit_edge->dest));
1311 tree phi_dest
1312 = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1313 /* Make sure to maintain LC SSA form here and elide the subtraction
1314 if the value is zero. */
1315 gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1316 add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1317 if (!integer_zerop (init))
1319 assign = gimple_build_assign (final_iv, MINUS_EXPR,
1320 phi_dest, init);
1321 gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1322 gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1326 return cond_stmt;
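/* Illustrative sketch (standalone C++, not part of this file): "case C"
   above as a scalar loop.  Starting the IV at -1 and testing after the
   increment keeps the exit test correct even when NITERS is zero, which in
   the unsigned type denotes 2^precision iterations.  The helper name below
   is hypothetical and uses a fixed 32-bit type.  */
#if 0
#include <cstdint>

static uint32_t
count_iterations (uint32_t niters, uint32_t step)
{
  uint32_t iterations = 0;
  uint32_t x = (uint32_t) -1;          /* init = build_all_ones_cst */
  do
    {
      ++iterations;                    /* one vector iteration's work */
      x += step;
    }
  while (x < niters - step);           /* limit = NITERS - STEP */
  return iterations;
}
/* count_iterations (10, 4) == 2, matching (NITERS - STEP) / STEP + 1; with
   NITERS == 0 the loop would run (1 << 32) / STEP times, as described in
   the comment below (do not actually execute that case).  */
#endif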
1329 /* If we're using fully-masked loops, make LOOP iterate:
1331 N == (NITERS - 1) / STEP + 1
1333 times. When NITERS is zero, this is equivalent to making the loop
1334 execute (1 << M) / STEP times, where M is the precision of NITERS.
1335 NITERS_MAYBE_ZERO is true if this last case might occur.
1337 If we're not using fully-masked loops, make LOOP iterate:
1339 N == (NITERS - STEP) / STEP + 1
1341 times, where NITERS is known to be outside the range [1, STEP - 1].
1342 This is equivalent to making the loop execute NITERS / STEP times
1343 when NITERS is nonzero and (1 << M) / STEP times otherwise.
1344 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1346 If FINAL_IV is nonnull, it is an SSA name that should be set to
1347 N * STEP on exit from the loop.
1349 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1351 void
1352 vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1353 tree niters, tree step, tree final_iv,
1354 bool niters_maybe_zero)
1356 gcond *cond_stmt;
1357 gcond *orig_cond = get_loop_exit_condition (loop_e);
1358 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1360 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1362 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1363 cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1364 loop_vinfo,
1365 niters, final_iv,
1366 niters_maybe_zero,
1367 loop_cond_gsi);
1368 else
1369 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1370 loop_vinfo,
1371 niters, final_iv,
1372 niters_maybe_zero,
1373 loop_cond_gsi);
1375 else
1376 cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1377 niters,
1378 step, final_iv,
1379 niters_maybe_zero,
1380 loop_cond_gsi);
1382 /* Remove old loop exit test. */
1383 stmt_vec_info orig_cond_info;
1384 if (loop_vinfo
1385 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1386 loop_vinfo->remove_stmt (orig_cond_info);
1387 else
1388 gsi_remove (&loop_cond_gsi, true);
1390 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1392 (gimple *) cond_stmt);
1395 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
1396 For all PHI arguments in FROM->dest and TO->dest from those
1397 edges ensure that TO->dest PHI arguments have current_def
1398 set to that in FROM. */
1400 static void
1401 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
1403 gimple_stmt_iterator gsi_from, gsi_to;
1405 for (gsi_from = gsi_start_phis (from->dest),
1406 gsi_to = gsi_start_phis (to->dest);
1407 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
1409 gimple *from_phi = gsi_stmt (gsi_from);
1410 gimple *to_phi = gsi_stmt (gsi_to);
1411 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1412 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1413 if (virtual_operand_p (from_arg))
1415 gsi_next (&gsi_from);
1416 continue;
1418 if (virtual_operand_p (to_arg))
1420 gsi_next (&gsi_to);
1421 continue;
1423 if (TREE_CODE (from_arg) != SSA_NAME)
1424 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1425 else if (TREE_CODE (to_arg) == SSA_NAME
1426 && from_arg != to_arg)
1428 if (get_current_def (to_arg) == NULL_TREE)
1430 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1431 TREE_TYPE (get_current_def
1432 (from_arg))));
1433 set_current_def (to_arg, get_current_def (from_arg));
1436 gsi_next (&gsi_from);
1437 gsi_next (&gsi_to);
1440 gphi *from_phi = get_virtual_phi (from->dest);
1441 gphi *to_phi = get_virtual_phi (to->dest);
1442 if (from_phi)
1443 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1444 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1447 /* Given LOOP this function generates a new copy of it and puts it
1448 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1449 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1450 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1451 entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1452 continuation. This is correct for cases where one loop continues from the
1453 other like in the vectorizer, but not true for uses in e.g. loop distribution
1454 where the contents of the loop body are split but the iteration space of both
1455 copies remains the same.
1457 If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
1458 dominators were updated during the peeling. */
1460 class loop *
1461 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1462 class loop *scalar_loop,
1463 edge scalar_exit, edge e, edge *new_e,
1464 bool flow_loops)
1466 class loop *new_loop;
1467 basic_block *new_bbs, *bbs, *pbbs;
1468 bool at_exit;
1469 bool was_imm_dom;
1470 basic_block exit_dest;
1471 edge exit, new_exit;
1472 bool duplicate_outer_loop = false;
1474 exit = loop_exit;
1475 at_exit = (e == exit);
1476 if (!at_exit && e != loop_preheader_edge (loop))
1477 return NULL;
1479 if (scalar_loop == NULL)
1481 scalar_loop = loop;
1482 scalar_exit = loop_exit;
1484 else if (scalar_loop == loop)
1485 scalar_exit = loop_exit;
1486 else
1488 /* Loop has been versioned; match the exits up using the aux index. */
1489 for (edge exit : get_loop_exit_edges (scalar_loop))
1490 if (exit->aux == loop_exit->aux)
1492 scalar_exit = exit;
1493 break;
1496 gcc_assert (scalar_exit);
1499 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1500 pbbs = bbs + 1;
1501 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1502 /* Allow duplication of outer loops. */
1503 if (scalar_loop->inner)
1504 duplicate_outer_loop = true;
1506 /* Generate new loop structure. */
1507 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1508 duplicate_subloops (scalar_loop, new_loop);
1510 exit_dest = exit->dest;
1511 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1512 exit_dest) == loop->header ?
1513 true : false);
1515 /* Also copy the pre-header, this avoids jumping through hoops to
1516 duplicate the loop entry PHI arguments. Create an empty
1517 pre-header unconditionally for this. */
1518 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1519 edge entry_e = single_pred_edge (preheader);
1520 bbs[0] = preheader;
1521 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1523 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1524 &scalar_exit, 1, &new_exit, NULL,
1525 at_exit ? loop->latch : e->src, true);
1526 exit = loop_exit;
1527 basic_block new_preheader = new_bbs[0];
1529 gcc_assert (new_exit);
1531 /* Record the new loop exit information. new_loop doesn't have SCEV data and
1532 so we must initialize the exit information. */
1533 if (new_e)
1534 *new_e = new_exit;
1536 /* Before installing PHI arguments make sure that the edges
1537 into them match that of the scalar loop we analyzed. This
1538 makes sure the SLP tree matches up between the main vectorized
1539 loop and the epilogue vectorized copies. */
1540 if (single_succ_edge (preheader)->dest_idx
1541 != single_succ_edge (new_bbs[0])->dest_idx)
1543 basic_block swap_bb = new_bbs[1];
1544 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1545 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1546 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1547 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1549 if (duplicate_outer_loop)
1551 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1552 if (loop_preheader_edge (scalar_loop)->dest_idx
1553 != loop_preheader_edge (new_inner_loop)->dest_idx)
1555 basic_block swap_bb = new_inner_loop->header;
1556 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1557 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1558 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1559 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1563 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1565 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1566 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1567 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1569 /* Rename the exit uses. */
1570 for (edge exit : get_loop_exit_edges (new_loop))
1571 for (auto gsi = gsi_start_phis (exit->dest);
1572 !gsi_end_p (gsi); gsi_next (&gsi))
1574 tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1575 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1576 if (MAY_HAVE_DEBUG_BIND_STMTS)
1577 adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1580 /* This condition happens when the loop has been versioned, e.g. due to ifcvt
1581 versioning the loop. */
1582 if (scalar_loop != loop)
1584 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1585 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1586 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1587 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1588 header) to have current_def set, so copy them over. */
1589 slpeel_duplicate_current_defs_from_edges (scalar_exit, exit);
1590 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1592 EDGE_SUCC (loop->latch, 0));
1595 auto loop_exits = get_loop_exit_edges (loop);
1596 auto_vec<basic_block> doms;
1598 if (at_exit) /* Add the loop copy at exit. */
1600 if (scalar_loop != loop && new_exit->dest != exit_dest)
1602 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1603 flush_pending_stmts (new_exit);
1606 auto_vec <gimple *> new_phis;
1607 hash_map <tree, tree> new_phi_args;
1608 /* First create the empty phi nodes so that when we flush the
1609 statements they can be filled in. However because there is no order
1610 between the PHI nodes in the exits and the loop headers we need to
1611 order them based on the order of the two headers. First record the new
1612 phi nodes. */
1613 for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
1614 !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1616 gimple *from_phi = gsi_stmt (gsi_from);
1617 tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1618 gphi *res = create_phi_node (new_res, new_preheader);
1619 new_phis.safe_push (res);
1622 /* Then redirect the edges and flush the changes. This writes out the new
1623 SSA names. */
1624 for (edge exit : loop_exits)
1626 edge temp_e = redirect_edge_and_branch (exit, new_preheader);
1627 flush_pending_stmts (temp_e);
1629 /* Record the new SSA names in the cache so that we can skip materializing
1630 them again when we fill in the rest of the LCSSA variables. */
1631 for (auto phi : new_phis)
1633 tree new_arg = gimple_phi_arg (phi, 0)->def;
1635 if (!SSA_VAR_P (new_arg))
1636 continue;
1637 /* If the PHI MEM node dominates the loop then we shouldn't create
1638 a new LC-SSA PHI for it in the intermediate block. */
1639 /* A MEM phi that constitutes a new DEF for the vUSE chain can either
1640 be a .VDEF or a PHI that operates on MEM. And said definition
1641 must not be inside the main loop. Or we must be a parameter.
1642 In the last two cases we may remove a non-MEM PHI node, but since
1643 they dominate both loops the removal is unlikely to cause trouble
1644 as the exits must already be using them. */
1645 if (virtual_operand_p (new_arg)
1646 && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
1647 || !flow_bb_inside_loop_p (loop,
1648 gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
1650 auto gsi = gsi_for_stmt (phi);
1651 remove_phi_node (&gsi, true);
1652 continue;
1654 new_phi_args.put (new_arg, gimple_phi_result (phi));
1656 if (TREE_CODE (new_arg) != SSA_NAME)
1657 continue;
1658 /* If the PHI node dominates the loop then we shouldn't create
1659 a new LC-SSA PHI for it in the intermediate block. Unless the
1660 loop has been versioned. If it has then we need the PHI
1661 node such that later when the loop guard is added the original
1662 dominating PHI can be found. */
1663 basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (new_arg));
1664 if (loop == scalar_loop
1665 && (!def_bb || !flow_bb_inside_loop_p (loop, def_bb)))
1667 auto gsi = gsi_for_stmt (phi);
1668 remove_phi_node (&gsi, true);
1672 /* Copy the current loop LC PHI nodes between the original loop exit
1673 block and the new loop header. This allows us to later split the
1674 preheader block and still find the right LC nodes. */
1675 edge loop_entry = single_succ_edge (new_preheader);
1676 if (flow_loops)
1677 for (auto gsi_from = gsi_start_phis (loop->header),
1678 gsi_to = gsi_start_phis (new_loop->header);
1679 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1680 gsi_next (&gsi_from), gsi_next (&gsi_to))
1682 gimple *from_phi = gsi_stmt (gsi_from);
1683 gimple *to_phi = gsi_stmt (gsi_to);
1684 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1685 loop_latch_edge (loop));
1687 /* Check if we've already created a new phi node during edge
1688 redirection. If we have, only propagate the value downwards. */
1689 if (tree *res = new_phi_args.get (new_arg))
1691 adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1692 continue;
1695 tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1696 gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1698 /* Main loop exit should use the final iter value. */
1699 add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
1701 adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1704 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1706 if (was_imm_dom || duplicate_outer_loop)
1707 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1709 /* And remove the unnecessary forwarder again. Keep the other
1710 one so we have a proper pre-header for the loop at the exit edge. */
1711 redirect_edge_pred (single_succ_edge (preheader),
1712 single_pred (preheader));
1713 delete_basic_block (preheader);
1714 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1715 loop_preheader_edge (scalar_loop)->src);
1717 else /* Add the copy at entry. */
1719 /* Copy the current loop LC PHI nodes between the original loop exit
1720 block and the new loop header. This allows us to later split the
1721 preheader block and still find the right LC nodes. */
1722 if (flow_loops)
1723 for (auto gsi_from = gsi_start_phis (new_loop->header),
1724 gsi_to = gsi_start_phis (loop->header);
1725 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1726 gsi_next (&gsi_from), gsi_next (&gsi_to))
1728 gimple *from_phi = gsi_stmt (gsi_from);
1729 gimple *to_phi = gsi_stmt (gsi_to);
1730 tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1731 loop_latch_edge (new_loop));
1732 adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1733 new_arg);
1736 if (scalar_loop != loop)
1738 /* Remove the unnecessary forwarder of scalar_loop again. */
1739 redirect_edge_pred (single_succ_edge (preheader),
1740 single_pred (preheader));
1741 delete_basic_block (preheader);
1742 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1743 loop_preheader_edge (scalar_loop)->src);
1744 preheader = split_edge (loop_preheader_edge (loop));
1745 entry_e = single_pred_edge (preheader);
1748 redirect_edge_and_branch_force (entry_e, new_preheader);
1749 flush_pending_stmts (entry_e);
1750 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1752 redirect_edge_and_branch_force (new_exit, preheader);
1753 flush_pending_stmts (new_exit);
1754 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1756 /* And remove the unnecessary forwarder again. Keep the other
1757 one so we have a proper pre-header for the loop at the exit edge. */
1758 redirect_edge_pred (single_succ_edge (new_preheader),
1759 single_pred (new_preheader));
1760 delete_basic_block (new_preheader);
1761 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1762 loop_preheader_edge (new_loop)->src);
1765 free (new_bbs);
1766 free (bbs);
1768 checking_verify_dominators (CDI_DOMINATORS);
1770 return new_loop;
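/* Editor's note: illustrative sketch, not part of the original sources.  The
   LC-SSA PHIs materialized in NEW_PREHEADER above correspond, at the source
   level, to values that survive the first loop copy and are consumed by the
   second one.  A hypothetical example (all names made up):

     int
     sum_all (const int *a, int n, int n2)
     {
       int sum = 0;
       for (int i = 0; i < n; i++)     // first (e.g. vectorized) copy
         sum += a[i];
       // SUM is live here; in loop-closed SSA it flows through a
       // single-argument PHI in this block, which is what the code above
       // creates in NEW_PREHEADER for every such value.
       for (int i = n; i < n2; i++)    // second (e.g. epilogue) copy
         sum += a[i];
       return sum;
     }
 */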
1774 /* Given the condition expression COND, put it as the last statement of
1775 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1776 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1777 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1778 new edge as irreducible if IRREDUCIBLE_P is true. */
1780 static edge
1781 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1782 basic_block guard_to, basic_block dom_bb,
1783 profile_probability probability, bool irreducible_p)
1785 gimple_stmt_iterator gsi;
1786 edge new_e, enter_e;
1787 gcond *cond_stmt;
1788 gimple_seq gimplify_stmt_list = NULL;
1790 enter_e = EDGE_SUCC (guard_bb, 0);
1791 enter_e->flags &= ~EDGE_FALLTHRU;
1792 enter_e->flags |= EDGE_FALSE_VALUE;
1793 gsi = gsi_last_bb (guard_bb);
1795 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1796 is_gimple_condexpr_for_cond, NULL_TREE);
1797 if (gimplify_stmt_list)
1798 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1800 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1801 gsi = gsi_last_bb (guard_bb);
1802 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1804 /* Add new edge to connect guard block to the merge/loop-exit block. */
1805 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1807 new_e->probability = probability;
1808 if (irreducible_p)
1809 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1811 enter_e->probability = probability.invert ();
1812 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1814 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1815 if (enter_e->dest->loop_father->header == enter_e->dest)
1816 split_edge (enter_e);
1818 return new_e;
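/* Editor's note: illustrative sketch, not part of the original sources.  At
   the source level the guard added by slpeel_add_loop_guard amounts to a
   conditional jump around the guarded region; e.g. for a prolog-skip guard
   (the function and names below are hypothetical):

     void
     guarded (int *a, unsigned prolog_niters, unsigned n)
     {
       unsigned i = 0;
       if (prolog_niters == 0)           // COND; the new TRUE edge -> GUARD_TO
         goto after_prolog;              // skip edge with probability PROBABILITY
       for (; i < prolog_niters; i++)    // guarded region, entered on the FALSE edge
         a[i] += 1;
      after_prolog:
       for (; i < n; i++)
         a[i] += 2;
     }
 */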
1822 /* This function verifies that the following restrictions apply to LOOP:
1823 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1824 for an innermost loop, and of 5 basic blocks for an outer loop.
1825 (2) it is single entry, single exit
1826 (3) its exit condition is the last stmt in the header
1827 (4) E is the entry/exit edge of LOOP.
1830 bool
1831 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
1832 const_edge e)
1834 edge entry_e = loop_preheader_edge (loop);
1835 gcond *orig_cond = get_loop_exit_condition (exit_e);
1836 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1837 unsigned int num_bb = loop->inner? 5 : 2;
1839 /* All loops have an outer scope; the only case loop->outer is NULL is for
1840 the function itself. */
1841 if (!loop_outer (loop)
1842 || loop->num_nodes != num_bb
1843 || !empty_block_p (loop->latch)
1844 || !exit_e
1845 /* Verify that new loop exit condition can be trivially modified. */
1846 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1847 || (e != exit_e && e != entry_e))
1848 return false;
1850 basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1851 get_loop_body_with_size (loop, bbs, loop->num_nodes);
1852 bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1853 free (bbs);
1854 return ret;
1857 /* Function find_loop_location.
1859 Extract the location of the loop in the source code.
1860 If the loop is not well formed for vectorization, an estimated
1861 location is calculated.
1862 Return the loop location if it succeeds and an unknown location if not. */
1864 dump_user_location_t
1865 find_loop_location (class loop *loop)
1867 gimple *stmt = NULL;
1868 basic_block bb;
1869 gimple_stmt_iterator si;
1871 if (!loop)
1872 return dump_user_location_t ();
1874 if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
1876 /* We only care about the loop location, so use any exit with location
1877 information. */
1878 for (edge e : get_loop_exit_edges (loop))
1880 stmt = get_loop_exit_condition (e);
1882 if (stmt
1883 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1884 return stmt;
1888 /* If we got here the loop is probably not "well formed";
1889 try to estimate the loop location. */
1891 if (!loop->header)
1892 return dump_user_location_t ();
1894 bb = loop->header;
1896 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1898 stmt = gsi_stmt (si);
1899 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1900 return stmt;
1903 return dump_user_location_t ();
1906 /* Return true if the phi described by STMT_INFO defines an IV of the
1907 loop to be vectorized. */
1909 static bool
1910 iv_phi_p (stmt_vec_info stmt_info)
1912 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1913 if (virtual_operand_p (PHI_RESULT (phi)))
1914 return false;
1916 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1917 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1918 return false;
1920 return true;
1923 /* Return true if vectorizer can peel for nonlinear iv. */
1924 static bool
1925 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
1926 stmt_vec_info stmt_info)
1928 enum vect_induction_op_type induction_type
1929 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
1930 tree niters_skip;
1931 /* Init_expr will be updated by vect_update_ivs_after_vectorizer.
1932 If niters or vf is unknown:
1933 For shift, when the shift amount >= precision, there would be
1934 undefined behavior. For mult, we don't know how to generate
1935 init_expr * pow (step, niters) for variable niters.
1936 For neg, it should be ok, since the niters of the vectorized main
1937 loop will always be a multiple of 2. */
1938 if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1939 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
1940 && induction_type != vect_step_op_neg)
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944 "Peeling for epilogue is not supported"
1945 " for nonlinear induction except neg"
1946 " when iteration count is unknown.\n");
1947 return false;
1950 /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
1951 if (induction_type == vect_step_op_mul)
1953 tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
1954 tree type = TREE_TYPE (step_expr);
1956 if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
1957 && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
1959 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1961 "Avoid compile time hog on"
1962 " vect_peel_nonlinear_iv_init"
1963 " for nonlinear induction vec_step_op_mul"
1964 " when iteration count is too big.\n");
1965 return false;
1969 /* Also doesn't support peeling for neg when niter is variable.
1970 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
1971 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
1972 if ((niters_skip != NULL_TREE
1973 && (TREE_CODE (niters_skip) != INTEGER_CST
1974 || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
1975 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
1976 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980 "Peeling for alignement is not supported"
1981 " for nonlinear induction when niters_skip"
1982 " is not constant.\n");
1983 return false;
1986 return true;
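/* Editor's note: illustrative sketch, not part of the original sources.  The
   value a nonlinear IV reaches after K peeled scalar iterations is what
   vect_peel_nonlinear_iv_init must materialize as the epilogue's initial
   value.  A sketch of the two interesting evolutions, assuming wrapping
   unsigned arithmetic:

     unsigned
     nonlinear_iv_after (unsigned init, unsigned step, unsigned k, int negate_p)
     {
       unsigned val = init;
       for (unsigned i = 0; i < k; i++)
         val = negate_p ? -val : val * step;    // neg vs. mul evolution
       return val;
     }

   For neg the closed form is INIT when K is even (and the main vector loop
   always executes a multiple of 2 scalar iterations), so variable NITERS is
   fine.  For mul the closed form is INIT * pow (STEP, K), and for shifts it
   is INIT << (K * STEP), which is undefined once the total shift amount
   reaches the precision; that is why those cases are rejected above when
   NITERS or VF is unknown.  */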
1989 /* Function vect_can_advance_ivs_p
1991 In case the number of iterations that LOOP iterates is unknown at compile
1992 time, an epilog loop will be generated, and the loop induction variables
1993 (IVs) will be "advanced" to the value they are supposed to take just before
1994 the epilog loop. Here we check that the access function of the loop IVs
1995 and the expression that represents the loop bound are simple enough.
1996 These restrictions will be relaxed in the future. */
1998 bool
1999 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2002 basic_block bb = loop->header;
2003 gphi_iterator gsi;
2005 /* Analyze phi functions of the loop header. */
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2009 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2011 tree evolution_part;
2012 enum vect_induction_op_type induction_type;
2014 gphi *phi = gsi.phi ();
2015 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2018 phi_info->stmt);
2020 /* Skip virtual phi's. The data dependences that are associated with
2021 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2023 Skip reduction phis. */
2024 if (!iv_phi_p (phi_info))
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location,
2028 "reduc or virtual phi. skip.\n");
2029 continue;
2032 induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2033 if (induction_type != vect_step_op_add)
2035 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2036 return false;
2038 continue;
2041 /* Analyze the evolution function. */
2043 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2044 if (evolution_part == NULL_TREE)
2046 if (dump_enabled_p ())
2047 dump_printf (MSG_MISSED_OPTIMIZATION,
2048 "No access function or evolution.\n");
2049 return false;
2052 /* FORNOW: We do not transform initial conditions of IVs
2053 whose evolution functions are not invariant in the loop. */
2055 if (!expr_invariant_in_loop_p (loop, evolution_part))
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "evolution not invariant in loop.\n");
2060 return false;
2063 /* FORNOW: We do not transform initial conditions of IVs
2064 whose evolution functions are a polynomial of degree >= 2. */
2066 if (tree_is_chrec (evolution_part))
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2070 "evolution is chrec.\n");
2071 return false;
2075 return true;
2079 /* Function vect_update_ivs_after_vectorizer.
2081 "Advance" the induction variables of LOOP to the value they should take
2082 after the execution of LOOP. This is currently necessary because the
2083 vectorizer does not handle induction variables that are used after the
2084 loop. Such a situation occurs when the last iterations of LOOP are
2085 peeled, because:
2086 1. We introduced new uses after LOOP for IVs that were not originally used
2087 after LOOP: the IVs of LOOP are now used by an epilog loop.
2088 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2089 times, whereas the loop IVs should be bumped N times.
2091 Input:
2092 - LOOP - a loop that is going to be vectorized. The last few iterations
2093 of LOOP were peeled.
2094 - NITERS - the number of iterations that LOOP executes (before it is
2095 vectorized), i.e., the number of times the ivs should be bumped.
2096 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2097 coming out from LOOP on which there are uses of the LOOP ivs
2098 (this is the path from LOOP->exit to epilog_loop->preheader).
2100 The new definitions of the ivs are placed in LOOP->exit.
2101 The phi args associated with the edge UPDATE_E in the bb
2102 UPDATE_E->dest are updated accordingly.
2104 Assumption 1: Like the rest of the vectorizer, this function assumes
2105 a single loop exit that has a single predecessor.
2107 Assumption 2: The phi nodes in the LOOP header and in update_bb are
2108 organized in the same order.
2110 Assumption 3: The access function of the ivs is simple enough (see
2111 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2113 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2114 coming out of LOOP on which the ivs of LOOP are used (this is the path
2115 that leads to the epilog loop; other paths skip the epilog loop). This
2116 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2117 needs to have its phis updated.
2120 static void
2121 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2122 tree niters, edge update_e)
2124 gphi_iterator gsi, gsi1;
2125 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2126 basic_block update_bb = update_e->dest;
2128 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2130 /* Make sure there exists a single-predecessor exit bb: */
2131 gcc_assert (single_pred_p (exit_bb));
2132 gcc_assert (single_succ_edge (exit_bb) == update_e);
2134 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2135 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2136 gsi_next (&gsi), gsi_next (&gsi1))
2138 tree init_expr;
2139 tree step_expr, off;
2140 tree type;
2141 tree var, ni, ni_name;
2142 gimple_stmt_iterator last_gsi;
2144 gphi *phi = gsi.phi ();
2145 gphi *phi1 = gsi1.phi ();
2146 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE, vect_location,
2149 "vect_update_ivs_after_vectorizer: phi: %G",
2150 (gimple *) phi);
2152 /* Skip reduction and virtual phis. */
2153 if (!iv_phi_p (phi_info))
2155 if (dump_enabled_p ())
2156 dump_printf_loc (MSG_NOTE, vect_location,
2157 "reduc or virtual phi. skip.\n");
2158 continue;
2161 type = TREE_TYPE (gimple_phi_result (phi));
2162 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2163 step_expr = unshare_expr (step_expr);
2165 /* FORNOW: We do not support IVs whose evolution function is a polynomial
2166 of degree >= 2 or exponential. */
2167 gcc_assert (!tree_is_chrec (step_expr));
2169 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2170 gimple_seq stmts = NULL;
2171 enum vect_induction_op_type induction_type
2172 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2174 if (induction_type == vect_step_op_add)
2176 tree stype = TREE_TYPE (step_expr);
2177 off = fold_build2 (MULT_EXPR, stype,
2178 fold_convert (stype, niters), step_expr);
2179 if (POINTER_TYPE_P (type))
2180 ni = fold_build_pointer_plus (init_expr, off);
2181 else
2182 ni = fold_convert (type,
2183 fold_build2 (PLUS_EXPR, stype,
2184 fold_convert (stype, init_expr),
2185 off));
2187 /* Don't bother calling vect_peel_nonlinear_iv_init. */
2188 else if (induction_type == vect_step_op_neg)
2189 ni = init_expr;
2190 else
2191 ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2192 niters, step_expr,
2193 induction_type);
2195 var = create_tmp_var (type, "tmp");
2197 last_gsi = gsi_last_bb (exit_bb);
2198 gimple_seq new_stmts = NULL;
2199 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2200 /* Exit_bb shouldn't be empty. */
2201 if (!gsi_end_p (last_gsi))
2203 gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2204 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2206 else
2208 gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2209 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2212 /* Fix phi expressions in the successor bb. */
2213 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
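/* Editor's note: illustrative sketch, not part of the original sources.  For
   the common vect_step_op_add case the value computed in EXIT_BB above is
   simply INIT_EXPR + NITERS * STEP_EXPR (a pointer IV uses a pointer-plus of
   the same byte offset):

     unsigned
     advanced_iv (unsigned init, unsigned step, unsigned niters)
     {
       return init + niters * step;   // value the IV must have on entry to the epilogue
     }
 */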
2217 /* Return a gimple value containing the misalignment (measured in vector
2218 elements) for the loop described by LOOP_VINFO, i.e. how many elements
2219 it is away from a perfectly aligned address. Add any new statements
2220 to SEQ. */
2222 static tree
2223 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2225 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2226 stmt_vec_info stmt_info = dr_info->stmt;
2227 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2229 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2230 unsigned HOST_WIDE_INT target_align_c;
2231 tree target_align_minus_1;
2233 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2234 size_zero_node) < 0;
2235 tree offset = (negative
2236 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2237 * TREE_INT_CST_LOW
2238 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2239 : size_zero_node);
2240 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2241 stmt_info, seq,
2242 offset);
2243 tree type = unsigned_type_for (TREE_TYPE (start_addr));
2244 if (target_align.is_constant (&target_align_c))
2245 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2246 else
2248 tree vla = build_int_cst (type, target_align);
2249 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2250 fold_build2 (MINUS_EXPR, type,
2251 build_int_cst (type, 0), vla));
2252 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2253 build_int_cst (type, 1));
2256 HOST_WIDE_INT elem_size
2257 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2258 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2260 /* Create: misalign_in_bytes = addr & (target_align - 1). */
2261 tree int_start_addr = fold_convert (type, start_addr);
2262 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2263 target_align_minus_1);
2265 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2266 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2267 elem_size_log);
2269 return misalign_in_elems;
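/* Editor's note: illustrative sketch, not part of the original sources.  The
   expression built above boils down to the following, assuming ELEM_SIZE is a
   power of two and TARGET_ALIGN is a compile-time constant:

     unsigned long
     misalign_in_elems (unsigned long addr, unsigned long target_align,
                        unsigned elem_size_log2)
     {
       unsigned long misalign_in_bytes = addr & (target_align - 1);
       return misalign_in_bytes >> elem_size_log2;
     }

   E.g. ADDR == 0x101c, TARGET_ALIGN == 32 and 4-byte elements gives
   (0x101c & 31) >> 2 == 28 >> 2 == 7 elements.  */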
2272 /* Function vect_gen_prolog_loop_niters
2274 Generate the number of iterations which should be peeled as prolog for the
2275 loop represented by LOOP_VINFO. It is calculated as the misalignment of
2276 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2277 As a result, after the execution of this loop, the data reference DR will
2278 refer to an aligned location. The following computation is generated:
2280 If the misalignment of DR is known at compile time:
2281 addr_mis = int mis = DR_MISALIGNMENT (dr);
2282 Else, compute address misalignment in bytes:
2283 addr_mis = addr & (target_align - 1)
2285 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2287 (elem_size = element type size; an element is the scalar element whose type
2288 is the inner type of the vectype)
2290 The computations will be emitted at the end of BB. We also compute and
2291 store the upper bound (inclusive) of the result in BOUND.
2293 When the step of the data-ref in the loop is not 1 (as in interleaved data
2294 and SLP), the number of iterations of the prolog must be divided by the step
2295 (which is equal to the size of the interleaved group).
2297 The above formulas assume that VF == number of elements in the vector. This
2298 may not hold when there are multiple-types in the loop.
2299 In this case, for some data-references in the loop the VF does not represent
2300 the number of elements that fit in the vector. Therefore, instead of VF we
2301 use TYPE_VECTOR_SUBPARTS. */
2303 static tree
2304 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2305 basic_block bb, int *bound)
2307 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2308 tree var;
2309 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2310 gimple_seq stmts = NULL, new_stmts = NULL;
2311 tree iters, iters_name;
2312 stmt_vec_info stmt_info = dr_info->stmt;
2313 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2314 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2316 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2318 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2320 if (dump_enabled_p ())
2321 dump_printf_loc (MSG_NOTE, vect_location,
2322 "known peeling = %d.\n", npeel);
2324 iters = build_int_cst (niters_type, npeel);
2325 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2327 else
2329 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2330 tree type = TREE_TYPE (misalign_in_elems);
2331 HOST_WIDE_INT elem_size
2332 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2333 /* We only do prolog peeling if the target alignment is known at compile
2334 time. */
2335 poly_uint64 align_in_elems =
2336 exact_div (target_align, elem_size);
2337 tree align_in_elems_minus_1 =
2338 build_int_cst (type, align_in_elems - 1);
2339 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2341 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2342 & (align_in_elems - 1)). */
2343 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2344 size_zero_node) < 0;
2345 if (negative)
2346 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2347 align_in_elems_tree);
2348 else
2349 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2350 misalign_in_elems);
2351 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2352 iters = fold_convert (niters_type, iters);
2353 unsigned HOST_WIDE_INT align_in_elems_c;
2354 if (align_in_elems.is_constant (&align_in_elems_c))
2355 *bound = align_in_elems_c - 1;
2356 else
2357 *bound = -1;
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_NOTE, vect_location,
2362 "niters for prolog loop: %T\n", iters);
2364 var = create_tmp_var (niters_type, "prolog_loop_niters");
2365 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2367 if (new_stmts)
2368 gimple_seq_add_seq (&stmts, new_stmts);
2369 if (stmts)
2371 gcc_assert (single_succ_p (bb));
2372 gimple_stmt_iterator gsi = gsi_last_bb (bb);
2373 if (gsi_end_p (gsi))
2374 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2375 else
2376 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2378 return iters_name;
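/* Editor's note: illustrative sketch, not part of the original sources.  For
   a positive-step data reference the generated prolog iteration count is

     unsigned
     prolog_niters (unsigned misalign_in_elems, unsigned align_in_elems)
     {
       return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
     }

   e.g. with ALIGN_IN_ELEMS == 8 (32-byte vectors of 4-byte elements) a
   misalignment of 7 elements gives (8 - 7) & 7 == 1 prolog iteration, while a
   misalignment of 0 gives (8 - 0) & 7 == 0, i.e. no peeling; the masking with
   ALIGN_IN_ELEMS - 1 folds the already-aligned case to zero, which is also
   why BOUND is ALIGN_IN_ELEMS - 1.  */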
2382 /* Function vect_update_init_of_dr
2384 If CODE is PLUS, the vector loop starts NITERS iterations after the
2385 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2386 iterations before the scalar one (using masking to skip inactive
2387 elements). This function updates the information recorded in DR to
2388 account for the difference. Specifically, it updates the OFFSET
2389 field of DR_INFO. */
2391 static void
2392 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2394 struct data_reference *dr = dr_info->dr;
2395 tree offset = dr_info->offset;
2396 if (!offset)
2397 offset = build_zero_cst (sizetype);
2399 niters = fold_build2 (MULT_EXPR, sizetype,
2400 fold_convert (sizetype, niters),
2401 fold_convert (sizetype, DR_STEP (dr)));
2402 offset = fold_build2 (code, sizetype,
2403 fold_convert (sizetype, offset), niters);
2404 dr_info->offset = offset;
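/* Editor's note: illustrative sketch, not part of the original sources.  The
   update above is plain byte arithmetic on the recorded start offset of the
   data reference:

     long
     updated_dr_offset (long offset, long niters, long dr_step, int plus_p)
     {
       long delta = niters * dr_step;            // bytes skipped by peeling
       return plus_p ? offset + delta : offset - delta;
     }

   E.g. a 3-iteration prolog over a DR with DR_STEP == 4 moves the vector
   loop's first access forward by 12 bytes (PLUS_EXPR), while masked peeling
   of NITERS_SKIP iterations moves it backwards (MINUS_EXPR).  */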
2408 /* Function vect_update_inits_of_drs
2410 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
2411 CODE and NITERS are as for vect_update_inits_of_dr. */
2413 void
2414 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2415 tree_code code)
2417 unsigned int i;
2418 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2419 struct data_reference *dr;
2421 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2423 /* Adjust niters to sizetype. We used to insert the stmts on the loop
2424 preheader here, but since we might use these niters to update the
2425 epilogue's niters and data references we can't insert them here, as this
2426 definition might not always dominate its uses. */
2427 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2428 niters = fold_convert (sizetype, niters);
2430 FOR_EACH_VEC_ELT (datarefs, i, dr)
2432 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2433 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2434 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2435 vect_update_init_of_dr (dr_info, niters, code);
2439 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2440 by masking. This involves calculating the number of iterations to
2441 be peeled and then aligning all memory references appropriately. */
2443 void
2444 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2446 tree misalign_in_elems;
2447 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2449 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2451 /* From the information recorded in LOOP_VINFO get the number of iterations
2452 that need to be skipped via masking. */
2453 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2455 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2456 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2457 misalign_in_elems = build_int_cst (type, misalign);
2459 else
2461 gimple_seq seq1 = NULL, seq2 = NULL;
2462 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2463 misalign_in_elems = fold_convert (type, misalign_in_elems);
2464 misalign_in_elems = force_gimple_operand (misalign_in_elems,
2465 &seq2, true, NULL_TREE);
2466 gimple_seq_add_seq (&seq1, seq2);
2467 if (seq1)
2469 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2470 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2471 gcc_assert (!new_bb);
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_NOTE, vect_location,
2477 "misalignment for fully-masked loop: %T\n",
2478 misalign_in_elems);
2480 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2482 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2485 /* This function builds ni_name = number of iterations. Statements
2486 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2487 it to TRUE if new ssa_var is generated. */
2489 tree
2490 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2492 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2493 if (TREE_CODE (ni) == INTEGER_CST)
2494 return ni;
2495 else
2497 tree ni_name, var;
2498 gimple_seq stmts = NULL;
2499 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2501 var = create_tmp_var (TREE_TYPE (ni), "niters");
2502 ni_name = force_gimple_operand (ni, &stmts, false, var);
2503 if (stmts)
2505 gsi_insert_seq_on_edge_immediate (pe, stmts);
2506 if (new_var_p != NULL)
2507 *new_var_p = true;
2510 return ni_name;
2514 /* Calculate the number of iterations above which the vectorized loop will
2515 be preferred over the scalar loop. NITERS_PROLOG is the number of
2516 iterations of the prolog loop. If it's an integer constant, the integer
2517 number is also passed in INT_NITERS_PROLOG. BOUND_PROLOG is the upper
2518 bound (inclusive) of the number of iterations of the prolog loop.
2519 BOUND_EPILOG is the corresponding value for the epilog loop. If
2520 CHECK_PROFITABILITY is true, TH is the threshold below which the scalar
2521 (rather than vectorized) loop will be executed. This function stores the
2522 upper bound (inclusive) of the result in BOUND_SCALAR. */
2524 static tree
2525 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2526 int bound_prolog, poly_int64 bound_epilog, int th,
2527 poly_uint64 *bound_scalar,
2528 bool check_profitability)
2530 tree type = TREE_TYPE (niters_prolog);
2531 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2532 build_int_cst (type, bound_epilog));
2534 *bound_scalar = bound_prolog + bound_epilog;
2535 if (check_profitability)
2537 /* TH indicates the minimum niters of vectorized loop, while we
2538 compute the maximum niters of scalar loop. */
2539 th--;
2540 /* Peeling for constant times. */
2541 if (int_niters_prolog >= 0)
2543 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2544 return build_int_cst (type, *bound_scalar);
2546 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2547 and BOUND_EPILOG are inclusive upper bounds. */
2548 if (known_ge (th, bound_prolog + bound_epilog))
2550 *bound_scalar = th;
2551 return build_int_cst (type, th);
2553 /* Need to do runtime comparison. */
2554 else if (maybe_gt (th, bound_epilog))
2556 *bound_scalar = upper_bound (*bound_scalar, th);
2557 return fold_build2 (MAX_EXPR, type,
2558 build_int_cst (type, th), niters);
2561 return niters;
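/* Editor's note: illustrative sketch, not part of the original sources.  The
   value returned above is conceptually the following, ignoring the
   compile-time-constant special cases:

     long
     scalar_loop_niters (long niters_prolog, long bound_epilog, long th,
                         int check_profitability)
     {
       long niters = niters_prolog + bound_epilog;   // always run scalar
       if (check_profitability && th - 1 > niters)
         return th - 1;     // MAX_EXPR against the profitability threshold
       return niters;
     }

   i.e. the iterations peeled into the prolog and epilog always execute in a
   scalar loop, and with profitability checking enabled up to TH - 1
   iterations may run scalar instead of entering the vector loop.  */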
2564 /* NITERS is the number of times that the original scalar loop executes
2565 after peeling. Work out the maximum number of iterations N that can
2566 be handled by the vectorized form of the loop and then either:
2568 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2570 niters_vector = N
2572 b) set *STEP_VECTOR_PTR to one and generate:
2574 niters_vector = N / vf
2576 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2577 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2578 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
2580 void
2581 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2582 tree *niters_vector_ptr, tree *step_vector_ptr,
2583 bool niters_no_overflow)
2585 tree ni_minus_gap, var;
2586 tree niters_vector, step_vector, type = TREE_TYPE (niters);
2587 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2588 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2589 tree log_vf = NULL_TREE;
2591 /* If epilogue loop is required because of data accesses with gaps, we
2592 subtract one iteration from the total number of iterations here for
2593 correct calculation of RATIO. */
2594 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2596 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2597 build_one_cst (type));
2598 if (!is_gimple_val (ni_minus_gap))
2600 var = create_tmp_var (type, "ni_gap");
2601 gimple *stmts = NULL;
2602 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2603 true, var);
2604 gsi_insert_seq_on_edge_immediate (pe, stmts);
2607 else
2608 ni_minus_gap = niters;
2610 /* To silence some unexpected warnings, simply initialize to 0. */
2611 unsigned HOST_WIDE_INT const_vf = 0;
2612 if (vf.is_constant (&const_vf)
2613 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2615 /* Create: niters >> log2(vf) */
2616 /* If it's known that niters == number of latch executions + 1 doesn't
2617 overflow, we can generate niters >> log2(vf); otherwise we generate
2618 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2619 will be at least one. */
2620 log_vf = build_int_cst (type, exact_log2 (const_vf));
2621 if (niters_no_overflow)
2622 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2623 else
2624 niters_vector
2625 = fold_build2 (PLUS_EXPR, type,
2626 fold_build2 (RSHIFT_EXPR, type,
2627 fold_build2 (MINUS_EXPR, type,
2628 ni_minus_gap,
2629 build_int_cst (type, vf)),
2630 log_vf),
2631 build_int_cst (type, 1));
2632 step_vector = build_one_cst (type);
2634 else
2636 niters_vector = ni_minus_gap;
2637 step_vector = build_int_cst (type, vf);
2640 if (!is_gimple_val (niters_vector))
2642 var = create_tmp_var (type, "bnd");
2643 gimple_seq stmts = NULL;
2644 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2645 gsi_insert_seq_on_edge_immediate (pe, stmts);
2646 /* The peeling algorithm guarantees that the vector loop bound is at least
2647 ONE, so we set range information to make the niters analyzer's life
2648 easier. Note the number of latch iterations can be TYPE_MAX_VALUE, so
2649 we have to represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2650 if (stmts != NULL && log_vf)
2652 if (niters_no_overflow)
2654 value_range vr (type,
2655 wi::one (TYPE_PRECISION (type)),
2656 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2657 TYPE_SIGN (type)),
2658 exact_log2 (const_vf),
2659 TYPE_SIGN (type)));
2660 set_range_info (niters_vector, vr);
2662 /* For VF == 1 the vector IV might also overflow so we cannot
2663 assert a minimum value of 1. */
2664 else if (const_vf > 1)
2666 value_range vr (type,
2667 wi::one (TYPE_PRECISION (type)),
2668 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2669 TYPE_SIGN (type))
2670 - (const_vf - 1),
2671 exact_log2 (const_vf), TYPE_SIGN (type))
2672 + 1);
2673 set_range_info (niters_vector, vr);
2677 *niters_vector_ptr = niters_vector;
2678 *step_vector_ptr = step_vector;
2680 return;
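/* Editor's note: illustrative sketch, not part of the original sources.  The
   two expressions built above, for a constant power-of-two VF:

     unsigned
     vector_loop_niters (unsigned niters, unsigned log_vf, int no_overflow)
     {
       if (no_overflow)
         return niters >> log_vf;
       // NITERS (latch count + 1) may have wrapped to 0; subtract VF first
       // and rely on the ratio being at least one.
       return ((niters - (1u << log_vf)) >> log_vf) + 1;
     }

   E.g. NITERS == 0 (i.e. 2^32 scalar iterations of a 32-bit counter) with
   VF == 4 gives ((0u - 4) >> 2) + 1 == 0x40000000 vector iterations, which
   the plain shift would have computed as 0.  */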
2683 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2684 loop specified by LOOP_VINFO after vectorization, compute the number
2685 of iterations before vectorization (niters_vector * vf) and store it
2686 to NITERS_VECTOR_MULT_VF_PTR. */
2688 static void
2689 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2690 tree niters_vector,
2691 tree *niters_vector_mult_vf_ptr)
2693 /* We should be using a step_vector of VF if VF is variable. */
2694 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2695 tree type = TREE_TYPE (niters_vector);
2696 tree log_vf = build_int_cst (type, exact_log2 (vf));
2697 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2699 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2700 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2701 niters_vector, log_vf);
2702 if (!is_gimple_val (niters_vector_mult_vf))
2704 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2705 gimple_seq stmts = NULL;
2706 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2707 &stmts, true, var);
2708 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2709 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2711 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2714 /* LCSSA_PHI is an lcssa phi of the EPILOG loop which is copied from LOOP.
2715 This function searches for the corresponding lcssa phi node in the exit
2716 bb of LOOP, following the LCSSA_EDGE to the exit node. If it is found,
2717 return the phi result; otherwise return NULL_TREE. */
2719 static tree
2720 find_guard_arg (class loop *loop ATTRIBUTE_UNUSED, const_edge loop_e,
2721 class loop *epilog ATTRIBUTE_UNUSED, gphi *lcssa_phi,
2722 int lcssa_edge = 0)
2724 gphi_iterator gsi;
2726 for (gsi = gsi_start_phis (loop_e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2728 gphi *phi = gsi.phi ();
2729 /* Nested loops with multiple exits can have different numbers of PHI
2730 node arguments between the main loop and the epilog, as the epilog
2731 falls through to the second loop. */
2732 if (gimple_phi_num_args (phi) > loop_e->dest_idx)
2734 tree var = PHI_ARG_DEF (phi, loop_e->dest_idx);
2735 if (TREE_CODE (var) != SSA_NAME)
2736 continue;
2737 tree def = get_current_def (var);
2738 if (!def)
2739 continue;
2740 if (operand_equal_p (def,
2741 PHI_ARG_DEF (lcssa_phi, lcssa_edge), 0))
2742 return PHI_RESULT (phi);
2745 return NULL_TREE;
2748 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2749 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2750 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2751 appear like below:
2753 guard_bb:
2754 if (cond)
2755 goto merge_bb;
2756 else
2757 goto skip_loop;
2759 skip_loop:
2760 header_a:
2761 i_1 = PHI<i_0, i_2>;
2763 i_2 = i_1 + 1;
2764 if (cond_a)
2765 goto latch_a;
2766 else
2767 goto exit_a;
2768 latch_a:
2769 goto header_a;
2771 exit_a:
2772 i_5 = PHI<i_2>;
2774 merge_bb:
2775 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2777 update_loop:
2778 header_b:
2779 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2781 i_4 = i_3 + 1;
2782 if (cond_b)
2783 goto latch_b;
2784 else
2785 goto exit_bb;
2786 latch_b:
2787 goto header_b;
2789 exit_bb:
2791 This function creates PHI nodes at merge_bb and replaces the use of i_5
2792 in the update_loop's PHI node with the new PHI's result. */
2794 static void
2795 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2796 class loop *update_loop,
2797 edge guard_edge, edge merge_edge)
2799 location_t merge_loc, guard_loc;
2800 edge orig_e = loop_preheader_edge (skip_loop);
2801 edge update_e = loop_preheader_edge (update_loop);
2802 gphi_iterator gsi_orig, gsi_update;
2804 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2805 gsi_update = gsi_start_phis (update_loop->header));
2806 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2807 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2809 gphi *orig_phi = gsi_orig.phi ();
2810 gphi *update_phi = gsi_update.phi ();
2812 /* Generate new phi node at merge bb of the guard. */
2813 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2814 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2816 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2817 args in NEW_PHI for these edges. */
2818 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2819 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2820 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2821 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2822 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2823 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2825 /* Update phi in UPDATE_PHI. */
2826 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2830 /* LOOP and EPILOG are two consecutive loops in CFG connected by LOOP_EXIT edge
2831 and EPILOG is copied from LOOP. Function slpeel_add_loop_guard adds guard
2832 skipping from a point between the two loops to the end of EPILOG. Edges
2833 GUARD_EDGE and MERGE_EDGE are the two pred edges of merge_bb at the end of
2834 EPILOG. The CFG looks like:
2836 loop:
2837 header_a:
2838 i_1 = PHI<i_0, i_2>;
2840 i_2 = i_1 + 1;
2841 if (cond_a)
2842 goto latch_a;
2843 else
2844 goto exit_a;
2845 latch_a:
2846 goto header_a;
2848 exit_a:
2850 guard_bb:
2851 if (cond)
2852 goto merge_bb;
2853 else
2854 goto epilog_loop;
2856 ;; fall_through_bb
2858 epilog_loop:
2859 header_b:
2860 i_3 = PHI<i_2, i_4>;
2862 i_4 = i_3 + 1;
2863 if (cond_b)
2864 goto latch_b;
2865 else
2866 goto merge_bb;
2867 latch_b:
2868 goto header_b;
2870 merge_bb:
2871 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2873 exit_bb:
2874 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2876 For each name used outside EPILOG (i.e. for each name that has an lcssa
2877 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2878 args corresponding to GUARD_EDGE and MERGE_EDGE. The arg for MERGE_EDGE
2879 is the arg of the original PHI in exit_bb, the arg for GUARD_EDGE is
2880 defined by LOOP and is found in the exit bb of LOOP. The arg of the
2881 original PHI in exit_bb will also be updated. */
2883 static void
2884 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2885 const_edge loop_exit,
2886 edge guard_edge, edge merge_edge)
2888 gphi_iterator gsi;
2889 basic_block merge_bb = guard_edge->dest;
2891 gcc_assert (single_succ_p (merge_bb));
2892 edge e = single_succ_edge (merge_bb);
2893 basic_block exit_bb = e->dest;
2895 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2897 gphi *update_phi = gsi.phi ();
2898 tree old_arg = PHI_ARG_DEF (update_phi, e->dest_idx);
2900 tree merge_arg = NULL_TREE;
2902 /* If the old argument is a SSA_NAME use its current_def. */
2903 if (TREE_CODE (old_arg) == SSA_NAME)
2904 merge_arg = get_current_def (old_arg);
2905 /* If it's a constant or doesn't have a current_def, just use the old
2906 argument. */
2907 if (!merge_arg)
2908 merge_arg = old_arg;
2910 tree guard_arg = find_guard_arg (loop, loop_exit, epilog,
2911 update_phi, e->dest_idx);
2912 /* If the var is live after loop but not a reduction, we simply
2913 use the old arg. */
2914 if (!guard_arg)
2915 guard_arg = old_arg;
2917 /* Create new phi node in MERGE_BB: */
2918 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2919 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2921 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2922 the two PHI args in merge_phi for these edges. */
2923 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2924 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2926 /* Update the original phi in exit_bb. */
2927 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2931 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2932 Return a value that equals:
2934 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2935 - SKIP_VALUE when the main loop is skipped. */
2937 tree
2938 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2939 tree skip_value)
2941 gcc_assert (loop_vinfo->main_loop_edge);
2943 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2944 basic_block bb = loop_vinfo->main_loop_edge->dest;
2945 gphi *new_phi = create_phi_node (phi_result, bb);
2946 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2947 UNKNOWN_LOCATION);
2948 add_phi_arg (new_phi, skip_value,
2949 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2950 return phi_result;
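/* Editor's note: illustrative sketch, not part of the original sources.  The
   PHI created above is, at the source level, a select between the value the
   main loop produced and the value to use when the main loop was skipped:

     int
     main_loop_result (int main_loop_value, int skip_value, int main_loop_ran)
     {
       return main_loop_ran ? main_loop_value : skip_value;
     }
 */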
2953 /* Function vect_do_peeling.
2955 Input:
2956 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2958 preheader:
2959 LOOP:
2960 header_bb:
2961 loop_body
2962 if (exit_loop_cond) goto exit_bb
2963 else goto header_bb
2964 exit_bb:
2966 - NITERS: The number of iterations of the loop.
2967 - NITERSM1: The number of iterations of the loop's latch.
2968 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2969 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2970 CHECK_PROFITABILITY is true.
2971 Output:
2972 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2973 iterate after vectorization; see vect_set_loop_condition for details.
2974 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2975 should be set to the number of scalar iterations handled by the
2976 vector loop. The SSA name is only used on exit from the loop.
2978 This function peels prolog and epilog from the loop, adds guards skipping
2979 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2980 would look like:
2982 guard_bb_1:
2983 if (prefer_scalar_loop) goto merge_bb_1
2984 else goto guard_bb_2
2986 guard_bb_2:
2987 if (skip_prolog) goto merge_bb_2
2988 else goto prolog_preheader
2990 prolog_preheader:
2991 PROLOG:
2992 prolog_header_bb:
2993 prolog_body
2994 if (exit_prolog_cond) goto prolog_exit_bb
2995 else goto prolog_header_bb
2996 prolog_exit_bb:
2998 merge_bb_2:
3000 vector_preheader:
3001 VECTOR LOOP:
3002 vector_header_bb:
3003 vector_body
3004 if (exit_vector_cond) goto vector_exit_bb
3005 else goto vector_header_bb
3006 vector_exit_bb:
3008 guard_bb_3:
3009 if (skip_epilog) goto merge_bb_3
3010 else goto epilog_preheader
3012 merge_bb_1:
3014 epilog_preheader:
3015 EPILOG:
3016 epilog_header_bb:
3017 epilog_body
3018 if (exit_epilog_cond) goto merge_bb_3
3019 else goto epilog_header_bb
3021 merge_bb_3:
3023 Note this function peels the prolog and epilog, and adds the guards,
3024 only when necessary.
3025 This function returns the epilogue loop if a decision was made to vectorize
3026 it, otherwise NULL.
3028 The analysis resulting in this epilogue loop's loop_vec_info was performed
3029 in the same vect_analyze_loop call as the main loop's. At that time
3030 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3031 vectorization factors than the main loop. This list is stored in the main
3032 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
3033 vectorize the epilogue loop for a lower vectorization factor, the
3034 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
3035 updated and linked to the epilogue loop. This is later used to vectorize
3036 the epilogue. The reason the loop_vec_info needs updating is that it was
3037 constructed based on the original main loop, and the epilogue loop is a
3038 copy of this loop, so all links pointing to statements in the original loop
3039 need updating. Furthermore, these loop_vec_infos share the
3040 data_reference's records, which will also need to be updated.
3042 TODO: Guard for prefer_scalar_loop should be emitted along with
3043 versioning conditions if loop versioning is needed. */
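/* Editor's note: illustrative sketch, not part of the original sources.  At
   the source level the peeled structure described above corresponds to the
   following, assuming VF == 4, 4-byte ints, peeling for alignment and that
   pointers fit in unsigned long (all names are hypothetical):

     void
     peeled_copy (int *a, unsigned n)
     {
       unsigned i = 0;
       unsigned misalign = ((unsigned long) a / sizeof (int)) & 3;
       unsigned prolog_niters = (4 - misalign) & 3;  // peel to 16-byte alignment
       if (n >= prolog_niters + 4)       // skip_vector guard: enough iterations?
         {
           for (; i < prolog_niters; i++)        // PROLOG
             a[i] += 1;
           for (; i + 4 <= n; i += 4)            // VECTOR LOOP
             for (unsigned j = 0; j < 4; j++)    // stands for one vector op
               a[i + j] += 1;
         }
       for (; i < n; i++)                        // EPILOG / scalar fallback
         a[i] += 1;
     }
 */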
3046 class loop *
3047 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3048 tree *niters_vector, tree *step_vector,
3049 tree *niters_vector_mult_vf_var, int th,
3050 bool check_profitability, bool niters_no_overflow,
3051 tree *advance)
3053 edge e, guard_e;
3054 tree type = TREE_TYPE (niters), guard_cond;
3055 basic_block guard_bb, guard_to;
3056 profile_probability prob_prolog, prob_vector, prob_epilog;
3057 int estimated_vf;
3058 int prolog_peeling = 0;
3059 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
3060 /* We currently do not support prolog peeling if the target alignment is not
3061 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
3062 target alignment being constant. */
3063 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3064 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
3065 return NULL;
3067 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3068 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3070 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3071 poly_uint64 bound_epilog = 0;
3072 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3073 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3074 bound_epilog += vf - 1;
3075 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3076 bound_epilog += 1;
3077 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3078 poly_uint64 bound_scalar = bound_epilog;
3080 if (!prolog_peeling && !epilog_peeling)
3081 return NULL;
3083 /* Before doing any peeling make sure to reset debug binds outside of
3084 the loop referring to defs not in LC SSA. */
3085 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3086 for (unsigned i = 0; i < loop->num_nodes; ++i)
3088 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3089 imm_use_iterator ui;
3090 gimple *use_stmt;
3091 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3092 gsi_next (&gsi))
3094 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3095 if (gimple_debug_bind_p (use_stmt)
3096 && loop != gimple_bb (use_stmt)->loop_father
3097 && !flow_loop_nested_p (loop,
3098 gimple_bb (use_stmt)->loop_father))
3100 gimple_debug_bind_reset_value (use_stmt);
3101 update_stmt (use_stmt);
3104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3105 gsi_next (&gsi))
3107 ssa_op_iter op_iter;
3108 def_operand_p def_p;
3109 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3110 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3111 if (gimple_debug_bind_p (use_stmt)
3112 && loop != gimple_bb (use_stmt)->loop_father
3113 && !flow_loop_nested_p (loop,
3114 gimple_bb (use_stmt)->loop_father))
3116 gimple_debug_bind_reset_value (use_stmt);
3117 update_stmt (use_stmt);
3122 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3123 estimated_vf = vect_vf_for_cost (loop_vinfo);
3124 if (estimated_vf == 2)
3125 estimated_vf = 3;
3126 prob_prolog = prob_epilog = profile_probability::guessed_always ()
3127 .apply_scale (estimated_vf - 1, estimated_vf);
3129 class loop *prolog, *epilog = NULL;
3130 class loop *first_loop = loop;
3131 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3133 /* SSA form needs to be up-to-date since we are going to manually
3134 update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3135 update-SSA state after that, so we have to make sure not to lose any
3136 pending update needs. */
3137 gcc_assert (!need_ssa_update_p (cfun));
3139 /* If we're vectorizing an epilogue loop, we have ensured that the
3140 virtual operand is in SSA form throughout the vectorized main loop.
3141 Normally it is possible to trace the updated
3142 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3143 back to scalar-stmt vuses, meaning that the effect of the SSA update
3144 remains local to the main loop. However, there are rare cases in
3145 which the vectorized loop should have vdefs even when the original scalar
3146 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3147 introduces clobbers of the temporary vector array, which in turn
3148 needs new vdefs. If the scalar loop doesn't write to memory, these
3149 new vdefs will be the only ones in the vector loop.
3150 We are currently deferring the update of virtual SSA form and the
3151 creation of a virtual PHI for this case so we do not have to make sure
3152 the newly introduced virtual def is in LCSSA form. */
3154 if (MAY_HAVE_DEBUG_BIND_STMTS)
3156 gcc_assert (!adjust_vec.exists ());
3157 adjust_vec.create (32);
3159 initialize_original_copy_tables ();
3161 /* Record the anchor bb at which the guard should be placed if the scalar
3162 loop might be preferred. */
3163 basic_block anchor = loop_preheader_edge (loop)->src;
3165 /* Generate the number of iterations for the prolog loop. We do this here
3166 so that we can also get the upper bound on the number of iterations. */
3167 tree niters_prolog;
3168 int bound_prolog = 0;
3169 if (prolog_peeling)
3171 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3172 &bound_prolog);
3173 /* If alignment peeling is known, we will always execute the prolog. */
3174 if (TREE_CODE (niters_prolog) == INTEGER_CST)
3175 prob_prolog = profile_probability::always ();
3177 else
3178 niters_prolog = build_int_cst (type, 0);
3180 loop_vec_info epilogue_vinfo = NULL;
3181 if (vect_epilogues)
3183 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3184 loop_vinfo->epilogue_vinfos.ordered_remove (0);
3187 tree niters_vector_mult_vf = NULL_TREE;
3188 /* Save NITERS before the loop, as it may be changed by the prologue. */
3189 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3190 edge update_e = NULL, skip_e = NULL;
3191 unsigned int lowest_vf = constant_lower_bound (vf);
3192 /* Prolog loop may be skipped. */
3193 bool skip_prolog = (prolog_peeling != 0);
3194 /* Skip over this loop to the epilog when there are not enough iterations
3195 to enter the vectorized loop, i.e. perform runtime checks on NITERS to
3196 decide whether to skip the current vectorized loop. If we know the
3197 number of scalar iterations we only add such a runtime check when that
3198 number may be smaller than the number of iterations required to enter
3199 the vectorized loop; for this we use the upper bounds on the prolog and
3200 epilog peeling. When we don't know the number of iterations and don't
3201 require versioning it is because we have asserted that there are enough
3202 scalar iterations to enter the main loop, so this skip is not necessary.
3203 When we are versioning then we only add such a skip if we have chosen
3204 to vectorize the epilogue. */
3206 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3207 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3208 bound_prolog + bound_epilog)
3209 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3210 || vect_epilogues));
3211 /* Epilog loop must be executed if the number of iterations for epilog
3212 loop is known at compile time, otherwise we need to add a check at
3213 the end of vector loop and skip to the end of epilog loop. */
3214 bool skip_epilog = (prolog_peeling < 0
3215 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3216 || !vf.is_constant ());
3217 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
3218 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3219 skip_epilog = false;
3221 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3222 auto_vec<profile_count> original_counts;
3223 basic_block *original_bbs = NULL;
3225 if (skip_vector)
3227 split_edge (loop_preheader_edge (loop));
3229 if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3231 original_bbs = get_loop_body (loop);
3232 for (unsigned int i = 0; i < loop->num_nodes; i++)
3233 original_counts.safe_push(original_bbs[i]->count);
3236 /* Due to the order in which we peel prolog and epilog, we first
3237 propagate probability to the whole loop. The purpose is to
3238 avoid adjusting probabilities of both prolog and vector loops
3239 separately. Note in this case, the probability of epilog loop
3240 needs to be scaled back later. */
3241 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3242 if (prob_vector.initialized_p ())
3244 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3245 scale_loop_profile (loop, prob_vector, -1);
3249 if (vect_epilogues)
3251 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3252 use the original scalar loop as remaining epilogue if necessary. */
3253 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3254 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3255 LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3256 = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3259 if (prolog_peeling)
3261 e = loop_preheader_edge (loop);
3262 edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3263 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e));
3265 /* Peel prolog and put it on preheader edge of loop. */
3266 edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3267 edge prolog_e = NULL;
3268 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3269 scalar_loop, scalar_e,
3270 e, &prolog_e);
3271 gcc_assert (prolog);
3272 prolog->force_vectorize = false;
3274 first_loop = prolog;
3275 reset_original_copy_tables ();
3277 /* Update the number of iterations for prolog loop. */
3278 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3279 vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3280 step_prolog, NULL_TREE, false);
3282 /* Skip the prolog loop. */
3283 if (skip_prolog)
3285 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3286 niters_prolog, build_int_cst (type, 0));
3287 guard_bb = loop_preheader_edge (prolog)->src;
3288 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3289 guard_to = split_edge (loop_preheader_edge (loop));
3290 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3291 guard_to, guard_bb,
3292 prob_prolog.invert (),
3293 irred_flag);
3294 e = EDGE_PRED (guard_to, 0);
3295 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3296 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3298 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3299 scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3302 /* Update init address of DRs. */
3303 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3304 /* Update niters for vector loop. */
3305 LOOP_VINFO_NITERS (loop_vinfo)
3306 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3307 LOOP_VINFO_NITERSM1 (loop_vinfo)
3308 = fold_build2 (MINUS_EXPR, type,
3309 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3310 bool new_var_p = false;
3311 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3312 /* It's guaranteed that the vector loop bound before vectorization is at
3313 least VF, so set range information for the newly generated var. */
3314 if (new_var_p)
3316 value_range vr (type,
3317 wi::to_wide (build_int_cst (type, lowest_vf)),
3318 wi::to_wide (TYPE_MAX_VALUE (type)));
3319 set_range_info (niters, vr);
3322 /* Prolog iterates at most bound_prolog times, latch iterates at
3323 most bound_prolog - 1 times. */
3324 record_niter_bound (prolog, bound_prolog - 1, false, true);
3325 delete_update_ssa ();
3326 adjust_vec_debug_stmts ();
3327 scev_reset ();
3329 basic_block bb_before_epilog = NULL;
3331 if (epilog_peeling)
3333 e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3334 gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3336 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3337 said epilog then we should use a copy of the main loop as a starting
3338 point. This loop may have already had some preliminary transformations
3339 to allow for more optimal vectorization, for example if-conversion.
3340 If we are not vectorizing the epilog then we should use the scalar loop
3341 as the transformations mentioned above make less or no sense when not
3342 vectorizing. */
3343 edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3344 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3345 edge epilog_e = vect_epilogues ? e : scalar_e;
3346 edge new_epilog_e = NULL;
3347 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog,
3348 epilog_e, e,
3349 &new_epilog_e);
3350 LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3351 gcc_assert (epilog);
3352 epilog->force_vectorize = false;
3353 bb_before_epilog = loop_preheader_edge (epilog)->src;
3355 /* The scalar version of the loop may be preferred. In this case, add a
3356 guard and skip to the epilog. Note this only happens when the number of
3357 iterations of the loop is unknown at compile time; otherwise this
3358 won't be vectorized. */
3359 if (skip_vector)
3361 /* Additional epilogue iteration is peeled if gap exists. */
3362 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3363 bound_prolog, bound_epilog,
3364 th, &bound_scalar,
3365 check_profitability);
3366 /* Build guard against NITERSM1 since NITERS may overflow. */
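/* NITERS can wrap to zero when the original loop iterates over the whole
range of its type, whereas NITERSM1 is always representable, hence the
guard is phrased in terms of NITERSM1.  */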
3367 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3368 guard_bb = anchor;
3369 guard_to = split_edge (loop_preheader_edge (epilog));
3370 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3371 guard_to, guard_bb,
3372 prob_vector.invert (),
3373 irred_flag);
3374 skip_e = guard_e;
3375 e = EDGE_PRED (guard_to, 0);
3376 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3377 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3379 /* Simply propagate profile info from guard_bb to guard_to which is
3380 a merge point of control flow. */
3381 profile_count old_count = guard_to->count;
3382 guard_to->count = guard_bb->count;
3384 /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3385 if (vect_epilogues || scalar_loop == NULL)
3387 gcc_assert (epilog->num_nodes == loop->num_nodes);
3388 basic_block *bbs = get_loop_body (epilog);
3389 for (unsigned int i = 0; i < epilog->num_nodes; i++)
3391 gcc_assert (get_bb_original (bbs[i]) == original_bbs[i]);
3392 bbs[i]->count = original_counts[i];
3394 free (bbs);
3395 free (original_bbs);
3397 else if (old_count.nonzero_p ())
3398 scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3400 /* Only need to handle basic block before epilog loop if it's not
3401 the guard_bb, which is the case when skip_vector is true. */
3402 if (guard_bb != bb_before_epilog)
3403 bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3404 bb_before_epilog = loop_preheader_edge (epilog)->src;
3406 /* If the loop is peeled a non-zero constant number of times, NITERS now
3407 refers to orig_niters - prolog_peeling; it won't overflow even if
3408 orig_niters overflows. */
3409 niters_no_overflow |= (prolog_peeling > 0);
3410 vect_gen_vector_loop_niters (loop_vinfo, niters,
3411 niters_vector, step_vector,
3412 niters_no_overflow);
3413 if (!integer_onep (*step_vector))
3415 /* On exit from the loop we will have an easy way of calculating
3416 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3417 until then. */
3418 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3419 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3420 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3422 else
3423 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3424 &niters_vector_mult_vf);
3425 /* Update IVs of original loop as if they were advanced by
3426 niters_vector_mult_vf steps. */
3427 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3428 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3429 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3430 update_e);
3432 if (skip_epilog)
3434 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3435 niters, niters_vector_mult_vf);
3436 guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3437 edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3438 guard_to = split_edge (epilog_e);
3439 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3440 skip_vector ? anchor : guard_bb,
3441 prob_epilog.invert (),
3442 irred_flag);
3443 if (vect_epilogues)
3444 epilogue_vinfo->skip_this_loop_edge = guard_e;
3445 edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3446 slpeel_update_phi_nodes_for_guard2 (loop, epilog, main_iv, guard_e,
3447 epilog_e);
3448 /* Only need to handle basic block before epilog loop if it's not
3449 the guard_bb, which is the case when skip_vector is true. */
3450 if (guard_bb != bb_before_epilog)
3452 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3454 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3456 scale_loop_profile (epilog, prob_epilog, -1);
3459 unsigned HOST_WIDE_INT bound;
3460 if (bound_scalar.is_constant (&bound))
3462 gcc_assert (bound != 0);
3463 /* -1 to convert loop iterations to latch iterations. */
3464 record_niter_bound (epilog, bound - 1, false, true);
3465 scale_loop_profile (epilog, profile_probability::always (),
3466 bound - 1);
3469 delete_update_ssa ();
3470 adjust_vec_debug_stmts ();
3471 scev_reset ();
3474 if (vect_epilogues)
3476 epilog->aux = epilogue_vinfo;
3477 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3478 LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3479 = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3481 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3483 /* We now must calculate the number of iterations (NITERS) performed by the
3484 previous loop and the number (EPILOGUE_NITERS) to be performed by the epilogue. */
3485 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3486 niters_prolog, niters_vector_mult_vf);
3488 /* If skip_vector we may skip the previous loop, we insert a phi-node to
3489 determine whether we are coming from the previous vectorized loop
3490 using the update_e edge or the skip_vector basic block using the
3491 skip_e edge. */
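/* Roughly: NITERS = PHI <niters_prolog + niters_vector_mult_vf (update_e),
0 (skip_e)>.  */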
3492 if (skip_vector)
3494 gcc_assert (update_e != NULL && skip_e != NULL);
3495 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3496 update_e->dest);
3497 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3498 gimple *stmt = gimple_build_assign (new_ssa, niters);
3499 gimple_stmt_iterator gsi;
3500 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3501 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3503 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3504 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3506 else
3508 gsi = gsi_last_bb (update_e->src);
3509 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3512 niters = new_ssa;
3513 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3514 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3515 UNKNOWN_LOCATION);
3516 niters = PHI_RESULT (new_phi);
3517 epilogue_vinfo->main_loop_edge = update_e;
3518 epilogue_vinfo->skip_main_loop_edge = skip_e;
3521 /* Set ADVANCE to the number of iterations performed by the previous
3522 loop and its prologue. */
3523 *advance = niters;
3525 /* Subtract the number of iterations performed by the vectorized loop
3526 from the number of total iterations. */
3527 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3528 before_loop_niters,
3529 niters);
3531 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3532 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3533 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3534 epilogue_niters,
3535 build_one_cst (TREE_TYPE (epilogue_niters)));
3537 /* Decide what to do if the number of epilogue iterations is not
3538 a multiple of the epilogue loop's vectorization factor.
3539 We should have rejected the loop during the analysis phase
3540 if this fails. */
3541 bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3542 gcc_assert (res);
3545 adjust_vec.release ();
3546 free_original_copy_tables ();
3548 return vect_epilogues ? epilog : NULL;
3551 /* Function vect_create_cond_for_niters_checks.
3553 Create a conditional expression that represents the run-time checks for
3554 loop's niter. The loop is guaranteed to terminate if the run-time
3555 checks hold.
3557 Input:
3558 COND_EXPR - input conditional expression. New conditions will be chained
3559 with logical AND operation.
3561 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
3562 assumptions to be checked.
3564 Output:
3565 COND_EXPR - conditional expression.
3567 The returned COND_EXPR is the conditional expression to be used in the
3568 if statement that controls which version of the loop gets executed at
3569 runtime. */
3571 static void
3572 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3574 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3576 if (*cond_expr)
3577 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3578 *cond_expr, part_cond_expr);
3579 else
3580 *cond_expr = part_cond_expr;
3583 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3584 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3586 static void
3587 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3589 if (*cond_expr)
3590 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3591 *cond_expr, part_cond_expr);
3592 else
3593 *cond_expr = part_cond_expr;
3596 /* Function vect_create_cond_for_align_checks.
3598 Create a conditional expression that represents the alignment checks for
3599 all of the data references (array element references) whose alignment must be
3600 checked at runtime.
3602 Input:
3603 COND_EXPR - input conditional expression. New conditions will be chained
3604 with logical AND operation.
3605 LOOP_VINFO - two fields of the loop information are used.
3606 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3607 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3609 Output:
3610 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3611 expression.
3612 The returned value is the conditional expression to be used in the if
3613 statement that controls which version of the loop gets executed at runtime.
3615 The algorithm makes two assumptions:
3616 1) The number of bytes "n" in a vector is a power of 2.
3617 2) An address "a" is aligned if a%n is zero and this
3618 test can be done as a&(n-1) == 0. For example, for 16
3619 byte vectors the test is a&0xf == 0. */
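/* For two data references and 16-byte vectors the generated sequence is
roughly the following (the temporary names are only illustrative):

addr2int0 = (signed ptr-size int) &dr_0;
addr2int1 = (signed ptr-size int) &dr_1;
orptrs1 = addr2int0 | addr2int1;
andmask = orptrs1 & 0xf;
cond = (andmask == 0);   // true iff all checked addresses are aligned.  */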
3621 static void
3622 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3623 tree *cond_expr,
3624 gimple_seq *cond_expr_stmt_list)
3626 const vec<stmt_vec_info> &may_misalign_stmts
3627 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3628 stmt_vec_info stmt_info;
3629 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3630 tree mask_cst;
3631 unsigned int i;
3632 tree int_ptrsize_type;
3633 char tmp_name[20];
3634 tree or_tmp_name = NULL_TREE;
3635 tree and_tmp_name;
3636 gimple *and_stmt;
3637 tree ptrsize_zero;
3638 tree part_cond_expr;
3640 /* Check that mask is one less than a power of 2, i.e., mask is
3641 all zeros followed by all ones. */
3642 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3644 int_ptrsize_type = signed_type_for (ptr_type_node);
3646 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3647 of the first vector of the i'th data reference. */
3649 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3651 gimple_seq new_stmt_list = NULL;
3652 tree addr_base;
3653 tree addr_tmp_name;
3654 tree new_or_tmp_name;
3655 gimple *addr_stmt, *or_stmt;
3656 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3657 bool negative = tree_int_cst_compare
3658 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
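/* For a negative step the first vector access starts (nunits - 1) elements
below the DR's base address, so bias the address we check accordingly.  */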
3659 tree offset = negative
3660 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3661 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3662 : size_zero_node;
3664 /* create: addr_tmp = (int)(address_of_first_vector) */
3665 addr_base =
3666 vect_create_addr_base_for_vector_ref (loop_vinfo,
3667 stmt_info, &new_stmt_list,
3668 offset);
3669 if (new_stmt_list != NULL)
3670 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3672 sprintf (tmp_name, "addr2int%d", i);
3673 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3674 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3675 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3677 /* The addresses are ORed together. */
3679 if (or_tmp_name != NULL_TREE)
3681 /* create: or_tmp = or_tmp | addr_tmp */
3682 sprintf (tmp_name, "orptrs%d", i);
3683 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3684 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3685 or_tmp_name, addr_tmp_name);
3686 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3687 or_tmp_name = new_or_tmp_name;
3689 else
3690 or_tmp_name = addr_tmp_name;
3692 } /* end for i */
3694 mask_cst = build_int_cst (int_ptrsize_type, mask);
3696 /* create: and_tmp = or_tmp & mask */
3697 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3699 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3700 or_tmp_name, mask_cst);
3701 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3703 /* Make and_tmp the left operand of the conditional test against zero.
3704 If and_tmp has a nonzero bit then some address is unaligned. */
3705 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3706 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3707 and_tmp_name, ptrsize_zero);
3708 chain_cond_expr (cond_expr, part_cond_expr);
3711 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3712 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3713 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3714 and this new condition are true. Treat a null *COND_EXPR as "true". */
3716 static void
3717 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3719 const vec<vec_object_pair> &pairs
3720 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3721 unsigned int i;
3722 vec_object_pair *pair;
3723 FOR_EACH_VEC_ELT (pairs, i, pair)
3725 tree addr1 = build_fold_addr_expr (pair->first);
3726 tree addr2 = build_fold_addr_expr (pair->second);
3727 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3728 addr1, addr2);
3729 chain_cond_expr (cond_expr, part_cond_expr);
3733 /* Create an expression that is true when all lower-bound conditions for
3734 the vectorized loop are met. Chain this condition with *COND_EXPR. */
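/* For a signed EXPR with required magnitude MIN (= lower_bounds[i].min_value)
the test built below is (unsigned) EXPR + (MIN - 1) >= 2 * MIN - 1, which
holds exactly when EXPR <= -MIN or EXPR >= MIN; e.g. for MIN == 4 it is
(unsigned) EXPR + 3 >= 7.  */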
3736 static void
3737 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3739 const vec<vec_lower_bound> &lower_bounds
3740 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3741 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3743 tree expr = lower_bounds[i].expr;
3744 tree type = unsigned_type_for (TREE_TYPE (expr));
3745 expr = fold_convert (type, expr);
3746 poly_uint64 bound = lower_bounds[i].min_value;
3747 if (!lower_bounds[i].unsigned_p)
3749 expr = fold_build2 (PLUS_EXPR, type, expr,
3750 build_int_cstu (type, bound - 1));
3751 bound += bound - 1;
3753 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3754 build_int_cstu (type, bound));
3755 chain_cond_expr (cond_expr, part_cond_expr);
3759 /* Function vect_create_cond_for_alias_checks.
3761 Create a conditional expression that represents the run-time checks for
3762 overlapping of address ranges represented by a list of data references
3763 relations passed as input.
3765 Input:
3766 COND_EXPR - input conditional expression. New conditions will be chained
3767 with logical AND operation. If it is NULL, then the function
3768 is used to return the number of alias checks.
3769 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3770 to be checked.
3772 Output:
3773 COND_EXPR - conditional expression.
3775 The returned COND_EXPR is the conditional expression to be used in the if
3776 statement that controls which version of the loop gets executed at runtime.
3779 void
3780 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3782 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3783 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3785 if (comp_alias_ddrs.is_empty ())
3786 return;
3788 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3789 &comp_alias_ddrs, cond_expr);
3790 if (dump_enabled_p ())
3791 dump_printf_loc (MSG_NOTE, vect_location,
3792 "created %u versioning for alias checks.\n",
3793 comp_alias_ddrs.length ());
3797 /* Function vect_loop_versioning.
3799 If the loop has data references that may or may not be aligned and/or
3800 has data reference relations whose independence was not proven then
3801 two versions of the loop need to be generated, one which is vectorized
3802 and one which isn't. A test is then generated to control which of the
3803 loops is executed. The test checks for the alignment of all of the
3804 data references that may or may not be aligned. An additional
3805 sequence of runtime tests is generated for each pair of DDRs whose
3806 independence was not proven. The vectorized version of the loop is
3807 executed only if both alias and alignment tests are passed.
3809 The test generated to check which version of loop is executed
3810 is modified to also check for profitability as indicated by the
3811 cost model threshold TH.
3813 The versioning precondition(s) are placed in *COND_EXPR and
3814 *COND_EXPR_STMT_LIST. */
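/* In rough pseudo-code the result of versioning is

if (cost-model check && versioning checks)
  vectorized loop
else
  scalar (unvectorized) loop

where the versioning checks are built from the alignment, alias, niter and
simd-if conditions requested for LOOP_VINFO.  */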
3816 class loop *
3817 vect_loop_versioning (loop_vec_info loop_vinfo,
3818 gimple *loop_vectorized_call)
3820 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3821 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3822 basic_block condition_bb;
3823 gphi_iterator gsi;
3824 gimple_stmt_iterator cond_exp_gsi;
3825 basic_block merge_bb;
3826 basic_block new_exit_bb;
3827 edge new_exit_e, e;
3828 gphi *orig_phi, *new_phi;
3829 tree cond_expr = NULL_TREE;
3830 gimple_seq cond_expr_stmt_list = NULL;
3831 tree arg;
3832 profile_probability prob = profile_probability::likely ();
3833 gimple_seq gimplify_stmt_list = NULL;
3834 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3835 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3836 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3837 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3838 poly_uint64 versioning_threshold
3839 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3840 tree version_simd_if_cond
3841 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3842 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
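/* SCALAR_LOOP_ITERS is the loop's NITERSM1, so the threshold tests below
compare against TH - 1 and VERSIONING_THRESHOLD - 1, i.e. they hold when
niters >= TH and niters >= VERSIONING_THRESHOLD respectively.  */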
3844 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3845 && !ordered_p (th, versioning_threshold))
3846 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3847 build_int_cst (TREE_TYPE (scalar_loop_iters),
3848 th - 1));
3849 if (maybe_ne (versioning_threshold, 0U))
3851 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3852 build_int_cst (TREE_TYPE (scalar_loop_iters),
3853 versioning_threshold - 1));
3854 if (cond_expr)
3855 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3856 expr, cond_expr);
3857 else
3858 cond_expr = expr;
3861 tree cost_name = NULL_TREE;
3862 profile_probability prob2 = profile_probability::always ();
3863 if (cond_expr
3864 && EXPR_P (cond_expr)
3865 && (version_niter
3866 || version_align
3867 || version_alias
3868 || version_simd_if_cond))
3870 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3871 &cond_expr_stmt_list,
3872 is_gimple_val, NULL_TREE);
3873 /* Split prob () into two so that the overall probability of passing
3874 both the cost-model and versioning checks is the orig prob. */
3875 prob2 = prob = prob.sqrt ();
3878 if (version_niter)
3879 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3881 if (cond_expr)
3883 gimple_seq tem = NULL;
3884 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3885 &tem, is_gimple_condexpr_for_cond,
3886 NULL_TREE);
3887 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3890 if (version_align)
3891 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3892 &cond_expr_stmt_list);
3894 if (version_alias)
3896 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3897 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3898 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3901 if (version_simd_if_cond)
3903 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3904 if (flag_checking)
3905 if (basic_block bb
3906 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3907 gcc_assert (bb != loop->header
3908 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3909 && (scalar_loop == NULL
3910 || (bb != scalar_loop->header
3911 && dominated_by_p (CDI_DOMINATORS,
3912 scalar_loop->header, bb))));
3913 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3914 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3915 version_simd_if_cond, zero);
3916 if (cond_expr)
3917 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3918 c, cond_expr);
3919 else
3920 cond_expr = c;
3921 if (dump_enabled_p ())
3922 dump_printf_loc (MSG_NOTE, vect_location,
3923 "created versioning for simd if condition check.\n");
3926 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3927 &gimplify_stmt_list,
3928 is_gimple_condexpr_for_cond, NULL_TREE);
3929 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3931 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3932 invariant in. */
3933 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3934 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3935 !gsi_end_p (gsi); gsi_next (&gsi))
3937 gimple *stmt = gsi_stmt (gsi);
3938 update_stmt (stmt);
3939 ssa_op_iter iter;
3940 use_operand_p use_p;
3941 basic_block def_bb;
3942 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3943 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3944 && flow_bb_inside_loop_p (outermost, def_bb))
3945 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3948 /* Search for the outermost loop we can version. Avoid versioning of
3949 non-perfect nests but allow if-conversion versioned loops inside. */
3950 class loop *loop_to_version = loop;
3951 if (flow_loop_nested_p (outermost, loop))
3953 if (dump_enabled_p ())
3954 dump_printf_loc (MSG_NOTE, vect_location,
3955 "trying to apply versioning to outer loop %d\n",
3956 outermost->num);
3957 if (outermost->num == 0)
3958 outermost = superloop_at_depth (loop, 1);
3959 /* And avoid applying versioning on non-perfect nests. */
3960 while (loop_to_version != outermost
3961 && (e = single_exit (loop_outer (loop_to_version)))
3962 && !(e->flags & EDGE_COMPLEX)
3963 && (!loop_outer (loop_to_version)->inner->next
3964 || vect_loop_vectorized_call (loop_to_version))
3965 && (!loop_outer (loop_to_version)->inner->next
3966 || !loop_outer (loop_to_version)->inner->next->next))
3967 loop_to_version = loop_outer (loop_to_version);
3970 /* Apply versioning. If there is already a scalar version created by
3971 if-conversion, re-use that. Note we cannot re-use the copy of
3972 an if-converted outer-loop when vectorizing the inner loop only. */
3973 gcond *cond;
3974 if ((!loop_to_version->inner || loop == loop_to_version)
3975 && loop_vectorized_call)
3977 gcc_assert (scalar_loop);
3978 condition_bb = gimple_bb (loop_vectorized_call);
3979 cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
3980 gimple_cond_set_condition_from_tree (cond, cond_expr);
3981 update_stmt (cond);
3983 if (cond_expr_stmt_list)
3985 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3986 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3987 GSI_SAME_STMT);
3990 /* if-conversion uses profile_probability::always () for both paths,
3991 reset the paths' probabilities appropriately. */
3992 edge te, fe;
3993 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3994 te->probability = prob;
3995 fe->probability = prob.invert ();
3996 /* We can scale the loop counts immediately but have to postpone
3997 scaling the scalar loop because we re-use it during peeling.
3999 Ifcvt duplicates the loop preheader and loop body and produces a basic
4000 block after the loop exit. We need to scale all of that. */
4001 basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4002 preheader->count = preheader->count.apply_probability (prob * prob2);
4003 scale_loop_frequencies (loop_to_version, prob * prob2);
4004 single_exit (loop_to_version)->dest->count = preheader->count;
4005 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4007 nloop = scalar_loop;
4008 if (dump_enabled_p ())
4009 dump_printf_loc (MSG_NOTE, vect_location,
4010 "reusing %sloop version created by if conversion\n",
4011 loop_to_version != loop ? "outer " : "");
4013 else
4015 if (loop_to_version != loop
4016 && dump_enabled_p ())
4017 dump_printf_loc (MSG_NOTE, vect_location,
4018 "applying loop versioning to outer loop %d\n",
4019 loop_to_version->num);
4021 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4023 initialize_original_copy_tables ();
4024 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4025 prob * prob2, (prob * prob2).invert (),
4026 prob * prob2, (prob * prob2).invert (),
4027 true);
4028 /* We will later insert second conditional so overall outcome of
4029 both is prob * prob2. */
4030 edge true_e, false_e;
4031 extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4032 true_e->probability = prob;
4033 false_e->probability = prob.invert ();
4034 gcc_assert (nloop);
4035 nloop = get_loop_copy (loop);
4037 /* For cycle vectorization with SLP we rely on the PHI arguments
4038 appearing in the same order as the SLP node operands which for the
4039 loop PHI nodes means the preheader edge dest index needs to remain
4040 the same for the analyzed loop which also becomes the vectorized one.
4041 Make it so in case the state after versioning differs by redirecting
4042 the first edge into the header to the same destination which moves
4043 it last. */
4044 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4046 edge e = EDGE_PRED (loop->header, 0);
4047 ssa_redirect_edge (e, e->dest);
4048 flush_pending_stmts (e);
4050 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4052 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4053 reap those otherwise; they also refer to the original
4054 loops. */
4055 class loop *l = loop;
4056 while (gimple *call = vect_loop_vectorized_call (l))
4058 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4059 fold_loop_internal_call (call, boolean_false_node);
4060 l = loop_outer (l);
4062 free_original_copy_tables ();
4064 if (cond_expr_stmt_list)
4066 cond_exp_gsi = gsi_last_bb (condition_bb);
4067 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4068 GSI_SAME_STMT);
4071 /* Loop versioning violates an assumption we try to maintain during
4072 vectorization - that the loop exit block has a single predecessor.
4073 After versioning, the exit block of both loop versions is the same
4074 basic block (i.e. it has two predecessors). Just in order to simplify
4075 following transformations in the vectorizer, we fix this situation
4076 here by adding a new (empty) block on the exit-edge of the loop,
4077 with the proper loop-exit phis to maintain loop-closed-form.
4078 If loop versioning wasn't done from loop, but scalar_loop instead,
4079 merge_bb will already have just a single predecessor. */
4081 merge_bb = single_exit (loop_to_version)->dest;
4082 if (EDGE_COUNT (merge_bb->preds) >= 2)
4084 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4085 new_exit_bb = split_edge (single_exit (loop_to_version));
4086 new_exit_e = single_exit (loop_to_version);
4087 e = EDGE_SUCC (new_exit_bb, 0);
4089 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4090 gsi_next (&gsi))
4092 tree new_res;
4093 orig_phi = gsi.phi ();
4094 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4095 new_phi = create_phi_node (new_res, new_exit_bb);
4096 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4097 add_phi_arg (new_phi, arg, new_exit_e,
4098 gimple_phi_arg_location_from_edge (orig_phi, e));
4099 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4103 update_ssa (TODO_update_ssa_no_phi);
4106 /* Split the cost model check off to a separate BB. Costing assumes
4107 this is the only thing we perform when we enter the scalar loop
4108 from a failed cost decision. */
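/* Roughly, condition_bb changes from
if (cost check && versioning checks) ...
into
if (cost check) { if (versioning checks) ... } else goto scalar loop;
with the cost check ending up in its own basic block.  */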
4109 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4111 gimple *def = SSA_NAME_DEF_STMT (cost_name);
4112 gcc_assert (gimple_bb (def) == condition_bb);
4113 /* All uses of the cost check are 'true' after the check we
4114 are going to insert. */
4115 replace_uses_by (cost_name, boolean_true_node);
4116 /* And we're going to build the new single use of it. */
4117 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4118 NULL_TREE, NULL_TREE);
4119 edge e = split_block (gimple_bb (def), def);
4120 gimple_stmt_iterator gsi = gsi_for_stmt (def);
4121 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4122 edge true_e, false_e;
4123 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4124 e->flags &= ~EDGE_FALLTHRU;
4125 e->flags |= EDGE_TRUE_VALUE;
4126 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4127 e->probability = prob2;
4128 e2->probability = prob2.invert ();
4129 e->dest->count = e->count ();
4130 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4131 auto_vec<basic_block, 3> adj;
4132 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4133 son;
4134 son = next_dom_son (CDI_DOMINATORS, son))
4135 if (EDGE_COUNT (son->preds) > 1)
4136 adj.safe_push (son);
4137 for (auto son : adj)
4138 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4139 //debug_bb (condition_bb);
4140 //debug_bb (e->src);
4143 if (version_niter)
4145 /* The versioned loop could be infinite; we need to clear existing
4146 niter information which is copied from the original loop. */
4147 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4148 vect_free_loop_info_assumptions (nloop);
4151 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4152 && dump_enabled_p ())
4154 if (version_alias)
4155 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4156 vect_location,
4157 "loop versioned for vectorization because of "
4158 "possible aliasing\n");
4159 if (version_align)
4160 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4161 vect_location,
4162 "loop versioned for vectorization to enhance "
4163 "alignment\n");
4167 return nloop;