gcc/tree-vect-loop-manip.cc

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53 #include "langhooks.h"
  54 #include "tree-vector-builder.h"
  55 #include "optabs-tree.h"
  56
  57 /*************************************************************************
  58   Simple Loop Peeling Utilities
  59
  60   Utilities to support loop peeling for vectorization purposes.
  61  *************************************************************************/
  62
  63
  64 /* Renames the use *OP_P.  */
  65
  66 static void
  67 rename_use_op (use_operand_p op_p)
  68 {
  69   tree new_name;
  70
  71   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  72     return;
  73
  74   new_name = get_current_def (USE_FROM_PTR (op_p));
  75
  76   /* Something defined outside of the loop.  */
  77   if (!new_name)
  78     return;
  79
  80   /* An ordinary ssa name defined in the loop.  */
  81
  82   SET_USE (op_p, new_name);
  83 }
  84
  85
  86 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  87    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  88    true.  */
  89
  90 static void
  91 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  92 {
  93   gimple *stmt;
  94   use_operand_p use_p;
  95   ssa_op_iter iter;
  96   edge e;
  97   edge_iterator ei;
  98   class loop *loop = bb->loop_father;
  99   class loop *outer_loop = NULL;
 100
 101   if (rename_from_outer_loop)
 102     {
 103       gcc_assert (loop);
 104       outer_loop = loop_outer (loop);
 105     }
 106
 107   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 108        gsi_next (&gsi))
 109     {
 110       stmt = gsi_stmt (gsi);
 111       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 112         rename_use_op (use_p);
 113     }
 114
 115   FOR_EACH_EDGE (e, ei, bb->preds)
 116     {
 117       if (!flow_bb_inside_loop_p (loop, e->src))
 118         {
 119           if (!rename_from_outer_loop)
 120             continue;
 121           if (e->src != outer_loop->header)
 122             {
 123               if (outer_loop->inner->next)
 124                 {
 125                   /* If outer_loop has 2 inner loops, allow there to
 126                      be an extra basic block which decides which of the
 127                      two loops to use using LOOP_VECTORIZED.  */
 128                   if (!single_pred_p (e->src)
 129                       || single_pred (e->src) != outer_loop->header)
 130                     continue;
 131                 }
 132             }
 133         }
 134       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 135            gsi_next (&gsi))
 136         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 137     }
 138 }
 139
 140
 141 struct adjust_info
 142 {
 143   tree from, to;
 144   basic_block bb;
 145 };
 146
 147 /* A stack of values to be adjusted in debug stmts.  We have to
 148    process them LIFO, so that the closest substitution applies.  If we
 149    processed them FIFO, without the stack, we might substitute uses
 150    with a PHI DEF that would soon become non-dominant, and when we got
 151    to the suitable one, it wouldn't have anything to substitute any
 152    more.  */
 153 static vec<adjust_info, va_heap> adjust_vec;
 154
 155 /* Adjust any debug stmts that referenced AI->from values to use the
 156    loop-closed AI->to, if the references are dominated by AI->bb and
 157    not by the definition of AI->from.  */
 158
 159 static void
 160 adjust_debug_stmts_now (adjust_info *ai)
 161 {
 162   basic_block bbphi = ai->bb;
 163   tree orig_def = ai->from;
 164   tree new_def = ai->to;
 165   imm_use_iterator imm_iter;
 166   gimple *stmt;
 167   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 168
 169   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 170
 171   /* Adjust any debug stmts that held onto non-loop-closed
 172      references.  */
 173   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 174     {
 175       use_operand_p use_p;
 176       basic_block bbuse;
 177
 178       if (!is_gimple_debug (stmt))
 179         continue;
 180
 181       gcc_assert (gimple_debug_bind_p (stmt));
 182
 183       bbuse = gimple_bb (stmt);
 184
 185       if ((bbuse == bbphi
 186            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 187           && !(bbuse == bbdef
 188                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 189         {
 190           if (new_def)
 191             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 192               SET_USE (use_p, new_def);
 193           else
 194             {
 195               gimple_debug_bind_reset_value (stmt);
 196               update_stmt (stmt);
 197             }
 198         }
 199     }
 200 }
 201
 202 /* Adjust debug stmts as scheduled before.  */
 203
 204 static void
 205 adjust_vec_debug_stmts (void)
 206 {
 207   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 208     return;
 209
 210   gcc_assert (adjust_vec.exists ());
 211
 212   while (!adjust_vec.is_empty ())
 213     {
 214       adjust_debug_stmts_now (&adjust_vec.last ());
 215       adjust_vec.pop ();
 216     }
 217 }
 218
 219 /* Adjust any debug stmts that referenced FROM values to use the
 220    loop-closed TO, if the references are dominated by BB and not by
 221    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 222    will be postponed until adjust_vec_debug_stmts is called.  */
 223
 224 static void
 225 adjust_debug_stmts (tree from, tree to, basic_block bb)
 226 {
 227   adjust_info ai;
 228
 229   if (MAY_HAVE_DEBUG_BIND_STMTS
 230       && TREE_CODE (from) == SSA_NAME
 231       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 232       && ! virtual_operand_p (from))
 233     {
 234       ai.from = from;
 235       ai.to = to;
 236       ai.bb = bb;
 237
 238       if (adjust_vec.exists ())
 239         adjust_vec.safe_push (ai);
 240       else
 241         adjust_debug_stmts_now (&ai);
 242     }
 243 }
 244
 245 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 246    to adjust any debug stmts that referenced the old phi arg,
 247    presumably non-loop-closed references left over from other
 248    transformations.  */
 249
 250 static void
 251 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 252 {
 253   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 254
 255   gcc_assert (TREE_CODE (orig_def) != SSA_NAME
 256               || orig_def != new_def);
 257
 258   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 259
 260   if (MAY_HAVE_DEBUG_BIND_STMTS)
 261     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 262                         gimple_bb (update_phi));
 263 }
 264
 265 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 266    that the control should have during the first iteration and NEXT_CTRL is the
 267    value that it should have on subsequent iterations.  */
 268
 269 static void
 270 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 271                        tree next_ctrl)
 272 {
 273   gphi *phi = create_phi_node (ctrl, loop->header);
 274   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 275   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 276 }
 277
 278 /* Add SEQ to the end of LOOP's preheader block.  */
 279
 280 static void
 281 add_preheader_seq (class loop *loop, gimple_seq seq)
 282 {
 283   if (seq)
 284     {
 285       edge pe = loop_preheader_edge (loop);
 286       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 287       gcc_assert (!new_bb);
 288     }
 289 }
 290
 291 /* Add SEQ to the beginning of LOOP's header block.  */
 292
 293 static void
 294 add_header_seq (class loop *loop, gimple_seq seq)
 295 {
 296   if (seq)
 297     {
 298       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 299       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 300     }
 301 }
 302
 303 /* Return true if the target can interleave elements of two vectors.
 304    OFFSET is 0 if the first half of the vectors should be interleaved
 305    or 1 if the second half should.  When returning true, store the
 306    associated permutation in INDICES.  */
 307
 308 static bool
 309 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 310                         unsigned int offset)
 311 {
 312   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 313   poly_uint64 base = exact_div (nelts, 2) * offset;
 314   vec_perm_builder sel (nelts, 2, 3);
 315   for (unsigned int i = 0; i < 3; ++i)
 316     {
 317       sel.quick_push (base + i);
 318       sel.quick_push (base + i + nelts);
 319     }
 320   indices->new_vector (sel, 2, nelts);
 321   return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
 322                                *indices);
 323 }
 324
 325 /* Try to use permutes to define the masks in DEST_RGM using the masks
 326    in SRC_RGM, given that the former has twice as many masks as the
 327    latter.  Return true on success, adding any new statements to SEQ.  */
 328
 329 static bool
 330 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 331                                rgroup_controls *src_rgm)
 332 {
 333   tree src_masktype = src_rgm->type;
 334   tree dest_masktype = dest_rgm->type;
 335   machine_mode src_mode = TYPE_MODE (src_masktype);
 336   insn_code icode1, icode2;
 337   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 338       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 339                                   src_mode)) != CODE_FOR_nothing
 340       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 341                                   src_mode)) != CODE_FOR_nothing)
 342     {
 343       /* Unpacking the source masks gives at least as many mask bits as
 344          we need.  We can then VIEW_CONVERT any excess bits away.  */
 345       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 346       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 347       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 348       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 349         {
 350           tree src = src_rgm->controls[i / 2];
 351           tree dest = dest_rgm->controls[i];
 352           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 353                             ? VEC_UNPACK_HI_EXPR
 354                             : VEC_UNPACK_LO_EXPR);
 355           gassign *stmt;
 356           if (dest_masktype == unpack_masktype)
 357             stmt = gimple_build_assign (dest, code, src);
 358           else
 359             {
 360               tree temp = make_ssa_name (unpack_masktype);
 361               stmt = gimple_build_assign (temp, code, src);
 362               gimple_seq_add_stmt (seq, stmt);
 363               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 364                                           build1 (VIEW_CONVERT_EXPR,
 365                                                   dest_masktype, temp));
 366             }
 367           gimple_seq_add_stmt (seq, stmt);
 368         }
 369       return true;
 370     }
 371   vec_perm_indices indices[2];
 372   if (dest_masktype == src_masktype
 373       && interleave_supported_p (&indices[0], src_masktype, 0)
 374       && interleave_supported_p (&indices[1], src_masktype, 1))
 375     {
 376       /* The destination requires twice as many mask bits as the source, so
 377          we can use interleaving permutes to double up the number of bits.  */
 378       tree masks[2];
 379       for (unsigned int i = 0; i < 2; ++i)
 380         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 381       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 382         {
 383           tree src = src_rgm->controls[i / 2];
 384           tree dest = dest_rgm->controls[i];
 385           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 386                                               src, src, masks[i & 1]);
 387           gimple_seq_add_stmt (seq, stmt);
 388         }
 389       return true;
 390     }
 391   return false;
 392 }
 393
 394 /* Populate DEST_RGM->controls, given that they should add up to STEP.
 395
 396      STEP = MIN_EXPR <ivtmp_34, VF>;
 397
 398      First length (MIN (X, VF/N)):
 399        loop_len_15 = MIN_EXPR <STEP, VF/N>;
 400
 401      Second length:
 402        tmp = STEP - loop_len_15;
 403        loop_len_16 = MIN (tmp, VF/N);
 404
 405      Third length:
 406        tmp2 = tmp - loop_len_16;
 407        loop_len_17 = MIN (tmp2, VF/N);
 408
 409      Last length:
 410        loop_len_18 = tmp2 - loop_len_17;
 411 */
 412
 413 static void
 414 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
 415                                rgroup_controls *dest_rgm, tree step)
 416 {
 417   tree ctrl_type = dest_rgm->type;
 418   poly_uint64 nitems_per_ctrl
 419     = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
 420   tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
 421
 422   for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 423     {
 424       tree ctrl = dest_rgm->controls[i];
 425       if (i == 0)
 426         {
 427           /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
 428           gassign *assign
 429             = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
 430           gimple_seq_add_stmt (seq, assign);
 431         }
 432       else if (i == dest_rgm->controls.length () - 1)
 433         {
 434           /* Last iteration: Remain capped to the range [0, VF/N].  */
 435           gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
 436                                                  dest_rgm->controls[i - 1]);
 437           gimple_seq_add_stmt (seq, assign);
 438         }
 439       else
 440         {
 441           /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
 442           step = gimple_build (seq, MINUS_EXPR, iv_type, step,
 443                                dest_rgm->controls[i - 1]);
 444           gassign *assign
 445             = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
 446           gimple_seq_add_stmt (seq, assign);
 447         }
 448     }
 449 }
 450
 451 /* Stores the standard position for induction variable increment in belonging to
 452    LOOP_EXIT (just before the exit condition of the given exit to BSI.
 453    INSERT_AFTER is set to true if the increment should be inserted after
 454    *BSI.  */
 455
 456 void
 457 vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
 458                             bool *insert_after)
 459 {
 460   basic_block bb = loop_exit->src;
 461   *bsi = gsi_last_bb (bb);
 462   *insert_after = false;
 463 }
 464
 465 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 466    for all the rgroup controls in RGC and return a control that is nonzero
 467    when the loop needs to iterate.  Add any new preheader statements to
 468    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
 469
 470    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 471    times and has been vectorized according to LOOP_VINFO.
 472
 473    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 474    starts with NITERS_SKIP dummy iterations of the scalar loop before
 475    the real work starts.  The mask elements for these dummy iterations
 476    must be 0, to ensure that the extra iterations do not have an effect.
 477
 478    It is known that:
 479
 480      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 481
 482    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 483    variable that starts at 0 and has step:
 484
 485      VF * RGC->max_nscalars_per_iter * RGC->factor
 486
 487    might overflow before hitting a value above:
 488
 489      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 490
 491    This means that we cannot guarantee that such an induction variable
 492    would ever hit a value that produces a set of all-false masks or zero
 493    lengths for RGC.
 494
 495    Note: the cost of the code generated by this function is modeled
 496    by vect_estimate_min_profitable_iters, so changes here may need
 497    corresponding changes there.  */
 498
 499 static tree
 500 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 501                                  gimple_seq *preheader_seq,
 502                                  gimple_seq *header_seq,
 503                                  gimple_stmt_iterator loop_cond_gsi,
 504                                  rgroup_controls *rgc, tree niters,
 505                                  tree niters_skip, bool might_wrap_p,
 506                                  tree *iv_step, tree *compare_step)
 507 {
 508   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 509   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 510   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 511
 512   tree ctrl_type = rgc->type;
 513   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 514   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 515   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 516   tree length_limit = NULL_TREE;
 517   /* For length, we need length_limit to ensure length in range.  */
 518   if (!use_masks_p)
 519     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 520
 521   /* Calculate the maximum number of item values that the rgroup
 522      handles in total, the number that it handles for each iteration
 523      of the vector loop, and the number that it should skip during the
 524      first iteration of the vector loop.  */
 525   tree nitems_total = niters;
 526   tree nitems_step = build_int_cst (iv_type, vf);
 527   tree nitems_skip = niters_skip;
 528   if (nitems_per_iter != 1)
 529     {
 530       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 531          these multiplications don't overflow.  */
 532       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 533       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 534       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 535                                    nitems_total, compare_factor);
 536       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 537                                   nitems_step, iv_factor);
 538       if (nitems_skip)
 539         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 540                                     nitems_skip, compare_factor);
 541     }
 542
 543   /* Create an induction variable that counts the number of items
 544      processed.  */
 545   tree index_before_incr, index_after_incr;
 546   gimple_stmt_iterator incr_gsi;
 547   bool insert_after;
 548   edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
 549   vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
 550   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
 551     {
 552       /* Create an IV that counts down from niters_total and whose step
 553          is the (variable) amount processed in the current iteration:
 554            ...
 555            _10 = (unsigned long) count_12(D);
 556            ...
 557            # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
 558            _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
 559            ...
 560            vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
 561            ...
 562            ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
 563            ...
 564            if (ivtmp_9 > POLY_INT_CST [4, 4])
 565              goto <bb 4>; [83.33%]
 566            else
 567              goto <bb 5>; [16.67%]
 568       */
 569       nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
 570       tree step = rgc->controls.length () == 1 ? rgc->controls[0]
 571                                                : make_ssa_name (iv_type);
 572       /* Create decrement IV.  */
 573       if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
 574         {
 575           create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
 576                      insert_after, &index_before_incr, &index_after_incr);
 577           tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
 578                                    index_before_incr, nitems_step);
 579           gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
 580         }
 581       else
 582         {
 583           create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
 584                      &incr_gsi, insert_after, &index_before_incr,
 585                      &index_after_incr);
 586           gimple_seq_add_stmt (header_seq,
 587                                gimple_build_assign (step, MIN_EXPR,
 588                                                     index_before_incr,
 589                                                     nitems_step));
 590         }
 591       *iv_step = step;
 592       *compare_step = nitems_step;
 593       return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
 594                                                        : index_before_incr;
 595     }
 596
 597   /* Create increment IV.  */
 598   create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
 599              loop, &incr_gsi, insert_after, &index_before_incr,
 600              &index_after_incr);
 601
 602   tree zero_index = build_int_cst (compare_type, 0);
 603   tree test_index, test_limit, first_limit;
 604   gimple_stmt_iterator *test_gsi;
 605   if (might_wrap_p)
 606     {
 607       /* In principle the loop should stop iterating once the incremented
 608          IV reaches a value greater than or equal to:
 609
 610            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 611
 612          However, there's no guarantee that this addition doesn't overflow
 613          the comparison type, or that the IV hits a value above it before
 614          wrapping around.  We therefore adjust the limit down by one
 615          IV step:
 616
 617            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 618            -[infinite-prec] NITEMS_STEP
 619
 620          and compare the IV against this limit _before_ incrementing it.
 621          Since the comparison type is unsigned, we actually want the
 622          subtraction to saturate at zero:
 623
 624            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 625            -[sat] NITEMS_STEP
 626
 627          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 628
 629            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 630
 631          where the rightmost subtraction can be done directly in
 632          COMPARE_TYPE.  */
 633       test_index = index_before_incr;
 634       tree adjust = gimple_convert (preheader_seq, compare_type,
 635                                     nitems_step);
 636       if (nitems_skip)
 637         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 638                                adjust, nitems_skip);
 639       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 640                                  nitems_total, adjust);
 641       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 642                                  test_limit, adjust);
 643       test_gsi = &incr_gsi;
 644
 645       /* Get a safe limit for the first iteration.  */
 646       if (nitems_skip)
 647         {
 648           /* The first vector iteration can handle at most NITEMS_STEP
 649              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 650              NITEMS_SKIP to that cannot overflow.  */
 651           tree const_limit = build_int_cst (compare_type,
 652                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 653                                             * nitems_per_iter);
 654           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 655                                       nitems_total, const_limit);
 656           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 657                                       first_limit, nitems_skip);
 658         }
 659       else
 660         /* For the first iteration it doesn't matter whether the IV hits
 661            a value above NITEMS_TOTAL.  That only matters for the latch
 662            condition.  */
 663         first_limit = nitems_total;
 664     }
 665   else
 666     {
 667       /* Test the incremented IV, which will always hit a value above
 668          the bound before wrapping.  */
 669       test_index = index_after_incr;
 670       test_limit = nitems_total;
 671       if (nitems_skip)
 672         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 673                                    test_limit, nitems_skip);
 674       test_gsi = &loop_cond_gsi;
 675
 676       first_limit = test_limit;
 677     }
 678
 679   /* Convert the IV value to the comparison type (either a no-op or
 680      a demotion).  */
 681   gimple_seq test_seq = NULL;
 682   test_index = gimple_convert (&test_seq, compare_type, test_index);
 683   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 684
 685   /* Provide a definition of each control in the group.  */
 686   tree next_ctrl = NULL_TREE;
 687   tree ctrl;
 688   unsigned int i;
 689   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 690     {
 691       /* Previous controls will cover BIAS items.  This control covers the
 692          next batch.  */
 693       poly_uint64 bias = nitems_per_ctrl * i;
 694       tree bias_tree = build_int_cst (compare_type, bias);
 695
 696       /* See whether the first iteration of the vector loop is known
 697          to have a full control.  */
 698       poly_uint64 const_limit;
 699       bool first_iteration_full
 700         = (poly_int_tree_p (first_limit, &const_limit)
 701            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 702
 703       /* Rather than have a new IV that starts at BIAS and goes up to
 704          TEST_LIMIT, prefer to use the same 0-based IV for each control
 705          and adjust the bound down by BIAS.  */
 706       tree this_test_limit = test_limit;
 707       if (i != 0)
 708         {
 709           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 710                                           compare_type, this_test_limit,
 711                                           bias_tree);
 712           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 713                                           compare_type, this_test_limit,
 714                                           bias_tree);
 715         }
 716
 717       /* Create the initial control.  First include all items that
 718          are within the loop limit.  */
 719       tree init_ctrl = NULL_TREE;
 720       if (!first_iteration_full)
 721         {
 722           tree start, end;
 723           if (first_limit == test_limit)
 724             {
 725               /* Use a natural test between zero (the initial IV value)
 726                  and the loop limit.  The "else" block would be valid too,
 727                  but this choice can avoid the need to load BIAS_TREE into
 728                  a register.  */
 729               start = zero_index;
 730               end = this_test_limit;
 731             }
 732           else
 733             {
 734               /* FIRST_LIMIT is the maximum number of items handled by the
 735                  first iteration of the vector loop.  Test the portion
 736                  associated with this control.  */
 737               start = bias_tree;
 738               end = first_limit;
 739             }
 740
 741           if (use_masks_p)
 742             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 743                                         start, end, "max_mask");
 744           else
 745             {
 746               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 747               gimple_seq seq = vect_gen_len (init_ctrl, start,
 748                                              end, length_limit);
 749               gimple_seq_add_seq (preheader_seq, seq);
 750             }
 751         }
 752
 753       /* Now AND out the bits that are within the number of skipped
 754          items.  */
 755       poly_uint64 const_skip;
 756       if (nitems_skip
 757           && !(poly_int_tree_p (nitems_skip, &const_skip)
 758                && known_le (const_skip, bias)))
 759         {
 760           gcc_assert (use_masks_p);
 761           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 762                                                     bias_tree, nitems_skip);
 763           if (init_ctrl)
 764             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 765                                       init_ctrl, unskipped_mask);
 766           else
 767             init_ctrl = unskipped_mask;
 768         }
 769
 770       if (!init_ctrl)
 771         {
 772           /* First iteration is full.  */
 773           if (use_masks_p)
 774             init_ctrl = build_minus_one_cst (ctrl_type);
 775           else
 776             init_ctrl = length_limit;
 777         }
 778
 779       /* Get the control value for the next iteration of the loop.  */
 780       if (use_masks_p)
 781         {
 782           gimple_seq stmts = NULL;
 783           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 784                                       this_test_limit, "next_mask");
 785           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 786         }
 787       else
 788         {
 789           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 790           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 791                                          length_limit);
 792           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 793         }
 794
 795       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 796     }
 797
 798   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 799   if (partial_load_bias != 0)
 800     {
 801       tree adjusted_len = rgc->bias_adjusted_ctrl;
 802       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
 803                                             rgc->controls[0],
 804                                             build_int_cst
 805                                             (TREE_TYPE (rgc->controls[0]),
 806                                              partial_load_bias));
 807       gimple_seq_add_stmt (header_seq, minus);
 808     }
 809
 810   return next_ctrl;
 811 }
 812
 813 /* Set up the iteration condition and rgroup controls for LOOP, given
 814    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 815    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 816    the number of iterations of the original scalar loop that should be
 817    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 818    for vect_set_loop_condition.
 819
 820    Insert the branch-back condition before LOOP_COND_GSI and return the
 821    final gcond.  */
 822
 823 static gcond *
 824 vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
 825                                          loop_vec_info loop_vinfo, tree niters,
 826                                          tree final_iv, bool niters_maybe_zero,
 827                                          gimple_stmt_iterator loop_cond_gsi)
 828 {
 829   gimple_seq preheader_seq = NULL;
 830   gimple_seq header_seq = NULL;
 831
 832   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 833   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 834   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 835   tree orig_niters = niters;
 836
 837   /* Type of the initial value of NITERS.  */
 838   tree ni_actual_type = TREE_TYPE (niters);
 839   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 840   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 841   if (niters_skip)
 842     niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
 843
 844   /* Convert NITERS to the same size as the compare.  */
 845   if (compare_precision > ni_actual_precision
 846       && niters_maybe_zero)
 847     {
 848       /* We know that there is always at least one iteration, so if the
 849          count is zero then it must have wrapped.  Cope with this by
 850          subtracting 1 before the conversion and adding 1 to the result.  */
 851       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 852       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 853                              niters, build_minus_one_cst (ni_actual_type));
 854       niters = gimple_convert (&preheader_seq, compare_type, niters);
 855       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 856                              niters, build_one_cst (compare_type));
 857     }
 858   else
 859     niters = gimple_convert (&preheader_seq, compare_type, niters);
 860
 861   /* Iterate over all the rgroups and fill in their controls.  We could use
 862      the first control from any rgroup for the loop condition; here we
 863      arbitrarily pick the last.  */
 864   tree test_ctrl = NULL_TREE;
 865   tree iv_step = NULL_TREE;
 866   tree compare_step = NULL_TREE;
 867   rgroup_controls *rgc;
 868   rgroup_controls *iv_rgc = nullptr;
 869   unsigned int i;
 870   auto_vec<rgroup_controls> *controls = use_masks_p
 871                                           ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
 872                                           : &LOOP_VINFO_LENS (loop_vinfo);
 873   FOR_EACH_VEC_ELT (*controls, i, rgc)
 874     if (!rgc->controls.is_empty ())
 875       {
 876         /* First try using permutes.  This adds a single vector
 877            instruction to the loop for each mask, but needs no extra
 878            loop invariants or IVs.  */
 879         unsigned int nmasks = i + 1;
 880         if (use_masks_p && (nmasks & 1) == 0)
 881           {
 882             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 883             if (!half_rgc->controls.is_empty ()
 884                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 885               continue;
 886           }
 887
 888         if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 889             || !iv_rgc
 890             || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
 891                 != rgc->max_nscalars_per_iter * rgc->factor))
 892           {
 893             /* See whether zero-based IV would ever generate all-false masks
 894                or zero length before wrapping around.  */
 895             bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 896
 897             /* Set up all controls for this group.  */
 898             test_ctrl
 899               = vect_set_loop_controls_directly (loop, loop_vinfo,
 900                                                  &preheader_seq, &header_seq,
 901                                                  loop_cond_gsi, rgc, niters,
 902                                                  niters_skip, might_wrap_p,
 903                                                  &iv_step, &compare_step);
 904
 905             iv_rgc = rgc;
 906           }
 907
 908         if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 909             && rgc->controls.length () > 1)
 910           {
 911             /* vect_set_loop_controls_directly creates an IV whose step
 912                is equal to the expected sum of RGC->controls.  Use that
 913                information to populate RGC->controls.  */
 914             tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 915             gcc_assert (iv_step);
 916             vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
 917           }
 918       }
 919
 920   /* Emit all accumulated statements.  */
 921   add_preheader_seq (loop, preheader_seq);
 922   add_header_seq (loop, header_seq);
 923
 924   /* Get a boolean result that tells us whether to iterate.  */
 925   gcond *cond_stmt;
 926   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 927       && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
 928     {
 929       gcc_assert (compare_step);
 930       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
 931       cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
 932                                      NULL_TREE);
 933     }
 934   else
 935     {
 936       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 937       tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 938       cond_stmt
 939         = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
 940     }
 941   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 942
 943   /* The loop iterates (NITERS - 1) / VF + 1 times.
 944      Subtract one from this to get the latch count.  */
 945   tree step = build_int_cst (compare_type,
 946                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 947   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 948                                        build_minus_one_cst (compare_type));
 949   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 950                                      niters_minus_one, step);
 951
 952   if (final_iv)
 953     {
 954       gassign *assign;
 955       /* If vectorizing an inverted early break loop we have to restart the
 956          scalar loop at niters - vf.  This matches what we do in
 957          vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
 958       if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
 959         {
 960           tree ftype = TREE_TYPE (orig_niters);
 961           tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 962           assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
 963         }
 964        else
 965         assign = gimple_build_assign (final_iv, orig_niters);
 966       gsi_insert_on_edge_immediate (exit_edge, assign);
 967     }
 968
 969   return cond_stmt;
 970 }
 971
 972 /* Set up the iteration condition and rgroup controls for LOOP in AVX512
 973    style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
 974    vectorized loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 975    the number of iterations of the original scalar loop that should be
 976    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 977    for vect_set_loop_condition.
 978
 979    Insert the branch-back condition before LOOP_COND_GSI and return the
 980    final gcond.  */
 981
 982 static gcond *
 983 vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
 984                                          edge exit_edge,
 985                                          loop_vec_info loop_vinfo, tree niters,
 986                                          tree final_iv,
 987                                          bool niters_maybe_zero,
 988                                          gimple_stmt_iterator loop_cond_gsi)
 989 {
 990   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 991   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 992   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 993   tree orig_niters = niters;
 994   gimple_seq preheader_seq = NULL;
 995
 996   /* Create an IV that counts down from niters and whose step
 997      is the number of iterations processed in the current iteration.
 998      Produce the controls with compares like the following.
 999
1000        # iv_2 = PHI <niters, iv_3>
1001        rem_4 = MIN <iv_2, VF>;
1002        remv_6 = { rem_4, rem_4, rem_4, ... }
1003        mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6;
1004        iv_3 = iv_2 - VF;
1005        if (iv_2 > VF)
1006          continue;
1007
1008      Where the constant is built with elements at most VF - 1 and
1009      repetitions according to max_nscalars_per_iter which is guarnateed
1010      to be the same within a group.  */
1011
1012   /* Convert NITERS to the determined IV type.  */
1013   if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1014       && niters_maybe_zero)
1015     {
1016       /* We know that there is always at least one iteration, so if the
1017          count is zero then it must have wrapped.  Cope with this by
1018          subtracting 1 before the conversion and adding 1 to the result.  */
1019       gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1020       niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1021                              niters, build_minus_one_cst (TREE_TYPE (niters)));
1022       niters = gimple_convert (&preheader_seq, iv_type, niters);
1023       niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1024                              niters, build_one_cst (iv_type));
1025     }
1026   else
1027     niters = gimple_convert (&preheader_seq, iv_type, niters);
1028
1029   /* Bias the initial value of the IV in case we need to skip iterations
1030      at the beginning.  */
1031   tree niters_adj = niters;
1032   if (niters_skip)
1033     {
1034       tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1035       niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1036                                  iv_type, niters, skip);
1037     }
1038
1039   /* The iteration step is the vectorization factor.  */
1040   tree iv_step = build_int_cst (iv_type, vf);
1041
1042   /* Create the decrement IV.  */
1043   tree index_before_incr, index_after_incr;
1044   gimple_stmt_iterator incr_gsi;
1045   bool insert_after;
1046   vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1047   create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1048              &incr_gsi, insert_after, &index_before_incr,
1049              &index_after_incr);
1050
1051   /* Iterate over all the rgroups and fill in their controls.  */
1052   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1053     {
1054       if (rgc.controls.is_empty ())
1055         continue;
1056
1057       tree ctrl_type = rgc.type;
1058       poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1059
1060       tree vectype = rgc.compare_type;
1061
1062       /* index_after_incr is the IV specifying the remaining iterations in
1063          the next iteration.  */
1064       tree rem = index_after_incr;
1065       /* When the data type for the compare to produce the mask is
1066          smaller than the IV type we need to saturate.  Saturate to
1067          the smallest possible value (IV_TYPE) so we only have to
1068          saturate once (CSE will catch redundant ones we add).  */
1069       if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1070         rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1071                             UNKNOWN_LOCATION,
1072                             MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1073       rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1074                             UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1075
1076       /* Build a data vector composed of the remaining iterations.  */
1077       rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078                                           UNKNOWN_LOCATION, vectype, rem);
1079
1080       /* Provide a definition of each vector in the control group.  */
1081       tree next_ctrl = NULL_TREE;
1082       tree first_rem = NULL_TREE;
1083       tree ctrl;
1084       unsigned int i;
1085       FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1086         {
1087           /* Previous controls will cover BIAS items.  This control covers the
1088              next batch.  */
1089           poly_uint64 bias = nitems_per_ctrl * i;
1090
1091           /* Build the constant to compare the remaining iters against,
1092              this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1093              split into pieces.  */
1094           unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1095           tree_vector_builder builder (vectype, n, 1);
1096           for (unsigned i = 0; i < n; ++i)
1097             {
1098               unsigned HOST_WIDE_INT val
1099                 = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1100               gcc_assert (val < vf.to_constant ());
1101               builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1102             }
1103           tree cmp_series = builder.build ();
1104
1105           /* Create the initial control.  First include all items that
1106              are within the loop limit.  */
1107           tree init_ctrl = NULL_TREE;
1108           poly_uint64 const_limit;
1109           /* See whether the first iteration of the vector loop is known
1110              to have a full control.  */
1111           if (poly_int_tree_p (niters, &const_limit)
1112               && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1113             init_ctrl = build_minus_one_cst (ctrl_type);
1114           else
1115             {
1116               /* The remaining work items initially are niters.  Saturate,
1117                  splat and compare.  */
1118               if (!first_rem)
1119                 {
1120                   first_rem = niters;
1121                   if (TYPE_PRECISION (TREE_TYPE (vectype))
1122                       < TYPE_PRECISION (iv_type))
1123                     first_rem = gimple_build (&preheader_seq,
1124                                               MIN_EXPR, TREE_TYPE (first_rem),
1125                                               first_rem, iv_step);
1126                   first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1127                                               first_rem);
1128                   first_rem = gimple_build_vector_from_val (&preheader_seq,
1129                                                             vectype, first_rem);
1130                 }
1131               init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1132                                         cmp_series, first_rem);
1133             }
1134
1135           /* Now AND out the bits that are within the number of skipped
1136              items.  */
1137           poly_uint64 const_skip;
1138           if (niters_skip
1139               && !(poly_int_tree_p (niters_skip, &const_skip)
1140                    && known_le (const_skip, bias)))
1141             {
1142               /* For integer mode masks it's cheaper to shift out the bits
1143                  since that avoids loading a constant.  */
1144               gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1145               init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1146                                         lang_hooks.types.type_for_mode
1147                                           (TYPE_MODE (ctrl_type), 1),
1148                                         init_ctrl);
1149               /* ???  But when the shift amount isn't constant this requires
1150                  a round-trip to GRPs.  We could apply the bias to either
1151                  side of the compare instead.  */
1152               tree shift = gimple_build (&preheader_seq, MULT_EXPR,
1153                                          TREE_TYPE (niters_skip), niters_skip,
1154                                          build_int_cst (TREE_TYPE (niters_skip),
1155                                                         rgc.max_nscalars_per_iter));
1156               init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1157                                         TREE_TYPE (init_ctrl),
1158                                         init_ctrl, shift);
1159               init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1160                                         ctrl_type, init_ctrl);
1161             }
1162
1163           /* Get the control value for the next iteration of the loop.  */
1164           next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1165                                     UNKNOWN_LOCATION,
1166                                     LT_EXPR, ctrl_type, cmp_series, rem);
1167
1168           vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1169         }
1170     }
1171
1172   /* Emit all accumulated statements.  */
1173   add_preheader_seq (loop, preheader_seq);
1174
1175   /* Adjust the exit test using the decrementing IV.  */
1176   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1177   /* When we peel for alignment with niter_skip != 0 this can
1178      cause niter + niter_skip to wrap and since we are comparing the
1179      value before the decrement here we get a false early exit.
1180      We can't compare the value after decrement either because that
1181      decrement could wrap as well as we're not doing a saturating
1182      decrement.  To avoid this situation we force a larger
1183      iv_type.  */
1184   gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1185                                         NULL_TREE, NULL_TREE);
1186   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1187
1188   /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1189      Subtract one from this to get the latch count.  */
1190   tree niters_minus_one
1191     = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1192                    build_minus_one_cst (TREE_TYPE (orig_niters)));
1193   tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1194   if (niters_skip)
1195     niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1196                                fold_convert (iv_type, niters_skip));
1197   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1198                                      niters_adj2, iv_step);
1199
1200   if (final_iv)
1201     {
1202       gassign *assign;
1203       /* If vectorizing an inverted early break loop we have to restart the
1204          scalar loop at niters - vf.  This matches what we do in
1205          vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
1206       if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1207         {
1208           tree ftype = TREE_TYPE (orig_niters);
1209           tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1210           assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1211         }
1212        else
1213         assign = gimple_build_assign (final_iv, orig_niters);
1214       gsi_insert_on_edge_immediate (exit_edge, assign);
1215     }
1216
1217   return cond_stmt;
1218 }
1219
1220
1221 /* Like vect_set_loop_condition, but handle the case in which the vector
1222    loop handles exactly VF scalars per iteration.  */
1223
1224 static gcond *
1225 vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1226                                 class loop *loop, tree niters, tree step,
1227                                 tree final_iv, bool niters_maybe_zero,
1228                                 gimple_stmt_iterator loop_cond_gsi)
1229 {
1230   tree indx_before_incr, indx_after_incr;
1231   gcond *cond_stmt;
1232   gcond *orig_cond;
1233   edge pe = loop_preheader_edge (loop);
1234   gimple_stmt_iterator incr_gsi;
1235   bool insert_after;
1236   enum tree_code code;
1237   tree niters_type = TREE_TYPE (niters);
1238
1239   orig_cond = get_loop_exit_condition (exit_edge);
1240   gcc_assert (orig_cond);
1241   loop_cond_gsi = gsi_for_stmt (orig_cond);
1242
1243   tree init, limit;
1244   if (!niters_maybe_zero && integer_onep (step))
1245     {
1246       /* In this case we can use a simple 0-based IV:
1247
1248          A:
1249            x = 0;
1250            do
1251              {
1252                ...
1253                x += 1;
1254              }
1255            while (x < NITERS);  */
1256       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1257       init = build_zero_cst (niters_type);
1258       limit = niters;
1259     }
1260   else
1261     {
1262       /* The following works for all values of NITERS except 0:
1263
1264          B:
1265            x = 0;
1266            do
1267              {
1268                ...
1269                x += STEP;
1270              }
1271            while (x <= NITERS - STEP);
1272
1273          so that the loop continues to iterate if x + STEP - 1 < NITERS
1274          but stops if x + STEP - 1 >= NITERS.
1275
1276          However, if NITERS is zero, x never hits a value above NITERS - STEP
1277          before wrapping around.  There are two obvious ways of dealing with
1278          this:
1279
1280          - start at STEP - 1 and compare x before incrementing it
1281          - start at -1 and compare x after incrementing it
1282
1283          The latter is simpler and is what we use.  The loop in this case
1284          looks like:
1285
1286          C:
1287            x = -1;
1288            do
1289              {
1290                ...
1291                x += STEP;
1292              }
1293            while (x < NITERS - STEP);
1294
1295          In both cases the loop limit is NITERS - STEP.  */
1296       gimple_seq seq = NULL;
1297       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1298       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1299       if (seq)
1300         {
1301           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1302           gcc_assert (!new_bb);
1303         }
1304       if (niters_maybe_zero)
1305         {
1306           /* Case C.  */
1307           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1308           init = build_all_ones_cst (niters_type);
1309         }
1310       else
1311         {
1312           /* Case B.  */
1313           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1314           init = build_zero_cst (niters_type);
1315         }
1316     }
1317
1318   vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1319   create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1320              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1321   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1322                                               true, NULL_TREE, true,
1323                                               GSI_SAME_STMT);
1324   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1325                                      true, GSI_SAME_STMT);
1326
1327   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1328                                  NULL_TREE);
1329
1330   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1331
1332   /* Record the number of latch iterations.  */
1333   if (limit == niters)
1334     /* Case A: the loop iterates NITERS times.  Subtract one to get the
1335        latch count.  */
1336     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1337                                        build_int_cst (niters_type, 1));
1338   else
1339     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1340        Subtract one from this to get the latch count.  */
1341     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1342                                        limit, step);
1343
1344   if (final_iv)
1345     {
1346       gassign *assign;
1347       gcc_assert (single_pred_p (exit_edge->dest));
1348       tree phi_dest
1349         = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1350       /* Make sure to maintain LC SSA form here and elide the subtraction
1351          if the value is zero.  */
1352       gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1353       add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1354       if (!integer_zerop (init))
1355         {
1356           assign = gimple_build_assign (final_iv, MINUS_EXPR,
1357                                         phi_dest, init);
1358           gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1359           gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1360         }
1361     }
1362
1363   return cond_stmt;
1364 }
1365
1366 /* If we're using fully-masked loops, make LOOP iterate:
1367
1368       N == (NITERS - 1) / STEP + 1
1369
1370    times.  When NITERS is zero, this is equivalent to making the loop
1371    execute (1 << M) / STEP times, where M is the precision of NITERS.
1372    NITERS_MAYBE_ZERO is true if this last case might occur.
1373
1374    If we're not using fully-masked loops, make LOOP iterate:
1375
1376       N == (NITERS - STEP) / STEP + 1
1377
1378    times, where NITERS is known to be outside the range [1, STEP - 1].
1379    This is equivalent to making the loop execute NITERS / STEP times
1380    when NITERS is nonzero and (1 << M) / STEP times otherwise.
1381    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1382
1383    If FINAL_IV is nonnull, it is an SSA name that should be set to
1384    N * STEP on exit from the loop.
1385
1386    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
1387
1388 void
1389 vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1390                          tree niters, tree step, tree final_iv,
1391                          bool niters_maybe_zero)
1392 {
1393   gcond *cond_stmt;
1394   gcond *orig_cond = get_loop_exit_condition (loop_e);
1395   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1396
1397   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1398     {
1399       if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1400         cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1401                                                                     loop_vinfo,
1402                                                                     niters, final_iv,
1403                                                                     niters_maybe_zero,
1404                                                                     loop_cond_gsi);
1405       else
1406         cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1407                                                              loop_vinfo,
1408                                                              niters, final_iv,
1409                                                              niters_maybe_zero,
1410                                                              loop_cond_gsi);
1411     }
1412   else
1413     cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1414                                                 niters,
1415                                                 step, final_iv,
1416                                                 niters_maybe_zero,
1417                                                 loop_cond_gsi);
1418
1419   /* Remove old loop exit test.  */
1420   stmt_vec_info orig_cond_info;
1421   if (loop_vinfo
1422       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1423     loop_vinfo->remove_stmt (orig_cond_info);
1424   else
1425     gsi_remove (&loop_cond_gsi, true);
1426
1427   if (dump_enabled_p ())
1428     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1429                      (gimple *) cond_stmt);
1430 }
1431
1432 /* Given LOOP this function generates a new copy of it and puts it
1433    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1434    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1435    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1436    entry or exit of LOOP.  If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1437    continuation.  This is correct for cases where one loop continues from the
1438    other like in the vectorizer, but not true for uses in e.g. loop distribution
1439    where the contents of the loop body are split but the iteration space of both
1440    copies remains the same.
1441
1442    If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
1443    dominators were updated during the peeling.  When doing early break vectorization
1444    then LOOP_VINFO needs to be provided and is used to keep track of any newly created
1445    memory references that need to be updated should we decide to vectorize.  */
1446
1447 class loop *
1448 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1449                                         class loop *scalar_loop,
1450                                         edge scalar_exit, edge e, edge *new_e,
1451                                         bool flow_loops,
1452                                         vec<basic_block> *updated_doms)
1453 {
1454   class loop *new_loop;
1455   basic_block *new_bbs, *bbs, *pbbs;
1456   bool at_exit;
1457   bool was_imm_dom;
1458   basic_block exit_dest;
1459   edge exit, new_exit;
1460   bool duplicate_outer_loop = false;
1461
1462   exit = loop_exit;
1463   at_exit = (e == exit);
1464   if (!at_exit && e != loop_preheader_edge (loop))
1465     return NULL;
1466
1467   if (scalar_loop == NULL)
1468     {
1469       scalar_loop = loop;
1470       scalar_exit = loop_exit;
1471     }
1472   else if (scalar_loop == loop)
1473     scalar_exit = loop_exit;
1474   else
1475     {
1476       /* Loop has been version, match exits up using the aux index.  */
1477       for (edge exit : get_loop_exit_edges (scalar_loop))
1478         if (exit->aux == loop_exit->aux)
1479           {
1480             scalar_exit = exit;
1481             break;
1482           }
1483
1484       gcc_assert (scalar_exit);
1485     }
1486
1487   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1488   pbbs = bbs + 1;
1489   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1490   /* Allow duplication of outer loops.  */
1491   if (scalar_loop->inner)
1492     duplicate_outer_loop = true;
1493
1494   /* Generate new loop structure.  */
1495   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1496   duplicate_subloops (scalar_loop, new_loop);
1497
1498   exit_dest = exit->dest;
1499   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1500                                           exit_dest) == loop->header ?
1501                  true : false);
1502
1503   /* Also copy the pre-header, this avoids jumping through hoops to
1504      duplicate the loop entry PHI arguments.  Create an empty
1505      pre-header unconditionally for this.  */
1506   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1507   edge entry_e = single_pred_edge (preheader);
1508   bbs[0] = preheader;
1509   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1510
1511   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1512             &scalar_exit, 1, &new_exit, NULL,
1513             at_exit ? loop->latch : e->src, true);
1514   exit = loop_exit;
1515   basic_block new_preheader = new_bbs[0];
1516
1517   gcc_assert (new_exit);
1518
1519   /* Record the new loop exit information.  new_loop doesn't have SCEV data and
1520      so we must initialize the exit information.  */
1521   if (new_e)
1522     *new_e = new_exit;
1523
1524   /* Before installing PHI arguments make sure that the edges
1525      into them match that of the scalar loop we analyzed.  This
1526      makes sure the SLP tree matches up between the main vectorized
1527      loop and the epilogue vectorized copies.  */
1528   if (single_succ_edge (preheader)->dest_idx
1529       != single_succ_edge (new_bbs[0])->dest_idx)
1530     {
1531       basic_block swap_bb = new_bbs[1];
1532       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1533       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1534       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1535       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1536     }
1537   if (duplicate_outer_loop)
1538     {
1539       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1540       if (loop_preheader_edge (scalar_loop)->dest_idx
1541           != loop_preheader_edge (new_inner_loop)->dest_idx)
1542         {
1543           basic_block swap_bb = new_inner_loop->header;
1544           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1545           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1546           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1547           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1548         }
1549     }
1550
1551   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1552
1553   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1554   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1555     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1556
1557   /* Rename the exit uses.  */
1558   for (edge exit : get_loop_exit_edges (new_loop))
1559     for (auto gsi = gsi_start_phis (exit->dest);
1560          !gsi_end_p (gsi); gsi_next (&gsi))
1561       {
1562         tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1563         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1564         if (MAY_HAVE_DEBUG_BIND_STMTS)
1565           adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1566       }
1567
1568   auto loop_exits = get_loop_exit_edges (loop);
1569   bool multiple_exits_p = loop_exits.length () > 1;
1570   auto_vec<basic_block> doms;
1571   class loop *update_loop = NULL;
1572
1573   if (at_exit) /* Add the loop copy at exit.  */
1574     {
1575       if (scalar_loop != loop && new_exit->dest != exit_dest)
1576         {
1577           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1578           flush_pending_stmts (new_exit);
1579         }
1580
1581       bool need_virtual_phi = get_virtual_phi (loop->header);
1582
1583       /* For the main loop exit preserve the LC PHI nodes.  For vectorization
1584          we need them to continue or finalize reductions.  Since we do not
1585          copy the loop exit blocks we have to materialize PHIs at the
1586          new destination before redirecting edges.  */
1587       for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1588            !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1589         {
1590           tree res = gimple_phi_result (*gsi_from);
1591           create_phi_node (copy_ssa_name (res), new_preheader);
1592         }
1593       edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1594       gcc_assert (e == loop_exit);
1595       flush_pending_stmts (loop_exit);
1596       set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1597
1598       bool multiple_exits_p = loop_exits.length () > 1;
1599       basic_block main_loop_exit_block = new_preheader;
1600       basic_block alt_loop_exit_block = NULL;
1601       /* Create the CFG for multiple exits.
1602                    | loop_exit               | alt1   | altN
1603                    v                         v   ...  v
1604             main_loop_exit_block:       alt_loop_exit_block:
1605                    |                      /
1606                    v                     v
1607             new_preheader:
1608          where in the new preheader we need merge PHIs for
1609          the continuation values into the epilogue header.
1610          Do not bother with exit PHIs for the early exits but
1611          their live virtual operand.  We'll fix up things below.  */
1612       if (multiple_exits_p)
1613         {
1614           edge loop_e = single_succ_edge (new_preheader);
1615           new_preheader = split_edge (loop_e);
1616
1617           gphi *vphi = NULL;
1618           alt_loop_exit_block = new_preheader;
1619           for (auto exit : loop_exits)
1620             if (exit != loop_exit)
1621               {
1622                 tree vphi_def = NULL_TREE;
1623                 if (gphi *evphi = get_virtual_phi (exit->dest))
1624                   vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1625                 edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1626                 gcc_assert (res == exit);
1627                 redirect_edge_var_map_clear (exit);
1628                 if (alt_loop_exit_block == new_preheader)
1629                   alt_loop_exit_block = split_edge (exit);
1630                 if (!need_virtual_phi)
1631                   continue;
1632                 if (vphi_def)
1633                   {
1634                     if (!vphi)
1635                       vphi = create_phi_node (copy_ssa_name (vphi_def),
1636                                               alt_loop_exit_block);
1637                     else
1638                       /* Edge redirection might re-allocate the PHI node
1639                          so we have to rediscover it.  */
1640                       vphi = get_virtual_phi (alt_loop_exit_block);
1641                     add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1642                   }
1643               }
1644
1645           set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1646                                    loop->header);
1647         }
1648
1649       /* Adjust the epilog loop PHI entry values to continue iteration.
1650          This adds remaining necessary LC PHI nodes to the main exit
1651          and creates merge PHIs when we have multiple exits with
1652          their appropriate continuation.  */
1653       if (flow_loops)
1654         {
1655           edge loop_entry = single_succ_edge (new_preheader);
1656           bool peeled_iters = single_pred (loop->latch) != loop_exit->src;
1657
1658           /* Record the new SSA names in the cache so that we can skip
1659              materializing them again when we fill in the rest of the LC SSA
1660              variables.  */
1661           hash_map <tree, tree> new_phi_args;
1662           for (auto psi = gsi_start_phis (main_loop_exit_block);
1663                !gsi_end_p (psi); gsi_next (&psi))
1664             {
1665               gphi *phi = *psi;
1666               tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1667               if (TREE_CODE (new_arg) != SSA_NAME)
1668                 continue;
1669
1670               /* If the loop doesn't have a virtual def then only possibly keep
1671                  the epilog LC PHI for it and avoid creating new defs.  */
1672               if (virtual_operand_p (new_arg) && !need_virtual_phi)
1673                 {
1674                   auto gsi = gsi_for_stmt (phi);
1675                   remove_phi_node (&gsi, true);
1676                   continue;
1677                 }
1678
1679               /* If we decided not to remove the PHI node we should also not
1680                  rematerialize it later on.  */
1681               new_phi_args.put (new_arg, gimple_phi_result (phi));
1682             }
1683
1684           /* Create the merge PHI nodes in new_preheader and populate the
1685              arguments for the exits.  */
1686           if (multiple_exits_p)
1687             {
1688               for (auto gsi_from = gsi_start_phis (loop->header),
1689                    gsi_to = gsi_start_phis (new_loop->header);
1690                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1691                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1692                 {
1693                   gimple *from_phi = gsi_stmt (gsi_from);
1694                   gimple *to_phi = gsi_stmt (gsi_to);
1695
1696                   /* When the vector loop is peeled then we need to use the
1697                      value at start of the loop, otherwise the main loop exit
1698                      should use the final iter value.  */
1699                   tree new_arg;
1700                   if (peeled_iters)
1701                     new_arg = gimple_phi_result (from_phi);
1702                   else
1703                     new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1704                                                      loop_latch_edge (loop));
1705
1706                   /* Check if we've already created a new phi node during edge
1707                      redirection and re-use it if so.  Otherwise create a
1708                      LC PHI node to feed the merge PHI.  */
1709                   tree *res;
1710                   if (virtual_operand_p (new_arg))
1711                     {
1712                       /* Use the existing virtual LC SSA from exit block.  */
1713                       gphi *vphi = get_virtual_phi (main_loop_exit_block);
1714                       /* ???  When the exit yields to a path without
1715                          any virtual use we can miss a LC PHI for the
1716                          live virtual operand.  Simply choosing the
1717                          one live at the start of the loop header isn't
1718                          correct, but we should get here only with
1719                          early-exit vectorization which will move all
1720                          defs after the main exit, so leave a temporarily
1721                          wrong virtual operand in place.  This happens
1722                          for gcc.dg/pr113659.c.  */
1723                       if (vphi)
1724                         new_arg = gimple_phi_result (vphi);
1725                       else
1726                         new_arg = gimple_phi_result (from_phi);
1727                     }
1728                   else if ((res = new_phi_args.get (new_arg)))
1729                     new_arg = *res;
1730                   else
1731                     {
1732                       /* Create the LC PHI node for the exit.  */
1733                       tree new_def = copy_ssa_name (new_arg);
1734                       gphi *lc_phi
1735                           = create_phi_node (new_def, main_loop_exit_block);
1736                       SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1737                       new_arg = new_def;
1738                     }
1739
1740                   /* Create the PHI node in the merge block merging the
1741                      main and early exit values.  */
1742                   tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1743                   gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1744                   edge main_e = single_succ_edge (main_loop_exit_block);
1745                   SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1746
1747                   /* And adjust the epilog entry value.  */
1748                   adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1749                 }
1750
1751               /* After creating the merge PHIs handle the early exits those
1752                  should use the values at the start of the loop.  */
1753               for (auto gsi_from = gsi_start_phis (loop->header),
1754                    gsi_to = gsi_start_phis (new_preheader);
1755                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1756                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1757                 {
1758                   gimple *from_phi = gsi_stmt (gsi_from);
1759                   gimple *to_phi = gsi_stmt (gsi_to);
1760
1761                   /* Now update the virtual PHI nodes with the right value.  */
1762                   tree alt_arg = gimple_phi_result (from_phi);
1763                   if (virtual_operand_p (alt_arg))
1764                     {
1765                       gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1766                       /* ???  When the exit yields to a path without
1767                          any virtual use we can miss a LC PHI for the
1768                          live virtual operand.  Simply choosing the
1769                          one live at the start of the loop header isn't
1770                          correct, but we should get here only with
1771                          early-exit vectorization which will move all
1772                          defs after the main exit, so leave a temporarily
1773                          wrong virtual operand in place.  This happens
1774                          for gcc.c-torture/execute/20150611-1.c  */
1775                       if (vphi)
1776                         alt_arg = gimple_phi_result (vphi);
1777                     }
1778                   /* For other live args we didn't create LC PHI nodes.
1779                      Do so here.  */
1780                   else
1781                     {
1782                       tree alt_def = copy_ssa_name (alt_arg);
1783                       gphi *lc_phi
1784                         = create_phi_node (alt_def, alt_loop_exit_block);
1785                       for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1786                            ++i)
1787                         SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1788                       alt_arg = alt_def;
1789                     }
1790                   edge alt_e = single_succ_edge (alt_loop_exit_block);
1791                   SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1792                 }
1793             }
1794           /* For the single exit case only create the missing LC PHI nodes
1795              for the continuation of the loop IVs that are not also already
1796              reductions and thus had LC PHI nodes on the exit already.  */
1797           else
1798             {
1799               for (auto gsi_from = gsi_start_phis (loop->header),
1800                    gsi_to = gsi_start_phis (new_loop->header);
1801                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1802                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1803                 {
1804                   gimple *from_phi = gsi_stmt (gsi_from);
1805                   gimple *to_phi = gsi_stmt (gsi_to);
1806                   tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1807                                                         loop_latch_edge (loop));
1808
1809                   /* Check if we've already created a new phi node during edge
1810                      redirection.  If we have, only propagate the value
1811                      downwards.  */
1812                   if (tree *res = new_phi_args.get (new_arg))
1813                     {
1814                       adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1815                       continue;
1816                     }
1817
1818                   tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1819                   gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1820                   SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1821                   adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1822                 }
1823             }
1824         }
1825
1826       if (was_imm_dom || duplicate_outer_loop)
1827         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1828
1829       /* And remove the non-necessary forwarder again.  Keep the other
1830          one so we have a proper pre-header for the loop at the exit edge.  */
1831       redirect_edge_pred (single_succ_edge (preheader),
1832                           single_pred (preheader));
1833       delete_basic_block (preheader);
1834       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1835                                loop_preheader_edge (scalar_loop)->src);
1836
1837       /* Finally after wiring the new epilogue we need to update its main exit
1838          to the original function exit we recorded.  Other exits are already
1839          correct.  */
1840       if (multiple_exits_p)
1841         {
1842           update_loop = new_loop;
1843           doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1844           for (unsigned i = 0; i < doms.length (); ++i)
1845             if (flow_bb_inside_loop_p (loop, doms[i]))
1846               doms.unordered_remove (i);
1847         }
1848     }
1849   else /* Add the copy at entry.  */
1850     {
1851       /* Copy the current loop LC PHI nodes between the original loop exit
1852          block and the new loop header.  This allows us to later split the
1853          preheader block and still find the right LC nodes.  */
1854       if (flow_loops)
1855         for (auto gsi_from = gsi_start_phis (new_loop->header),
1856              gsi_to = gsi_start_phis (loop->header);
1857              !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1858              gsi_next (&gsi_from), gsi_next (&gsi_to))
1859           {
1860             gimple *from_phi = gsi_stmt (gsi_from);
1861             gimple *to_phi = gsi_stmt (gsi_to);
1862             tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1863                                                   loop_latch_edge (new_loop));
1864             adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1865                                         new_arg);
1866           }
1867
1868       if (scalar_loop != loop)
1869         {
1870           /* Remove the non-necessary forwarder of scalar_loop again.  */
1871           redirect_edge_pred (single_succ_edge (preheader),
1872                               single_pred (preheader));
1873           delete_basic_block (preheader);
1874           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1875                                    loop_preheader_edge (scalar_loop)->src);
1876           preheader = split_edge (loop_preheader_edge (loop));
1877           entry_e = single_pred_edge (preheader);
1878         }
1879
1880       redirect_edge_and_branch_force (entry_e, new_preheader);
1881       flush_pending_stmts (entry_e);
1882       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1883
1884       redirect_edge_and_branch_force (new_exit, preheader);
1885       flush_pending_stmts (new_exit);
1886       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1887
1888       /* And remove the non-necessary forwarder again.  Keep the other
1889          one so we have a proper pre-header for the loop at the exit edge.  */
1890       redirect_edge_pred (single_succ_edge (new_preheader),
1891                           single_pred (new_preheader));
1892       delete_basic_block (new_preheader);
1893       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1894                                loop_preheader_edge (new_loop)->src);
1895
1896       if (multiple_exits_p)
1897         update_loop = loop;
1898     }
1899
1900   if (multiple_exits_p)
1901     {
1902       for (edge e : get_loop_exit_edges (update_loop))
1903         {
1904           edge ex;
1905           edge_iterator ei;
1906           FOR_EACH_EDGE (ex, ei, e->dest->succs)
1907             {
1908               /* Find the first non-fallthrough block as fall-throughs can't
1909                  dominate other blocks.  */
1910               if (single_succ_p (ex->dest))
1911                 {
1912                   doms.safe_push (ex->dest);
1913                   ex = single_succ_edge (ex->dest);
1914                 }
1915               doms.safe_push (ex->dest);
1916             }
1917           doms.safe_push (e->dest);
1918         }
1919
1920       iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1921       if (updated_doms)
1922         updated_doms->safe_splice (doms);
1923     }
1924
1925   free (new_bbs);
1926   free (bbs);
1927
1928   checking_verify_dominators (CDI_DOMINATORS);
1929
1930   return new_loop;
1931 }
1932
1933
1934 /* Given the condition expression COND, put it as the last statement of
1935    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1936    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1937    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1938    new edge as irreducible if IRREDUCIBLE_P is true.  */
1939
1940 static edge
1941 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1942                        basic_block guard_to, basic_block dom_bb,
1943                        profile_probability probability, bool irreducible_p)
1944 {
1945   gimple_stmt_iterator gsi;
1946   edge new_e, enter_e;
1947   gcond *cond_stmt;
1948   gimple_seq gimplify_stmt_list = NULL;
1949
1950   enter_e = EDGE_SUCC (guard_bb, 0);
1951   enter_e->flags &= ~EDGE_FALLTHRU;
1952   enter_e->flags |= EDGE_FALSE_VALUE;
1953   gsi = gsi_last_bb (guard_bb);
1954
1955   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1956                                  is_gimple_condexpr_for_cond, NULL_TREE);
1957   if (gimplify_stmt_list)
1958     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1959
1960   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1961   gsi = gsi_last_bb (guard_bb);
1962   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1963
1964   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1965   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1966
1967   new_e->probability = probability;
1968   if (irreducible_p)
1969     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1970
1971   enter_e->probability = probability.invert ();
1972   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1973
1974   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1975   if (enter_e->dest->loop_father->header == enter_e->dest)
1976     split_edge (enter_e);
1977
1978   return new_e;
1979 }
1980
1981
1982 /* This function verifies that the following restrictions apply to LOOP:
1983    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1984        for innermost loop and 5 basic blocks for outer-loop.
1985    (2) it is single entry, single exit
1986    (3) its exit condition is the last stmt in the header
1987    (4) E is the entry/exit edge of LOOP.
1988  */
1989
1990 bool
1991 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
1992                              const_edge e)
1993 {
1994   edge entry_e = loop_preheader_edge (loop);
1995   gcond *orig_cond = get_loop_exit_condition (exit_e);
1996   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1997
1998   /* All loops have an outer scope; the only case loop->outer is NULL is for
1999      the function itself.  */
2000   if (!loop_outer (loop)
2001       || !empty_block_p (loop->latch)
2002       || !exit_e
2003       /* Verify that new loop exit condition can be trivially modified.  */
2004       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
2005       || (e != exit_e && e != entry_e))
2006     return false;
2007
2008   basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2009   get_loop_body_with_size (loop, bbs, loop->num_nodes);
2010   bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2011   free (bbs);
2012   return ret;
2013 }
2014
2015 /* Function find_loop_location.
2016
2017    Extract the location of the loop in the source code.
2018    If the loop is not well formed for vectorization, an estimated
2019    location is calculated.
2020    Return the loop location if succeed and NULL if not.  */
2021
2022 dump_user_location_t
2023 find_loop_location (class loop *loop)
2024 {
2025   gimple *stmt = NULL;
2026   basic_block bb;
2027   gimple_stmt_iterator si;
2028
2029   if (!loop)
2030     return dump_user_location_t ();
2031
2032   /* For the root of the loop tree return the function location.  */
2033   if (!loop_outer (loop))
2034     return dump_user_location_t::from_function_decl (cfun->decl);
2035
2036   if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2037     {
2038       /* We only care about the loop location, so use any exit with location
2039          information.  */
2040       for (edge e : get_loop_exit_edges (loop))
2041         {
2042           stmt = get_loop_exit_condition (e);
2043
2044           if (stmt
2045               && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2046             return stmt;
2047         }
2048     }
2049
2050   /* If we got here the loop is probably not "well formed",
2051      try to estimate the loop location */
2052
2053   if (!loop->header)
2054     return dump_user_location_t ();
2055
2056   bb = loop->header;
2057
2058   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2059     {
2060       stmt = gsi_stmt (si);
2061       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2062         return stmt;
2063     }
2064
2065   return dump_user_location_t ();
2066 }
2067
2068 /* Return true if the phi described by STMT_INFO defines an IV of the
2069    loop to be vectorized.  */
2070
2071 static bool
2072 iv_phi_p (stmt_vec_info stmt_info)
2073 {
2074   gphi *phi = as_a <gphi *> (stmt_info->stmt);
2075   if (virtual_operand_p (PHI_RESULT (phi)))
2076     return false;
2077
2078   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2079       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2080     return false;
2081
2082   return true;
2083 }
2084
2085 /* Return true if vectorizer can peel for nonlinear iv.  */
2086 static bool
2087 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2088                               stmt_vec_info stmt_info)
2089 {
2090   enum vect_induction_op_type induction_type
2091     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2092   tree niters_skip;
2093   /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2094      if niters or vf is unkown:
2095      For shift, when shift mount >= precision, there would be UD.
2096      For mult, don't known how to generate
2097      init_expr * pow (step, niters) for variable niters.
2098      For neg, it should be ok, since niters of vectorized main loop
2099      will always be multiple of 2.  */
2100   if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2101        || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
2102       && induction_type != vect_step_op_neg)
2103     {
2104       if (dump_enabled_p ())
2105         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2106                          "Peeling for epilogue is not supported"
2107                          " for nonlinear induction except neg"
2108                          " when iteration count is unknown.\n");
2109       return false;
2110     }
2111
2112   /* Avoid compile time hog on vect_peel_nonlinear_iv_init.  */
2113   if (induction_type == vect_step_op_mul)
2114     {
2115       tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2116       tree type = TREE_TYPE (step_expr);
2117
2118       if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2119           && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2120         {
2121           if (dump_enabled_p ())
2122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123                              "Avoid compile time hog on"
2124                              " vect_peel_nonlinear_iv_init"
2125                              " for nonlinear induction vec_step_op_mul"
2126                              " when iteration count is too big.\n");
2127           return false;
2128         }
2129     }
2130
2131   /* Also doens't support peel for neg when niter is variable.
2132      ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
2133   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2134   if ((niters_skip != NULL_TREE
2135        && (TREE_CODE (niters_skip) != INTEGER_CST
2136            || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2137       || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2138           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2139     {
2140       if (dump_enabled_p ())
2141         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2142                          "Peeling for alignement is not supported"
2143                          " for nonlinear induction when niters_skip"
2144                          " is not constant.\n");
2145       return false;
2146     }
2147
2148   /* We can't support partial vectors and early breaks with an induction
2149      type other than add or neg since we require the epilog and can't
2150      perform the peeling.  The below condition mirrors that of
2151      vect_gen_vector_loop_niters  where niters_vector_mult_vf_var then sets
2152      step_vector to VF rather than 1.  This is what creates the nonlinear
2153      IV.  PR113163.  */
2154   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2155       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2156       && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2157       && induction_type != vect_step_op_neg)
2158     {
2159       if (dump_enabled_p ())
2160         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2161                          "Peeling for epilogue is not supported"
2162                          " for nonlinear induction except neg"
2163                          " when VF is known and early breaks.\n");
2164       return false;
2165     }
2166
2167   return true;
2168 }
2169
2170 /* Function vect_can_advance_ivs_p
2171
2172    In case the number of iterations that LOOP iterates is unknown at compile
2173    time, an epilog loop will be generated, and the loop induction variables
2174    (IVs) will be "advanced" to the value they are supposed to take just before
2175    the epilog loop.  Here we check that the access function of the loop IVs
2176    and the expression that represents the loop bound are simple enough.
2177    These restrictions will be relaxed in the future.  */
2178
2179 bool
2180 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2181 {
2182   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2183   basic_block bb = loop->header;
2184   gphi_iterator gsi;
2185
2186   /* Analyze phi functions of the loop header.  */
2187
2188   if (dump_enabled_p ())
2189     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2190   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2191     {
2192       tree evolution_part;
2193       enum vect_induction_op_type induction_type;
2194
2195       gphi *phi = gsi.phi ();
2196       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2197       if (dump_enabled_p ())
2198         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2199                          phi_info->stmt);
2200
2201       /* Skip virtual phi's. The data dependences that are associated with
2202          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2203
2204          Skip reduction phis.  */
2205       if (!iv_phi_p (phi_info))
2206         {
2207           if (dump_enabled_p ())
2208             dump_printf_loc (MSG_NOTE, vect_location,
2209                              "reduc or virtual phi. skip.\n");
2210           continue;
2211         }
2212
2213       induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2214       if (induction_type != vect_step_op_add)
2215         {
2216           if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2217             return false;
2218
2219           continue;
2220         }
2221
2222       /* Analyze the evolution function.  */
2223
2224       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2225       if (evolution_part == NULL_TREE)
2226         {
2227           if (dump_enabled_p ())
2228             dump_printf (MSG_MISSED_OPTIMIZATION,
2229                          "No access function or evolution.\n");
2230           return false;
2231         }
2232
2233       /* FORNOW: We do not transform initial conditions of IVs
2234          which evolution functions are not invariants in the loop.  */
2235
2236       if (!expr_invariant_in_loop_p (loop, evolution_part))
2237         {
2238           if (dump_enabled_p ())
2239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240                              "evolution not invariant in loop.\n");
2241           return false;
2242         }
2243
2244       /* FORNOW: We do not transform initial conditions of IVs
2245          which evolution functions are a polynomial of degree >= 2.  */
2246
2247       if (tree_is_chrec (evolution_part))
2248         {
2249           if (dump_enabled_p ())
2250             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251                              "evolution is chrec.\n");
2252           return false;
2253         }
2254     }
2255
2256   return true;
2257 }
2258
2259
2260 /*   Function vect_update_ivs_after_vectorizer.
2261
2262      "Advance" the induction variables of LOOP to the value they should take
2263      after the execution of LOOP.  This is currently necessary because the
2264      vectorizer does not handle induction variables that are used after the
2265      loop.  Such a situation occurs when the last iterations of LOOP are
2266      peeled, because:
2267      1. We introduced new uses after LOOP for IVs that were not originally used
2268         after LOOP: the IVs of LOOP are now used by an epilog loop.
2269      2. LOOP is going to be vectorized; this means that it will iterate N/VF
2270         times, whereas the loop IVs should be bumped N times.
2271
2272      Input:
2273      - LOOP - a loop that is going to be vectorized. The last few iterations
2274               of LOOP were peeled.
2275      - NITERS - the number of iterations that LOOP executes (before it is
2276                 vectorized). i.e, the number of times the ivs should be bumped.
2277      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2278                   coming out from LOOP on which there are uses of the LOOP ivs
2279                   (this is the path from LOOP->exit to epilog_loop->preheader).
2280
2281                   The new definitions of the ivs are placed in LOOP->exit.
2282                   The phi args associated with the edge UPDATE_E in the bb
2283                   UPDATE_E->dest are updated accordingly.
2284
2285      Assumption 1: Like the rest of the vectorizer, this function assumes
2286      a single loop exit that has a single predecessor.
2287
2288      Assumption 2: The phi nodes in the LOOP header and in update_bb are
2289      organized in the same order.
2290
2291      Assumption 3: The access function of the ivs is simple enough (see
2292      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
2293
2294      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2295      coming out of LOOP on which the ivs of LOOP are used (this is the path
2296      that leads to the epilog loop; other paths skip the epilog loop).  This
2297      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2298      needs to have its phis updated.
2299  */
2300
2301 static void
2302 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2303                                   tree niters, edge update_e)
2304 {
2305   gphi_iterator gsi, gsi1;
2306   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2307   basic_block update_bb = update_e->dest;
2308   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2309   gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2310
2311   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2312        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2313        gsi_next (&gsi), gsi_next (&gsi1))
2314     {
2315       tree init_expr;
2316       tree step_expr, off;
2317       tree type;
2318       tree var, ni, ni_name;
2319
2320       gphi *phi = gsi.phi ();
2321       gphi *phi1 = gsi1.phi ();
2322       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2323       if (dump_enabled_p ())
2324         dump_printf_loc (MSG_NOTE, vect_location,
2325                          "vect_update_ivs_after_vectorizer: phi: %G",
2326                          (gimple *) phi);
2327
2328       /* Skip reduction and virtual phis.  */
2329       if (!iv_phi_p (phi_info))
2330         {
2331           if (dump_enabled_p ())
2332             dump_printf_loc (MSG_NOTE, vect_location,
2333                              "reduc or virtual phi. skip.\n");
2334           continue;
2335         }
2336
2337       type = TREE_TYPE (gimple_phi_result (phi));
2338       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2339       step_expr = unshare_expr (step_expr);
2340
2341       /* FORNOW: We do not support IVs whose evolution function is a polynomial
2342          of degree >= 2 or exponential.  */
2343       gcc_assert (!tree_is_chrec (step_expr));
2344
2345       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2346       gimple_seq stmts = NULL;
2347       enum vect_induction_op_type induction_type
2348         = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2349
2350       if (induction_type == vect_step_op_add)
2351         {
2352           tree stype = TREE_TYPE (step_expr);
2353           off = fold_build2 (MULT_EXPR, stype,
2354                                fold_convert (stype, niters), step_expr);
2355
2356           if (POINTER_TYPE_P (type))
2357             ni = fold_build_pointer_plus (init_expr, off);
2358           else
2359             ni = fold_convert (type,
2360                                fold_build2 (PLUS_EXPR, stype,
2361                                             fold_convert (stype, init_expr),
2362                                             off));
2363         }
2364       /* Don't bother call vect_peel_nonlinear_iv_init.  */
2365       else if (induction_type == vect_step_op_neg)
2366         ni = init_expr;
2367       else
2368         ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2369                                           niters, step_expr,
2370                                           induction_type);
2371
2372       var = create_tmp_var (type, "tmp");
2373
2374       gimple_seq new_stmts = NULL;
2375       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2376
2377       /* Exit_bb shouldn't be empty.  */
2378       if (!gsi_end_p (last_gsi))
2379         {
2380           gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2381           gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2382         }
2383       else
2384         {
2385           gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2386           gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2387         }
2388
2389       /* Fix phi expressions in the successor bb.  */
2390       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2391     }
2392 }
2393
2394 /* Return a gimple value containing the misalignment (measured in vector
2395    elements) for the loop described by LOOP_VINFO, i.e. how many elements
2396    it is away from a perfectly aligned address.  Add any new statements
2397    to SEQ.  */
2398
2399 static tree
2400 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2401 {
2402   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2403   stmt_vec_info stmt_info = dr_info->stmt;
2404   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2405
2406   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2407   unsigned HOST_WIDE_INT target_align_c;
2408   tree target_align_minus_1;
2409
2410   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2411                                         size_zero_node) < 0;
2412   tree offset = (negative
2413                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2414                              * TREE_INT_CST_LOW
2415                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2416                  : size_zero_node);
2417   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2418                                                           stmt_info, seq,
2419                                                           offset);
2420   tree type = unsigned_type_for (TREE_TYPE (start_addr));
2421   if (target_align.is_constant (&target_align_c))
2422     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2423   else
2424     {
2425       tree vla = build_int_cst (type, target_align);
2426       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2427                                     fold_build2 (MINUS_EXPR, type,
2428                                                  build_int_cst (type, 0), vla));
2429       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2430                                           build_int_cst (type, 1));
2431     }
2432
2433   HOST_WIDE_INT elem_size
2434     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2435   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2436
2437   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
2438   tree int_start_addr = fold_convert (type, start_addr);
2439   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2440                                         target_align_minus_1);
2441
2442   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
2443   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2444                                         elem_size_log);
2445
2446   return misalign_in_elems;
2447 }
2448
2449 /* Function vect_gen_prolog_loop_niters
2450
2451    Generate the number of iterations which should be peeled as prolog for the
2452    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
2453    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2454    As a result, after the execution of this loop, the data reference DR will
2455    refer to an aligned location.  The following computation is generated:
2456
2457    If the misalignment of DR is known at compile time:
2458      addr_mis = int mis = DR_MISALIGNMENT (dr);
2459    Else, compute address misalignment in bytes:
2460      addr_mis = addr & (target_align - 1)
2461
2462    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2463
2464    (elem_size = element type size; an element is the scalar element whose type
2465    is the inner type of the vectype)
2466
2467    The computations will be emitted at the end of BB.  We also compute and
2468    store upper bound (included) of the result in BOUND.
2469
2470    When the step of the data-ref in the loop is not 1 (as in interleaved data
2471    and SLP), the number of iterations of the prolog must be divided by the step
2472    (which is equal to the size of interleaved group).
2473
2474    The above formulas assume that VF == number of elements in the vector. This
2475    may not hold when there are multiple-types in the loop.
2476    In this case, for some data-references in the loop the VF does not represent
2477    the number of elements that fit in the vector.  Therefore, instead of VF we
2478    use TYPE_VECTOR_SUBPARTS.  */
2479
2480 static tree
2481 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2482                              basic_block bb, int *bound)
2483 {
2484   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2485   tree var;
2486   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2487   gimple_seq stmts = NULL, new_stmts = NULL;
2488   tree iters, iters_name;
2489   stmt_vec_info stmt_info = dr_info->stmt;
2490   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2491   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2492
2493   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2494     {
2495       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2496
2497       if (dump_enabled_p ())
2498         dump_printf_loc (MSG_NOTE, vect_location,
2499                          "known peeling = %d.\n", npeel);
2500
2501       iters = build_int_cst (niters_type, npeel);
2502       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2503     }
2504   else
2505     {
2506       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2507       tree type = TREE_TYPE (misalign_in_elems);
2508       HOST_WIDE_INT elem_size
2509         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2510       /* We only do prolog peeling if the target alignment is known at compile
2511          time.  */
2512       poly_uint64 align_in_elems =
2513         exact_div (target_align, elem_size);
2514       tree align_in_elems_minus_1 =
2515         build_int_cst (type, align_in_elems - 1);
2516       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2517
2518       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
2519                                  & (align_in_elems - 1)).  */
2520       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2521                                             size_zero_node) < 0;
2522       if (negative)
2523         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2524                              align_in_elems_tree);
2525       else
2526         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2527                              misalign_in_elems);
2528       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2529       iters = fold_convert (niters_type, iters);
2530       unsigned HOST_WIDE_INT align_in_elems_c;
2531       if (align_in_elems.is_constant (&align_in_elems_c))
2532         *bound = align_in_elems_c - 1;
2533       else
2534         *bound = -1;
2535     }
2536
2537   if (dump_enabled_p ())
2538     dump_printf_loc (MSG_NOTE, vect_location,
2539                      "niters for prolog loop: %T\n", iters);
2540
2541   var = create_tmp_var (niters_type, "prolog_loop_niters");
2542   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2543
2544   if (new_stmts)
2545     gimple_seq_add_seq (&stmts, new_stmts);
2546   if (stmts)
2547     {
2548       gcc_assert (single_succ_p (bb));
2549       gimple_stmt_iterator gsi = gsi_last_bb (bb);
2550       if (gsi_end_p (gsi))
2551         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2552       else
2553         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2554     }
2555   return iters_name;
2556 }
2557
2558
2559 /* Function vect_update_init_of_dr
2560
2561    If CODE is PLUS, the vector loop starts NITERS iterations after the
2562    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2563    iterations before the scalar one (using masking to skip inactive
2564    elements).  This function updates the information recorded in DR to
2565    account for the difference.  Specifically, it updates the OFFSET
2566    field of DR_INFO.  */
2567
2568 static void
2569 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2570 {
2571   struct data_reference *dr = dr_info->dr;
2572   tree offset = dr_info->offset;
2573   if (!offset)
2574     offset = build_zero_cst (sizetype);
2575
2576   niters = fold_build2 (MULT_EXPR, sizetype,
2577                         fold_convert (sizetype, niters),
2578                         fold_convert (sizetype, DR_STEP (dr)));
2579   offset = fold_build2 (code, sizetype,
2580                         fold_convert (sizetype, offset), niters);
2581   dr_info->offset = offset;
2582 }
2583
2584
2585 /* Function vect_update_inits_of_drs
2586
2587    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
2588    CODE and NITERS are as for vect_update_inits_of_dr.  */
2589
2590 void
2591 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2592                           tree_code code)
2593 {
2594   unsigned int i;
2595   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2596   struct data_reference *dr;
2597
2598   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2599
2600   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
2601      here, but since we might use these niters to update the epilogues niters
2602      and data references we can't insert them here as this definition might not
2603      always dominate its uses.  */
2604   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2605     niters = fold_convert (sizetype, niters);
2606
2607   FOR_EACH_VEC_ELT (datarefs, i, dr)
2608     {
2609       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2610       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2611           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2612         vect_update_init_of_dr (dr_info, niters, code);
2613     }
2614 }
2615
2616 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2617    by masking.  This involves calculating the number of iterations to
2618    be peeled and then aligning all memory references appropriately.  */
2619
2620 void
2621 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2622 {
2623   tree misalign_in_elems;
2624   tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2625
2626   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2627
2628   /* From the information recorded in LOOP_VINFO get the number of iterations
2629      that need to be skipped via masking.  */
2630   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2631     {
2632       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2633                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2634       misalign_in_elems = build_int_cst (type, misalign);
2635     }
2636   else
2637     {
2638       gimple_seq seq1 = NULL, seq2 = NULL;
2639       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2640       misalign_in_elems = fold_convert (type, misalign_in_elems);
2641       misalign_in_elems = force_gimple_operand (misalign_in_elems,
2642                                                 &seq2, true, NULL_TREE);
2643       gimple_seq_add_seq (&seq1, seq2);
2644       if (seq1)
2645         {
2646           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2647           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2648           gcc_assert (!new_bb);
2649         }
2650     }
2651
2652   if (dump_enabled_p ())
2653     dump_printf_loc (MSG_NOTE, vect_location,
2654                      "misalignment for fully-masked loop: %T\n",
2655                      misalign_in_elems);
2656
2657   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2658
2659   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2660 }
2661
2662 /* This function builds ni_name = number of iterations.  Statements
2663    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
2664    it to TRUE if new ssa_var is generated.  */
2665
2666 tree
2667 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2668 {
2669   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2670   if (TREE_CODE (ni) == INTEGER_CST)
2671     return ni;
2672   else
2673     {
2674       tree ni_name, var;
2675       gimple_seq stmts = NULL;
2676       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2677
2678       var = create_tmp_var (TREE_TYPE (ni), "niters");
2679       ni_name = force_gimple_operand (ni, &stmts, false, var);
2680       if (stmts)
2681         {
2682           gsi_insert_seq_on_edge_immediate (pe, stmts);
2683           if (new_var_p != NULL)
2684             *new_var_p = true;
2685         }
2686
2687       return ni_name;
2688     }
2689 }
2690
2691 /* Calculate the number of iterations above which vectorized loop will be
2692    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
2693    of prolog loop.  If it's integer const, the integer number is also passed
2694    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
2695    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
2696    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
2697    threshold below which the scalar (rather than vectorized) loop will be
2698    executed.  This function stores the upper bound (inclusive) of the result
2699    in BOUND_SCALAR.  */
2700
2701 static tree
2702 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2703                              int bound_prolog, poly_int64 bound_epilog, int th,
2704                              poly_uint64 *bound_scalar,
2705                              bool check_profitability)
2706 {
2707   tree type = TREE_TYPE (niters_prolog);
2708   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2709                              build_int_cst (type, bound_epilog));
2710
2711   *bound_scalar = bound_prolog + bound_epilog;
2712   if (check_profitability)
2713     {
2714       /* TH indicates the minimum niters of vectorized loop, while we
2715          compute the maximum niters of scalar loop.  */
2716       th--;
2717       /* Peeling for constant times.  */
2718       if (int_niters_prolog >= 0)
2719         {
2720           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2721           return build_int_cst (type, *bound_scalar);
2722         }
2723       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
2724          and BOUND_EPILOG are inclusive upper bounds.  */
2725       if (known_ge (th, bound_prolog + bound_epilog))
2726         {
2727           *bound_scalar = th;
2728           return build_int_cst (type, th);
2729         }
2730       /* Need to do runtime comparison.  */
2731       else if (maybe_gt (th, bound_epilog))
2732         {
2733           *bound_scalar = upper_bound (*bound_scalar, th);
2734           return fold_build2 (MAX_EXPR, type,
2735                               build_int_cst (type, th), niters);
2736         }
2737     }
2738   return niters;
2739 }
2740
2741 /* NITERS is the number of times that the original scalar loop executes
2742    after peeling.  Work out the maximum number of iterations N that can
2743    be handled by the vectorized form of the loop and then either:
2744
2745    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2746
2747         niters_vector = N
2748
2749    b) set *STEP_VECTOR_PTR to one and generate:
2750
2751         niters_vector = N / vf
2752
2753    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2754    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
2755    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
2756
2757 void
2758 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2759                              tree *niters_vector_ptr, tree *step_vector_ptr,
2760                              bool niters_no_overflow)
2761 {
2762   tree ni_minus_gap, var;
2763   tree niters_vector, step_vector, type = TREE_TYPE (niters);
2764   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2765   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2766   tree log_vf = NULL_TREE;
2767
2768   /* If epilogue loop is required because of data accesses with gaps, we
2769      subtract one iteration from the total number of iterations here for
2770      correct calculation of RATIO.  */
2771   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2772     {
2773       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2774                                   build_one_cst (type));
2775       if (!is_gimple_val (ni_minus_gap))
2776         {
2777           var = create_tmp_var (type, "ni_gap");
2778           gimple *stmts = NULL;
2779           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2780                                                true, var);
2781           gsi_insert_seq_on_edge_immediate (pe, stmts);
2782         }
2783     }
2784   else
2785     ni_minus_gap = niters;
2786
2787   /* To silence some unexpected warnings, simply initialize to 0. */
2788   unsigned HOST_WIDE_INT const_vf = 0;
2789   if (vf.is_constant (&const_vf)
2790       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2791     {
2792       /* Create: niters >> log2(vf) */
2793       /* If it's known that niters == number of latch executions + 1 doesn't
2794          overflow, we can generate niters >> log2(vf); otherwise we generate
2795          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2796          will be at least one.  */
2797       log_vf = build_int_cst (type, exact_log2 (const_vf));
2798       if (niters_no_overflow)
2799         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2800       else
2801         niters_vector
2802           = fold_build2 (PLUS_EXPR, type,
2803                          fold_build2 (RSHIFT_EXPR, type,
2804                                       fold_build2 (MINUS_EXPR, type,
2805                                                    ni_minus_gap,
2806                                                    build_int_cst (type, vf)),
2807                                       log_vf),
2808                          build_int_cst (type, 1));
2809       step_vector = build_one_cst (type);
2810     }
2811   else
2812     {
2813       niters_vector = ni_minus_gap;
2814       step_vector = build_int_cst (type, vf);
2815     }
2816
2817   if (!is_gimple_val (niters_vector))
2818     {
2819       var = create_tmp_var (type, "bnd");
2820       gimple_seq stmts = NULL;
2821       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2822       gsi_insert_seq_on_edge_immediate (pe, stmts);
2823       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2824          we set range information to make niters analyzer's life easier.
2825          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2826          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2827       if (stmts != NULL && log_vf)
2828         {
2829           if (niters_no_overflow)
2830             {
2831               value_range vr (type,
2832                               wi::one (TYPE_PRECISION (type)),
2833                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2834                                                          TYPE_SIGN (type)),
2835                                           exact_log2 (const_vf),
2836                                           TYPE_SIGN (type)));
2837               set_range_info (niters_vector, vr);
2838             }
2839           /* For VF == 1 the vector IV might also overflow so we cannot
2840              assert a minimum value of 1.  */
2841           else if (const_vf > 1)
2842             {
2843               value_range vr (type,
2844                               wi::one (TYPE_PRECISION (type)),
2845                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2846                                                          TYPE_SIGN (type))
2847                                           - (const_vf - 1),
2848                                           exact_log2 (const_vf), TYPE_SIGN (type))
2849                               + 1);
2850               set_range_info (niters_vector, vr);
2851             }
2852         }
2853     }
2854   *niters_vector_ptr = niters_vector;
2855   *step_vector_ptr = step_vector;
2856
2857   return;
2858 }
2859
2860 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2861    loop specified by LOOP_VINFO after vectorization, compute the number
2862    of iterations before vectorization (niters_vector * vf) and store it
2863    to NITERS_VECTOR_MULT_VF_PTR.  */
2864
2865 static void
2866 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2867                                      tree niters_vector,
2868                                      tree *niters_vector_mult_vf_ptr)
2869 {
2870   /* We should be using a step_vector of VF if VF is variable.  */
2871   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2872   tree type = TREE_TYPE (niters_vector);
2873   tree log_vf = build_int_cst (type, exact_log2 (vf));
2874   tree tree_vf = build_int_cst (type, vf);
2875   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2876
2877   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2878   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2879                                             niters_vector, log_vf);
2880
2881   /* If we've peeled a vector iteration then subtract one full vector
2882      iteration.  */
2883   if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2884     niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
2885                                          niters_vector_mult_vf, tree_vf);
2886
2887   if (!is_gimple_val (niters_vector_mult_vf))
2888     {
2889       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2890       gimple_seq stmts = NULL;
2891       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2892                                                     &stmts, true, var);
2893       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2894       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2895     }
2896   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2897 }
2898
2899 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2900    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2901    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2902    appear like below:
2903
2904        guard_bb:
2905          if (cond)
2906            goto merge_bb;
2907          else
2908            goto skip_loop;
2909
2910      skip_loop:
2911        header_a:
2912          i_1 = PHI<i_0, i_2>;
2913          ...
2914          i_2 = i_1 + 1;
2915          if (cond_a)
2916            goto latch_a;
2917          else
2918            goto exit_a;
2919        latch_a:
2920          goto header_a;
2921
2922        exit_a:
2923          i_5 = PHI<i_2>;
2924
2925        merge_bb:
2926          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2927
2928      update_loop:
2929        header_b:
2930          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2931          ...
2932          i_4 = i_3 + 1;
2933          if (cond_b)
2934            goto latch_b;
2935          else
2936            goto exit_bb;
2937        latch_b:
2938          goto header_b;
2939
2940        exit_bb:
2941
2942    This function creates PHI nodes at merge_bb and replaces the use of i_5
2943    in the update_loop's PHI node with the result of new PHI result.  */
2944
2945 static void
2946 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2947                                     class loop *update_loop,
2948                                     edge guard_edge, edge merge_edge)
2949 {
2950   location_t merge_loc, guard_loc;
2951   edge orig_e = loop_preheader_edge (skip_loop);
2952   edge update_e = loop_preheader_edge (update_loop);
2953   gphi_iterator gsi_orig, gsi_update;
2954
2955   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2956         gsi_update = gsi_start_phis (update_loop->header));
2957        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2958        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2959     {
2960       gphi *orig_phi = gsi_orig.phi ();
2961       gphi *update_phi = gsi_update.phi ();
2962
2963       /* Generate new phi node at merge bb of the guard.  */
2964       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2965       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2966
2967       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2968          args in NEW_PHI for these edges.  */
2969       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2970       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2971       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2972       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2973       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2974       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2975
2976       /* Update phi in UPDATE_PHI.  */
2977       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2978     }
2979 }
2980
2981 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2982    Return a value that equals:
2983
2984    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2985    - SKIP_VALUE when the main loop is skipped.  */
2986
2987 tree
2988 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2989                            tree skip_value)
2990 {
2991   gcc_assert (loop_vinfo->main_loop_edge);
2992
2993   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2994   basic_block bb = loop_vinfo->main_loop_edge->dest;
2995   gphi *new_phi = create_phi_node (phi_result, bb);
2996   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2997                UNKNOWN_LOCATION);
2998   add_phi_arg (new_phi, skip_value,
2999                loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3000   return phi_result;
3001 }
3002
3003 /* Function vect_do_peeling.
3004
3005    Input:
3006    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3007
3008        preheader:
3009      LOOP:
3010        header_bb:
3011          loop_body
3012          if (exit_loop_cond) goto exit_bb
3013          else                goto header_bb
3014        exit_bb:
3015
3016    - NITERS: The number of iterations of the loop.
3017    - NITERSM1: The number of iterations of the loop's latch.
3018    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3019    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3020                               CHECK_PROFITABILITY is true.
3021    Output:
3022    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3023      iterate after vectorization; see vect_set_loop_condition for details.
3024    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3025      should be set to the number of scalar iterations handled by the
3026      vector loop.  The SSA name is only used on exit from the loop.
3027
3028    This function peels prolog and epilog from the loop, adds guards skipping
3029    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
3030    would look like:
3031
3032        guard_bb_1:
3033          if (prefer_scalar_loop) goto merge_bb_1
3034          else                    goto guard_bb_2
3035
3036        guard_bb_2:
3037          if (skip_prolog) goto merge_bb_2
3038          else             goto prolog_preheader
3039
3040        prolog_preheader:
3041      PROLOG:
3042        prolog_header_bb:
3043          prolog_body
3044          if (exit_prolog_cond) goto prolog_exit_bb
3045          else                  goto prolog_header_bb
3046        prolog_exit_bb:
3047
3048        merge_bb_2:
3049
3050        vector_preheader:
3051      VECTOR LOOP:
3052        vector_header_bb:
3053          vector_body
3054          if (exit_vector_cond) goto vector_exit_bb
3055          else                  goto vector_header_bb
3056        vector_exit_bb:
3057
3058        guard_bb_3:
3059          if (skip_epilog) goto merge_bb_3
3060          else             goto epilog_preheader
3061
3062        merge_bb_1:
3063
3064        epilog_preheader:
3065      EPILOG:
3066        epilog_header_bb:
3067          epilog_body
3068          if (exit_epilog_cond) goto merge_bb_3
3069          else                  goto epilog_header_bb
3070
3071        merge_bb_3:
3072
3073    Note this function peels prolog and epilog only if it's necessary,
3074    as well as guards.
3075    This function returns the epilogue loop if a decision was made to vectorize
3076    it, otherwise NULL.
3077
3078    The analysis resulting in this epilogue loop's loop_vec_info was performed
3079    in the same vect_analyze_loop call as the main loop's.  At that time
3080    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3081    vectorization factors than the main loop.  This list is stored in the main
3082    loop's loop_vec_info in the 'epilogue_vinfos' member.  Everytime we decide to
3083    vectorize the epilogue loop for a lower vectorization factor,  the
3084    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
3085    updated and linked to the epilogue loop.  This is later used to vectorize
3086    the epilogue.  The reason the loop_vec_info needs updating is that it was
3087    constructed based on the original main loop, and the epilogue loop is a
3088    copy of this loop, so all links pointing to statements in the original loop
3089    need updating.  Furthermore, these loop_vec_infos share the
3090    data_reference's records, which will also need to be updated.
3091
3092    TODO: Guard for prefer_scalar_loop should be emitted along with
3093    versioning conditions if loop versioning is needed.  */
3094
3095
3096 class loop *
3097 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3098                  tree *niters_vector, tree *step_vector,
3099                  tree *niters_vector_mult_vf_var, int th,
3100                  bool check_profitability, bool niters_no_overflow,
3101                  tree *advance)
3102 {
3103   edge e, guard_e;
3104   tree type = TREE_TYPE (niters), guard_cond;
3105   basic_block guard_bb, guard_to;
3106   profile_probability prob_prolog, prob_vector, prob_epilog;
3107   int estimated_vf;
3108   int prolog_peeling = 0;
3109   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
3110   /* We currently do not support prolog peeling if the target alignment is not
3111      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
3112      target alignment being constant.  */
3113   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3114   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
3115     return NULL;
3116
3117   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3118     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3119
3120   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3121   poly_uint64 bound_epilog = 0;
3122   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3123       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3124     bound_epilog += vf - 1;
3125   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3126     bound_epilog += 1;
3127
3128   /* For early breaks the scalar loop needs to execute at most VF times
3129      to find the element that caused the break.  */
3130   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3131     bound_epilog = vf;
3132
3133   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3134   poly_uint64 bound_scalar = bound_epilog;
3135
3136   if (!prolog_peeling && !epilog_peeling)
3137     return NULL;
3138
3139   /* Before doing any peeling make sure to reset debug binds outside of
3140      the loop refering to defs not in LC SSA.  */
3141   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3142   for (unsigned i = 0; i < loop->num_nodes; ++i)
3143     {
3144       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3145       imm_use_iterator ui;
3146       gimple *use_stmt;
3147       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3148            gsi_next (&gsi))
3149         {
3150           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3151             if (gimple_debug_bind_p (use_stmt)
3152                 && loop != gimple_bb (use_stmt)->loop_father
3153                 && !flow_loop_nested_p (loop,
3154                                         gimple_bb (use_stmt)->loop_father))
3155               {
3156                 gimple_debug_bind_reset_value (use_stmt);
3157                 update_stmt (use_stmt);
3158               }
3159         }
3160       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3161            gsi_next (&gsi))
3162         {
3163           ssa_op_iter op_iter;
3164           def_operand_p def_p;
3165           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3166             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3167               if (gimple_debug_bind_p (use_stmt)
3168                   && loop != gimple_bb (use_stmt)->loop_father
3169                   && !flow_loop_nested_p (loop,
3170                                           gimple_bb (use_stmt)->loop_father))
3171                 {
3172                   gimple_debug_bind_reset_value (use_stmt);
3173                   update_stmt (use_stmt);
3174                 }
3175         }
3176     }
3177
3178   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3179   estimated_vf = vect_vf_for_cost (loop_vinfo);
3180   if (estimated_vf == 2)
3181     estimated_vf = 3;
3182   prob_prolog = prob_epilog = profile_probability::guessed_always ()
3183                         .apply_scale (estimated_vf - 1, estimated_vf);
3184
3185   class loop *prolog, *epilog = NULL;
3186   class loop *first_loop = loop;
3187   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3188
3189   /* SSA form needs to be up-to-date since we are going to manually
3190      update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3191      update SSA state after that, so we have to make sure to not lose any
3192      pending update needs.  */
3193   gcc_assert (!need_ssa_update_p (cfun));
3194
3195   /* If we're vectorizing an epilogue loop, we have ensured that the
3196      virtual operand is in SSA form throughout the vectorized main loop.
3197      Normally it is possible to trace the updated
3198      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3199      back to scalar-stmt vuses, meaning that the effect of the SSA update
3200      remains local to the main loop.  However, there are rare cases in
3201      which the vectorized loop should have vdefs even when the original scalar
3202      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
3203      introduces clobbers of the temporary vector array, which in turn
3204      needs new vdefs.  If the scalar loop doesn't write to memory, these
3205      new vdefs will be the only ones in the vector loop.
3206      We are currently defering updating virtual SSA form and creating
3207      of a virtual PHI for this case so we do not have to make sure the
3208      newly introduced virtual def is in LCSSA form.  */
3209
3210   if (MAY_HAVE_DEBUG_BIND_STMTS)
3211     {
3212       gcc_assert (!adjust_vec.exists ());
3213       adjust_vec.create (32);
3214     }
3215   initialize_original_copy_tables ();
3216
3217   /* Record the anchor bb at which the guard should be placed if the scalar
3218      loop might be preferred.  */
3219   basic_block anchor = loop_preheader_edge (loop)->src;
3220
3221   /* Generate the number of iterations for the prolog loop.  We do this here
3222      so that we can also get the upper bound on the number of iterations.  */
3223   tree niters_prolog;
3224   int bound_prolog = 0;
3225   if (prolog_peeling)
3226     {
3227       niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3228                                                     &bound_prolog);
3229       /* If algonment peeling is known, we will always execute prolog.  */
3230       if (TREE_CODE (niters_prolog) == INTEGER_CST)
3231         prob_prolog = profile_probability::always ();
3232     }
3233   else
3234     niters_prolog = build_int_cst (type, 0);
3235
3236   loop_vec_info epilogue_vinfo = NULL;
3237   if (vect_epilogues)
3238     {
3239       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3240       loop_vinfo->epilogue_vinfos.ordered_remove (0);
3241     }
3242
3243   tree niters_vector_mult_vf = NULL_TREE;
3244   /* Saving NITERs before the loop, as this may be changed by prologue.  */
3245   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3246   edge update_e = NULL, skip_e = NULL;
3247   unsigned int lowest_vf = constant_lower_bound (vf);
3248   /* Prolog loop may be skipped.  */
3249   bool skip_prolog = (prolog_peeling != 0);
3250   /* Skip this loop to epilog when there are not enough iterations to enter this
3251      vectorized loop.  If true we should perform runtime checks on the NITERS
3252      to check whether we should skip the current vectorized loop.  If we know
3253      the number of scalar iterations we may choose to add a runtime check if
3254      this number "maybe" smaller than the number of iterations required
3255      when we know the number of scalar iterations may potentially
3256      be smaller than the number of iterations required to enter this loop, for
3257      this we use the upper bounds on the prolog and epilog peeling.  When we
3258      don't know the number of iterations and don't require versioning it is
3259      because we have asserted that there are enough scalar iterations to enter
3260      the main loop, so this skip is not necessary.  When we are versioning then
3261      we only add such a skip if we have chosen to vectorize the epilogue.  */
3262   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3263                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3264                                   bound_prolog + bound_epilog)
3265                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3266                          || vect_epilogues));
3267
3268   /* Epilog loop must be executed if the number of iterations for epilog
3269      loop is known at compile time, otherwise we need to add a check at
3270      the end of vector loop and skip to the end of epilog loop.  */
3271   bool skip_epilog = (prolog_peeling < 0
3272                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3273                       || !vf.is_constant ());
3274   /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3275      loop must be executed.  */
3276   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3277       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3278     skip_epilog = false;
3279
3280   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3281   auto_vec<profile_count> original_counts;
3282   basic_block *original_bbs = NULL;
3283
3284   if (skip_vector)
3285     {
3286       split_edge (loop_preheader_edge (loop));
3287
3288       if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3289         {
3290           original_bbs = get_loop_body (loop);
3291           for (unsigned int i = 0; i < loop->num_nodes; i++)
3292             original_counts.safe_push(original_bbs[i]->count);
3293         }
3294
3295       /* Due to the order in which we peel prolog and epilog, we first
3296          propagate probability to the whole loop.  The purpose is to
3297          avoid adjusting probabilities of both prolog and vector loops
3298          separately.  Note in this case, the probability of epilog loop
3299          needs to be scaled back later.  */
3300       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3301       if (prob_vector.initialized_p ())
3302         {
3303           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3304           scale_loop_profile (loop, prob_vector, -1);
3305         }
3306     }
3307
3308   if (vect_epilogues)
3309     {
3310       /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3311          use the original scalar loop as remaining epilogue if necessary.  */
3312       LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3313         = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3314       LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3315         = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3316     }
3317
3318   if (prolog_peeling)
3319     {
3320       e = loop_preheader_edge (loop);
3321       edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3322       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3323                            && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo));
3324
3325       /* Peel prolog and put it on preheader edge of loop.  */
3326       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3327       edge prolog_e = NULL;
3328       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3329                                                        scalar_loop, scalar_e,
3330                                                        e, &prolog_e);
3331       gcc_assert (prolog);
3332       prolog->force_vectorize = false;
3333
3334       first_loop = prolog;
3335       reset_original_copy_tables ();
3336
3337       /* Update the number of iterations for prolog loop.  */
3338       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3339       vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3340                                step_prolog, NULL_TREE, false);
3341
3342       /* Skip the prolog loop.  */
3343       if (skip_prolog)
3344         {
3345           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3346                                     niters_prolog, build_int_cst (type, 0));
3347           guard_bb = loop_preheader_edge (prolog)->src;
3348           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3349           guard_to = split_edge (loop_preheader_edge (loop));
3350           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3351                                            guard_to, guard_bb,
3352                                            prob_prolog.invert (),
3353                                            irred_flag);
3354           e = EDGE_PRED (guard_to, 0);
3355           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3356           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3357
3358           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3359           scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3360         }
3361
3362       /* Update init address of DRs.  */
3363       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3364       /* Update niters for vector loop.  */
3365       LOOP_VINFO_NITERS (loop_vinfo)
3366         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3367       LOOP_VINFO_NITERSM1 (loop_vinfo)
3368         = fold_build2 (MINUS_EXPR, type,
3369                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3370       bool new_var_p = false;
3371       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3372       /* It's guaranteed that vector loop bound before vectorization is at
3373          least VF, so set range information for newly generated var.  */
3374       if (new_var_p)
3375         {
3376           value_range vr (type,
3377                           wi::to_wide (build_int_cst (type, lowest_vf)),
3378                           wi::to_wide (TYPE_MAX_VALUE (type)));
3379           set_range_info (niters, vr);
3380         }
3381
3382       /* Prolog iterates at most bound_prolog times, latch iterates at
3383          most bound_prolog - 1 times.  */
3384       record_niter_bound (prolog, bound_prolog - 1, false, true);
3385       delete_update_ssa ();
3386       adjust_vec_debug_stmts ();
3387       scev_reset ();
3388     }
3389   basic_block bb_before_epilog = NULL;
3390
3391   if (epilog_peeling)
3392     {
3393       e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3394       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3395
3396       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
3397          said epilog then we should use a copy of the main loop as a starting
3398          point.  This loop may have already had some preliminary transformations
3399          to allow for more optimal vectorization, for example if-conversion.
3400          If we are not vectorizing the epilog then we should use the scalar loop
3401          as the transformations mentioned above make less or no sense when not
3402          vectorizing.  */
3403       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3404       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3405       edge epilog_e = vect_epilogues ? e : scalar_e;
3406       edge new_epilog_e = NULL;
3407       auto_vec<basic_block> doms;
3408       epilog
3409         = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3410                                                   &new_epilog_e, true, &doms);
3411
3412       LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3413       gcc_assert (epilog);
3414       gcc_assert (new_epilog_e);
3415       epilog->force_vectorize = false;
3416       bb_before_epilog = loop_preheader_edge (epilog)->src;
3417
3418       /* Scalar version loop may be preferred.  In this case, add guard
3419          and skip to epilog.  Note this only happens when the number of
3420          iterations of loop is unknown at compile time, otherwise this
3421          won't be vectorized.  */
3422       if (skip_vector)
3423         {
3424           /* Additional epilogue iteration is peeled if gap exists.  */
3425           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3426                                                 bound_prolog, bound_epilog,
3427                                                 th, &bound_scalar,
3428                                                 check_profitability);
3429           /* Build guard against NITERSM1 since NITERS may overflow.  */
3430           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3431           guard_bb = anchor;
3432           guard_to = split_edge (loop_preheader_edge (epilog));
3433           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3434                                            guard_to, guard_bb,
3435                                            prob_vector.invert (),
3436                                            irred_flag);
3437           skip_e = guard_e;
3438           e = EDGE_PRED (guard_to, 0);
3439           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3440           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3441
3442           /* Simply propagate profile info from guard_bb to guard_to which is
3443              a merge point of control flow.  */
3444           profile_count old_count = guard_to->count;
3445           guard_to->count = guard_bb->count;
3446
3447           /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3448           if (vect_epilogues || scalar_loop == NULL)
3449             {
3450               gcc_assert(epilog->num_nodes == loop->num_nodes);
3451               basic_block *bbs = get_loop_body (epilog);
3452               for (unsigned int i = 0; i < epilog->num_nodes; i++)
3453                 {
3454                   gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3455                   bbs[i]->count = original_counts[i];
3456                 }
3457               free (bbs);
3458               free (original_bbs);
3459             }
3460           else if (old_count.nonzero_p ())
3461             scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3462
3463           /* Only need to handle basic block before epilog loop if it's not
3464              the guard_bb, which is the case when skip_vector is true.  */
3465           if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3466             bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3467           bb_before_epilog = loop_preheader_edge (epilog)->src;
3468         }
3469
3470       /* If loop is peeled for non-zero constant times, now niters refers to
3471          orig_niters - prolog_peeling, it won't overflow even the orig_niters
3472          overflows.  */
3473       niters_no_overflow |= (prolog_peeling > 0);
3474       vect_gen_vector_loop_niters (loop_vinfo, niters,
3475                                    niters_vector, step_vector,
3476                                    niters_no_overflow);
3477       if (!integer_onep (*step_vector))
3478         {
3479           /* On exit from the loop we will have an easy way of calcalating
3480              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
3481              until then.  */
3482           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3483           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3484           *niters_vector_mult_vf_var = niters_vector_mult_vf;
3485         }
3486       else
3487         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3488                                              &niters_vector_mult_vf);
3489       /* Update IVs of original loop as if they were advanced by
3490          niters_vector_mult_vf steps.  */
3491       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3492       update_e = skip_vector ? e : loop_preheader_edge (epilog);
3493       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3494         update_e = single_succ_edge (LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
3495
3496       /* If we have a peeled vector iteration, all exits are the same, leave it
3497          and so the main exit needs to be treated the same as the alternative
3498          exits in that we leave their updates to vectorizable_live_operations.
3499          */
3500       if (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3501         vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3502                                           update_e);
3503
3504       /* If we have a peeled vector iteration we will never skip the epilog loop
3505          and we can simplify the cfg a lot by not doing the edge split.  */
3506       if (skip_epilog || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3507         {
3508           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3509                                     niters, niters_vector_mult_vf);
3510
3511           guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3512           edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3513           guard_to = epilog_e->dest;
3514           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3515                                            skip_vector ? anchor : guard_bb,
3516                                            prob_epilog.invert (),
3517                                            irred_flag);
3518           doms.safe_push (guard_to);
3519           if (vect_epilogues)
3520             epilogue_vinfo->skip_this_loop_edge = guard_e;
3521           edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3522           gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3523           for (gphi_iterator gsi = gsi_start_phis (guard_to);
3524                !gsi_end_p (gsi); gsi_next (&gsi))
3525             {
3526               /* We are expecting all of the PHIs we have on epilog_e
3527                  to be also on the main loop exit.  But sometimes
3528                  a stray virtual definition can appear at epilog_e
3529                  which we can then take as the same on all exits,
3530                  we've removed the LC SSA PHI on the main exit before
3531                  so we wouldn't need to create a loop PHI for it.  */
3532               if (virtual_operand_p (gimple_phi_result (*gsi))
3533                   && (gsi_end_p (gsi2)
3534                       || !virtual_operand_p (gimple_phi_result (*gsi2))))
3535                 add_phi_arg (*gsi,
3536                              gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3537                              guard_e, UNKNOWN_LOCATION);
3538               else
3539                 {
3540                   add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3541                                UNKNOWN_LOCATION);
3542                   gsi_next (&gsi2);
3543                 }
3544             }
3545
3546           /* Only need to handle basic block before epilog loop if it's not
3547              the guard_bb, which is the case when skip_vector is true.  */
3548           if (guard_bb != bb_before_epilog)
3549             {
3550               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3551
3552               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3553             }
3554           scale_loop_profile (epilog, prob_epilog, -1);
3555         }
3556
3557       /* Recalculate the dominators after adding the guard edge.  */
3558       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3559         iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3560
3561       /* When we do not have a loop-around edge to the epilog we know
3562          the vector loop covered at least VF scalar iterations unless
3563          we have early breaks.
3564          Update any known upper bound with this knowledge.  */
3565       if (! skip_vector
3566           && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3567         {
3568           if (epilog->any_upper_bound)
3569             epilog->nb_iterations_upper_bound -= lowest_vf;
3570           if (epilog->any_likely_upper_bound)
3571             epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3572           if (epilog->any_estimate)
3573             epilog->nb_iterations_estimate -= lowest_vf;
3574         }
3575
3576       unsigned HOST_WIDE_INT bound;
3577       if (bound_scalar.is_constant (&bound))
3578         {
3579           gcc_assert (bound != 0);
3580           /* Adjust the upper bound by the extra peeled vector iteration if we
3581              are an epilogue of an peeled vect loop and not VLA.  For VLA the
3582              loop bounds are unknown.  */
3583           if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3584               && vf.is_constant ())
3585             bound += vf.to_constant ();
3586           /* -1 to convert loop iterations to latch iterations.  */
3587           record_niter_bound (epilog, bound - 1, false, true);
3588           scale_loop_profile (epilog, profile_probability::always (),
3589                               bound - 1);
3590         }
3591
3592       delete_update_ssa ();
3593       adjust_vec_debug_stmts ();
3594       scev_reset ();
3595     }
3596
3597   if (vect_epilogues)
3598     {
3599       epilog->aux = epilogue_vinfo;
3600       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3601       LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3602         = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3603
3604       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3605
3606       /* We now must calculate the number of NITERS performed by the previous
3607          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3608       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3609                                  niters_prolog, niters_vector_mult_vf);
3610
3611       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3612          determine whether we are coming from the previous vectorized loop
3613          using the update_e edge or the skip_vector basic block using the
3614          skip_e edge.  */
3615       if (skip_vector)
3616         {
3617           gcc_assert (update_e != NULL && skip_e != NULL);
3618           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3619                                            update_e->dest);
3620           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3621           gimple *stmt = gimple_build_assign (new_ssa, niters);
3622           gimple_stmt_iterator gsi;
3623           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3624               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3625             {
3626               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3627               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3628             }
3629           else
3630             {
3631               gsi = gsi_last_bb (update_e->src);
3632               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3633             }
3634
3635           niters = new_ssa;
3636           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3637           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3638                        UNKNOWN_LOCATION);
3639           niters = PHI_RESULT (new_phi);
3640           epilogue_vinfo->main_loop_edge = update_e;
3641           epilogue_vinfo->skip_main_loop_edge = skip_e;
3642         }
3643
3644       /* Set ADVANCE to the number of iterations performed by the previous
3645          loop and its prologue.  */
3646       *advance = niters;
3647
3648       /* Subtract the number of iterations performed by the vectorized loop
3649          from the number of total iterations.  */
3650       tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3651                                           before_loop_niters,
3652                                           niters);
3653
3654       LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3655       LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3656         = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3657                        epilogue_niters,
3658                        build_one_cst (TREE_TYPE (epilogue_niters)));
3659
3660       /* Decide what to do if the number of epilogue iterations is not
3661          a multiple of the epilogue loop's vectorization factor.
3662          We should have rejected the loop during the analysis phase
3663          if this fails.  */
3664       bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3665       gcc_assert (res);
3666     }
3667
3668   adjust_vec.release ();
3669   free_original_copy_tables ();
3670
3671   return vect_epilogues ? epilog : NULL;
3672 }
3673
3674 /* Function vect_create_cond_for_niters_checks.
3675
3676    Create a conditional expression that represents the run-time checks for
3677    loop's niter.  The loop is guaranteed to terminate if the run-time
3678    checks hold.
3679
3680    Input:
3681    COND_EXPR  - input conditional expression.  New conditions will be chained
3682                 with logical AND operation.  If it is NULL, then the function
3683                 is used to return the number of alias checks.
3684    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3685                 to be checked.
3686
3687    Output:
3688    COND_EXPR - conditional expression.
3689
3690    The returned COND_EXPR is the conditional expression to be used in the
3691    if statement that controls which version of the loop gets executed at
3692    runtime.  */
3693
3694 static void
3695 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3696 {
3697   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3698
3699   if (*cond_expr)
3700     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3701                               *cond_expr, part_cond_expr);
3702   else
3703     *cond_expr = part_cond_expr;
3704 }
3705
3706 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3707    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3708
3709 static void
3710 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3711 {
3712   if (*cond_expr)
3713     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3714                               *cond_expr, part_cond_expr);
3715   else
3716     *cond_expr = part_cond_expr;
3717 }
3718
3719 /* Function vect_create_cond_for_align_checks.
3720
3721    Create a conditional expression that represents the alignment checks for
3722    all of data references (array element references) whose alignment must be
3723    checked at runtime.
3724
3725    Input:
3726    COND_EXPR  - input conditional expression.  New conditions will be chained
3727                 with logical AND operation.
3728    LOOP_VINFO - two fields of the loop information are used.
3729                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3730                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3731
3732    Output:
3733    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3734                          expression.
3735    The returned value is the conditional expression to be used in the if
3736    statement that controls which version of the loop gets executed at runtime.
3737
3738    The algorithm makes two assumptions:
3739      1) The number of bytes "n" in a vector is a power of 2.
3740      2) An address "a" is aligned if a%n is zero and that this
3741         test can be done as a&(n-1) == 0.  For example, for 16
3742         byte vectors the test is a&0xf == 0.  */
3743
3744 static void
3745 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3746                                    tree *cond_expr,
3747                                    gimple_seq *cond_expr_stmt_list)
3748 {
3749   const vec<stmt_vec_info> &may_misalign_stmts
3750     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3751   stmt_vec_info stmt_info;
3752   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3753   tree mask_cst;
3754   unsigned int i;
3755   tree int_ptrsize_type;
3756   char tmp_name[20];
3757   tree or_tmp_name = NULL_TREE;
3758   tree and_tmp_name;
3759   gimple *and_stmt;
3760   tree ptrsize_zero;
3761   tree part_cond_expr;
3762
3763   /* Check that mask is one less than a power of 2, i.e., mask is
3764      all zeros followed by all ones.  */
3765   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3766
3767   int_ptrsize_type = signed_type_for (ptr_type_node);
3768
3769   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3770      of the first vector of the i'th data reference. */
3771
3772   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3773     {
3774       gimple_seq new_stmt_list = NULL;
3775       tree addr_base;
3776       tree addr_tmp_name;
3777       tree new_or_tmp_name;
3778       gimple *addr_stmt, *or_stmt;
3779       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3780       bool negative = tree_int_cst_compare
3781         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3782       tree offset = negative
3783         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3784                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3785         : size_zero_node;
3786
3787       /* create: addr_tmp = (int)(address_of_first_vector) */
3788       addr_base =
3789         vect_create_addr_base_for_vector_ref (loop_vinfo,
3790                                               stmt_info, &new_stmt_list,
3791                                               offset);
3792       if (new_stmt_list != NULL)
3793         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3794
3795       sprintf (tmp_name, "addr2int%d", i);
3796       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3797       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3798       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3799
3800       /* The addresses are OR together.  */
3801
3802       if (or_tmp_name != NULL_TREE)
3803         {
3804           /* create: or_tmp = or_tmp | addr_tmp */
3805           sprintf (tmp_name, "orptrs%d", i);
3806           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3807           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3808                                          or_tmp_name, addr_tmp_name);
3809           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3810           or_tmp_name = new_or_tmp_name;
3811         }
3812       else
3813         or_tmp_name = addr_tmp_name;
3814
3815     } /* end for i */
3816
3817   mask_cst = build_int_cst (int_ptrsize_type, mask);
3818
3819   /* create: and_tmp = or_tmp & mask  */
3820   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3821
3822   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3823                                   or_tmp_name, mask_cst);
3824   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3825
3826   /* Make and_tmp the left operand of the conditional test against zero.
3827      if and_tmp has a nonzero bit then some address is unaligned.  */
3828   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3829   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3830                                 and_tmp_name, ptrsize_zero);
3831   chain_cond_expr (cond_expr, part_cond_expr);
3832 }
3833
3834 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3835    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3836    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3837    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3838
3839 static void
3840 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3841 {
3842   const vec<vec_object_pair> &pairs
3843     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3844   unsigned int i;
3845   vec_object_pair *pair;
3846   FOR_EACH_VEC_ELT (pairs, i, pair)
3847     {
3848       tree addr1 = build_fold_addr_expr (pair->first);
3849       tree addr2 = build_fold_addr_expr (pair->second);
3850       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3851                                          addr1, addr2);
3852       chain_cond_expr (cond_expr, part_cond_expr);
3853     }
3854 }
3855
3856 /* Create an expression that is true when all lower-bound conditions for
3857    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3858
3859 static void
3860 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3861 {
3862   const vec<vec_lower_bound> &lower_bounds
3863     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3864   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3865     {
3866       tree expr = lower_bounds[i].expr;
3867       tree type = unsigned_type_for (TREE_TYPE (expr));
3868       expr = fold_convert (type, expr);
3869       poly_uint64 bound = lower_bounds[i].min_value;
3870       if (!lower_bounds[i].unsigned_p)
3871         {
3872           expr = fold_build2 (PLUS_EXPR, type, expr,
3873                               build_int_cstu (type, bound - 1));
3874           bound += bound - 1;
3875         }
3876       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3877                                          build_int_cstu (type, bound));
3878       chain_cond_expr (cond_expr, part_cond_expr);
3879     }
3880 }
3881
3882 /* Function vect_create_cond_for_alias_checks.
3883
3884    Create a conditional expression that represents the run-time checks for
3885    overlapping of address ranges represented by a list of data references
3886    relations passed as input.
3887
3888    Input:
3889    COND_EXPR  - input conditional expression.  New conditions will be chained
3890                 with logical AND operation.  If it is NULL, then the function
3891                 is used to return the number of alias checks.
3892    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3893                 to be checked.
3894
3895    Output:
3896    COND_EXPR - conditional expression.
3897
3898    The returned COND_EXPR is the conditional expression to be used in the if
3899    statement that controls which version of the loop gets executed at runtime.
3900 */
3901
3902 void
3903 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3904 {
3905   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3906     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3907
3908   if (comp_alias_ddrs.is_empty ())
3909     return;
3910
3911   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3912                                &comp_alias_ddrs, cond_expr);
3913   if (dump_enabled_p ())
3914     dump_printf_loc (MSG_NOTE, vect_location,
3915                      "created %u versioning for alias checks.\n",
3916                      comp_alias_ddrs.length ());
3917 }
3918
3919
3920 /* Function vect_loop_versioning.
3921
3922    If the loop has data references that may or may not be aligned or/and
3923    has data reference relations whose independence was not proven then
3924    two versions of the loop need to be generated, one which is vectorized
3925    and one which isn't.  A test is then generated to control which of the
3926    loops is executed.  The test checks for the alignment of all of the
3927    data references that may or may not be aligned.  An additional
3928    sequence of runtime tests is generated for each pairs of DDRs whose
3929    independence was not proven.  The vectorized version of loop is
3930    executed only if both alias and alignment tests are passed.
3931
3932    The test generated to check which version of loop is executed
3933    is modified to also check for profitability as indicated by the
3934    cost model threshold TH.
3935
3936    The versioning precondition(s) are placed in *COND_EXPR and
3937    *COND_EXPR_STMT_LIST.  */
3938
3939 class loop *
3940 vect_loop_versioning (loop_vec_info loop_vinfo,
3941                       gimple *loop_vectorized_call)
3942 {
3943   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3944   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3945   basic_block condition_bb;
3946   gphi_iterator gsi;
3947   gimple_stmt_iterator cond_exp_gsi;
3948   basic_block merge_bb;
3949   basic_block new_exit_bb;
3950   edge new_exit_e, e;
3951   gphi *orig_phi, *new_phi;
3952   tree cond_expr = NULL_TREE;
3953   gimple_seq cond_expr_stmt_list = NULL;
3954   tree arg;
3955   profile_probability prob = profile_probability::likely ();
3956   gimple_seq gimplify_stmt_list = NULL;
3957   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3958   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3959   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3960   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3961   poly_uint64 versioning_threshold
3962     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3963   tree version_simd_if_cond
3964     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3965   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3966
3967   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3968       && !ordered_p (th, versioning_threshold))
3969     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3970                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3971                                             th - 1));
3972   if (maybe_ne (versioning_threshold, 0U))
3973     {
3974       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3975                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3976                                               versioning_threshold - 1));
3977       if (cond_expr)
3978         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3979                                  expr, cond_expr);
3980       else
3981         cond_expr = expr;
3982     }
3983
3984   tree cost_name = NULL_TREE;
3985   profile_probability prob2 = profile_probability::always ();
3986   if (cond_expr
3987       && EXPR_P (cond_expr)
3988       && (version_niter
3989           || version_align
3990           || version_alias
3991           || version_simd_if_cond))
3992     {
3993       cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3994                                                       &cond_expr_stmt_list,
3995                                                       is_gimple_val, NULL_TREE);
3996       /* Split prob () into two so that the overall probability of passing
3997          both the cost-model and versioning checks is the orig prob.  */
3998       prob2 = prob = prob.sqrt ();
3999     }
4000
4001   if (version_niter)
4002     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
4003
4004   if (cond_expr)
4005     {
4006       gimple_seq tem = NULL;
4007       cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4008                                           &tem, is_gimple_condexpr_for_cond,
4009                                           NULL_TREE);
4010       gimple_seq_add_seq (&cond_expr_stmt_list, tem);
4011     }
4012
4013   if (version_align)
4014     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
4015                                        &cond_expr_stmt_list);
4016
4017   if (version_alias)
4018     {
4019       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
4020       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
4021       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
4022     }
4023
4024   if (version_simd_if_cond)
4025     {
4026       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
4027       if (flag_checking)
4028         if (basic_block bb
4029             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
4030           gcc_assert (bb != loop->header
4031                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
4032                       && (scalar_loop == NULL
4033                           || (bb != scalar_loop->header
4034                               && dominated_by_p (CDI_DOMINATORS,
4035                                                  scalar_loop->header, bb))));
4036       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
4037       tree c = fold_build2 (NE_EXPR, boolean_type_node,
4038                             version_simd_if_cond, zero);
4039       if (cond_expr)
4040         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4041                                  c, cond_expr);
4042       else
4043         cond_expr = c;
4044       if (dump_enabled_p ())
4045         dump_printf_loc (MSG_NOTE, vect_location,
4046                          "created versioning for simd if condition check.\n");
4047     }
4048
4049   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4050                                       &gimplify_stmt_list,
4051                                       is_gimple_condexpr_for_cond, NULL_TREE);
4052   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
4053
4054   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
4055      invariant in.  */
4056   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
4057   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4058        !gsi_end_p (gsi); gsi_next (&gsi))
4059     {
4060       gimple *stmt = gsi_stmt (gsi);
4061       update_stmt (stmt);
4062       ssa_op_iter iter;
4063       use_operand_p use_p;
4064       basic_block def_bb;
4065       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4066         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4067             && flow_bb_inside_loop_p (outermost, def_bb))
4068           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4069     }
4070
4071   /* Search for the outermost loop we can version.  Avoid versioning of
4072      non-perfect nests but allow if-conversion versioned loops inside.  */
4073   class loop *loop_to_version = loop;
4074   if (flow_loop_nested_p (outermost, loop))
4075     {
4076       if (dump_enabled_p ())
4077         dump_printf_loc (MSG_NOTE, vect_location,
4078                          "trying to apply versioning to outer loop %d\n",
4079                          outermost->num);
4080       if (outermost->num == 0)
4081         outermost = superloop_at_depth (loop, 1);
4082       /* And avoid applying versioning on non-perfect nests.  */
4083       while (loop_to_version != outermost
4084              && (e = single_exit (loop_outer (loop_to_version)))
4085              && !(e->flags & EDGE_COMPLEX)
4086              && (!loop_outer (loop_to_version)->inner->next
4087                  || vect_loop_vectorized_call (loop_to_version))
4088              && (!loop_outer (loop_to_version)->inner->next
4089                  || !loop_outer (loop_to_version)->inner->next->next))
4090         loop_to_version = loop_outer (loop_to_version);
4091     }
4092
4093   /* Apply versioning.  If there is already a scalar version created by
4094      if-conversion re-use that.  Note we cannot re-use the copy of
4095      an if-converted outer-loop when vectorizing the inner loop only.  */
4096   gcond *cond;
4097   if ((!loop_to_version->inner || loop == loop_to_version)
4098       && loop_vectorized_call)
4099     {
4100       gcc_assert (scalar_loop);
4101       condition_bb = gimple_bb (loop_vectorized_call);
4102       cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4103       gimple_cond_set_condition_from_tree (cond, cond_expr);
4104       update_stmt (cond);
4105
4106       if (cond_expr_stmt_list)
4107         {
4108           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4109           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4110                                  GSI_SAME_STMT);
4111         }
4112
4113       /* if-conversion uses profile_probability::always () for both paths,
4114          reset the paths probabilities appropriately.  */
4115       edge te, fe;
4116       extract_true_false_edges_from_block (condition_bb, &te, &fe);
4117       te->probability = prob;
4118       fe->probability = prob.invert ();
4119       /* We can scale loops counts immediately but have to postpone
4120          scaling the scalar loop because we re-use it during peeling.
4121
4122          Ifcvt duplicates loop preheader, loop body and produces an basic
4123          block after loop exit.  We need to scale all that.  */
4124       basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4125       preheader->count = preheader->count.apply_probability (prob * prob2);
4126       scale_loop_frequencies (loop_to_version, prob * prob2);
4127       /* When the loop has multiple exits then we can only version itself.
4128         This is denoted by loop_to_version == loop.  In this case we can
4129         do the versioning by selecting the exit edge the vectorizer is
4130         currently using.  */
4131       edge exit_edge;
4132       if (loop_to_version == loop)
4133        exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4134       else
4135        exit_edge = single_exit (loop_to_version);
4136       exit_edge->dest->count = preheader->count;
4137       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4138
4139       nloop = scalar_loop;
4140       if (dump_enabled_p ())
4141         dump_printf_loc (MSG_NOTE, vect_location,
4142                          "reusing %sloop version created by if conversion\n",
4143                          loop_to_version != loop ? "outer " : "");
4144     }
4145   else
4146     {
4147       if (loop_to_version != loop
4148           && dump_enabled_p ())
4149         dump_printf_loc (MSG_NOTE, vect_location,
4150                          "applying loop versioning to outer loop %d\n",
4151                          loop_to_version->num);
4152
4153       unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4154
4155       initialize_original_copy_tables ();
4156       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4157                             prob * prob2, (prob * prob2).invert (),
4158                             prob * prob2, (prob * prob2).invert (),
4159                             true);
4160       /* We will later insert second conditional so overall outcome of
4161          both is prob * prob2.  */
4162       edge true_e, false_e;
4163       extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4164       true_e->probability = prob;
4165       false_e->probability = prob.invert ();
4166       gcc_assert (nloop);
4167       nloop = get_loop_copy (loop);
4168
4169       /* For cycle vectorization with SLP we rely on the PHI arguments
4170          appearing in the same order as the SLP node operands which for the
4171          loop PHI nodes means the preheader edge dest index needs to remain
4172          the same for the analyzed loop which also becomes the vectorized one.
4173          Make it so in case the state after versioning differs by redirecting
4174          the first edge into the header to the same destination which moves
4175          it last.  */
4176       if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4177         {
4178           edge e = EDGE_PRED (loop->header, 0);
4179           ssa_redirect_edge (e, e->dest);
4180           flush_pending_stmts (e);
4181         }
4182       gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4183
4184       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4185          reap those otherwise;  they also refer to the original
4186          loops.  */
4187       class loop *l = loop;
4188       while (gimple *call = vect_loop_vectorized_call (l))
4189         {
4190           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4191           fold_loop_internal_call (call, boolean_false_node);
4192           l = loop_outer (l);
4193         }
4194       free_original_copy_tables ();
4195
4196       if (cond_expr_stmt_list)
4197         {
4198           cond_exp_gsi = gsi_last_bb (condition_bb);
4199           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4200                                  GSI_SAME_STMT);
4201         }
4202
4203       /* Loop versioning violates an assumption we try to maintain during
4204          vectorization - that the loop exit block has a single predecessor.
4205          After versioning, the exit block of both loop versions is the same
4206          basic block (i.e. it has two predecessors). Just in order to simplify
4207          following transformations in the vectorizer, we fix this situation
4208          here by adding a new (empty) block on the exit-edge of the loop,
4209          with the proper loop-exit phis to maintain loop-closed-form.
4210          If loop versioning wasn't done from loop, but scalar_loop instead,
4211          merge_bb will have already just a single successor.  */
4212
4213       /* When the loop has multiple exits then we can only version itself.
4214          This is denoted by loop_to_version == loop.  In this case we can
4215          do the versioning by selecting the exit edge the vectorizer is
4216          currently using.  */
4217       edge exit_edge;
4218       if (loop_to_version == loop)
4219         exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4220       else
4221         exit_edge = single_exit (loop_to_version);
4222
4223       gcc_assert (exit_edge);
4224       merge_bb = exit_edge->dest;
4225       if (EDGE_COUNT (merge_bb->preds) >= 2)
4226         {
4227           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4228           new_exit_bb = split_edge (exit_edge);
4229           new_exit_e = exit_edge;
4230           e = EDGE_SUCC (new_exit_bb, 0);
4231
4232           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4233                gsi_next (&gsi))
4234             {
4235               tree new_res;
4236               orig_phi = gsi.phi ();
4237               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4238               new_phi = create_phi_node (new_res, new_exit_bb);
4239               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4240               add_phi_arg (new_phi, arg, new_exit_e,
4241                            gimple_phi_arg_location_from_edge (orig_phi, e));
4242               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4243             }
4244         }
4245
4246       update_ssa (TODO_update_ssa_no_phi);
4247     }
4248
4249   /* Split the cost model check off to a separate BB.  Costing assumes
4250      this is the only thing we perform when we enter the scalar loop
4251      from a failed cost decision.  */
4252   if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4253     {
4254       gimple *def = SSA_NAME_DEF_STMT (cost_name);
4255       gcc_assert (gimple_bb (def) == condition_bb);
4256       /* All uses of the cost check are 'true' after the check we
4257          are going to insert.  */
4258       replace_uses_by (cost_name, boolean_true_node);
4259       /* And we're going to build the new single use of it.  */
4260       gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4261                                        NULL_TREE, NULL_TREE);
4262       edge e = split_block (gimple_bb (def), def);
4263       gimple_stmt_iterator gsi = gsi_for_stmt (def);
4264       gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4265       edge true_e, false_e;
4266       extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4267       e->flags &= ~EDGE_FALLTHRU;
4268       e->flags |= EDGE_TRUE_VALUE;
4269       edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4270       e->probability = prob2;
4271       e2->probability = prob2.invert ();
4272       e->dest->count = e->count ();
4273       set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4274       auto_vec<basic_block, 3> adj;
4275       for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4276            son;
4277            son = next_dom_son (CDI_DOMINATORS, son))
4278         if (EDGE_COUNT (son->preds) > 1)
4279           adj.safe_push (son);
4280       for (auto son : adj)
4281         set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4282       //debug_bb (condition_bb);
4283       //debug_bb (e->src);
4284     }
4285
4286   if (version_niter)
4287     {
4288       /* The versioned loop could be infinite, we need to clear existing
4289          niter information which is copied from the original loop.  */
4290       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4291       vect_free_loop_info_assumptions (nloop);
4292     }
4293
4294   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4295       && dump_enabled_p ())
4296     {
4297       if (version_alias)
4298         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4299                          vect_location,
4300                          "loop versioned for vectorization because of "
4301                          "possible aliasing\n");
4302       if (version_align)
4303         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4304                          vect_location,
4305                          "loop versioned for vectorization to enhance "
4306                          "alignment\n");
4307
4308     }
4309
4310   return nloop;
4311 }