gcc/tree-vect-loop-manip.cc

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53 #include "langhooks.h"
  54 #include "tree-vector-builder.h"
  55 #include "optabs-tree.h"
  56
  57 /*************************************************************************
  58   Simple Loop Peeling Utilities
  59
  60   Utilities to support loop peeling for vectorization purposes.
  61  *************************************************************************/
  62
  63
  64 /* Renames the use *OP_P.  */
  65
  66 static void
  67 rename_use_op (use_operand_p op_p)
  68 {
  69   tree new_name;
  70
  71   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  72     return;
  73
  74   new_name = get_current_def (USE_FROM_PTR (op_p));
  75
  76   /* Something defined outside of the loop.  */
  77   if (!new_name)
  78     return;
  79
  80   /* An ordinary ssa name defined in the loop.  */
  81
  82   SET_USE (op_p, new_name);
  83 }
  84
  85
  86 /* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
  87    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
  88    true.  */
  89
  90 static void
  91 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  92 {
  93   gimple *stmt;
  94   use_operand_p use_p;
  95   ssa_op_iter iter;
  96   edge e;
  97   edge_iterator ei;
  98   class loop *loop = bb->loop_father;
  99   class loop *outer_loop = NULL;
 100
 101   if (rename_from_outer_loop)
 102     {
 103       gcc_assert (loop);
 104       outer_loop = loop_outer (loop);
 105     }
 106
 107   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 108        gsi_next (&gsi))
 109     {
 110       stmt = gsi_stmt (gsi);
 111       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 112         rename_use_op (use_p);
 113     }
 114
 115   FOR_EACH_EDGE (e, ei, bb->preds)
 116     {
 117       if (!flow_bb_inside_loop_p (loop, e->src))
 118         {
 119           if (!rename_from_outer_loop)
 120             continue;
 121           if (e->src != outer_loop->header)
 122             {
 123               if (outer_loop->inner->next)
 124                 {
 125                   /* If outer_loop has 2 inner loops, allow there to
 126                      be an extra basic block which decides which of the
 127                      two loops to use using LOOP_VECTORIZED.  */
 128                   if (!single_pred_p (e->src)
 129                       || single_pred (e->src) != outer_loop->header)
 130                     continue;
 131                 }
 132             }
 133         }
 134       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 135            gsi_next (&gsi))
 136         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 137     }
 138 }
 139
 140
 141 struct adjust_info
 142 {
 143   tree from, to;
 144   basic_block bb;
 145 };
 146
 147 /* A stack of values to be adjusted in debug stmts.  We have to
 148    process them LIFO, so that the closest substitution applies.  If we
 149    processed them FIFO, without the stack, we might substitute uses
 150    with a PHI DEF that would soon become non-dominant, and when we got
 151    to the suitable one, it wouldn't have anything to substitute any
 152    more.  */
 153 static vec<adjust_info, va_heap> adjust_vec;
 154
 155 /* Adjust any debug stmts that referenced AI->from values to use the
 156    loop-closed AI->to, if the references are dominated by AI->bb and
 157    not by the definition of AI->from.  */
 158
 159 static void
 160 adjust_debug_stmts_now (adjust_info *ai)
 161 {
 162   basic_block bbphi = ai->bb;
 163   tree orig_def = ai->from;
 164   tree new_def = ai->to;
 165   imm_use_iterator imm_iter;
 166   gimple *stmt;
 167   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 168
 169   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 170
 171   /* Adjust any debug stmts that held onto non-loop-closed
 172      references.  */
 173   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 174     {
 175       use_operand_p use_p;
 176       basic_block bbuse;
 177
 178       if (!is_gimple_debug (stmt))
 179         continue;
 180
 181       gcc_assert (gimple_debug_bind_p (stmt));
 182
 183       bbuse = gimple_bb (stmt);
 184
 185       if ((bbuse == bbphi
 186            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 187           && !(bbuse == bbdef
 188                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 189         {
 190           if (new_def)
 191             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 192               SET_USE (use_p, new_def);
 193           else
 194             {
 195               gimple_debug_bind_reset_value (stmt);
 196               update_stmt (stmt);
 197             }
 198         }
 199     }
 200 }
 201
 202 /* Adjust debug stmts as scheduled before.  */
 203
 204 static void
 205 adjust_vec_debug_stmts (void)
 206 {
 207   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 208     return;
 209
 210   gcc_assert (adjust_vec.exists ());
 211
 212   while (!adjust_vec.is_empty ())
 213     {
 214       adjust_debug_stmts_now (&adjust_vec.last ());
 215       adjust_vec.pop ();
 216     }
 217 }
 218
 219 /* Adjust any debug stmts that referenced FROM values to use the
 220    loop-closed TO, if the references are dominated by BB and not by
 221    the definition of FROM.  If adjust_vec is non-NULL, adjustments
 222    will be postponed until adjust_vec_debug_stmts is called.  */
 223
 224 static void
 225 adjust_debug_stmts (tree from, tree to, basic_block bb)
 226 {
 227   adjust_info ai;
 228
 229   if (MAY_HAVE_DEBUG_BIND_STMTS
 230       && TREE_CODE (from) == SSA_NAME
 231       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 232       && ! virtual_operand_p (from))
 233     {
 234       ai.from = from;
 235       ai.to = to;
 236       ai.bb = bb;
 237
 238       if (adjust_vec.exists ())
 239         adjust_vec.safe_push (ai);
 240       else
 241         adjust_debug_stmts_now (&ai);
 242     }
 243 }
 244
 245 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 246    to adjust any debug stmts that referenced the old phi arg,
 247    presumably non-loop-closed references left over from other
 248    transformations.  */
 249
 250 static void
 251 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 252 {
 253   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 254
 255   gcc_assert (TREE_CODE (orig_def) != SSA_NAME
 256               || orig_def != new_def);
 257
 258   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 259
 260   if (MAY_HAVE_DEBUG_BIND_STMTS)
 261     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 262                         gimple_bb (update_phi));
 263 }
 264
 265 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 266    that the control should have during the first iteration and NEXT_CTRL is the
 267    value that it should have on subsequent iterations.  */
 268
 269 static void
 270 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 271                        tree next_ctrl)
 272 {
 273   gphi *phi = create_phi_node (ctrl, loop->header);
 274   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 275   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 276 }
 277
 278 /* Add SEQ to the end of LOOP's preheader block.  */
 279
 280 static void
 281 add_preheader_seq (class loop *loop, gimple_seq seq)
 282 {
 283   if (seq)
 284     {
 285       edge pe = loop_preheader_edge (loop);
 286       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 287       gcc_assert (!new_bb);
 288     }
 289 }
 290
 291 /* Add SEQ to the beginning of LOOP's header block.  */
 292
 293 static void
 294 add_header_seq (class loop *loop, gimple_seq seq)
 295 {
 296   if (seq)
 297     {
 298       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 299       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 300     }
 301 }
 302
 303 /* Return true if the target can interleave elements of two vectors.
 304    OFFSET is 0 if the first half of the vectors should be interleaved
 305    or 1 if the second half should.  When returning true, store the
 306    associated permutation in INDICES.  */
 307
 308 static bool
 309 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 310                         unsigned int offset)
 311 {
 312   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 313   poly_uint64 base = exact_div (nelts, 2) * offset;
 314   vec_perm_builder sel (nelts, 2, 3);
 315   for (unsigned int i = 0; i < 3; ++i)
 316     {
 317       sel.quick_push (base + i);
 318       sel.quick_push (base + i + nelts);
 319     }
 320   indices->new_vector (sel, 2, nelts);
 321   return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
 322                                *indices);
 323 }
 324
 325 /* Try to use permutes to define the masks in DEST_RGM using the masks
 326    in SRC_RGM, given that the former has twice as many masks as the
 327    latter.  Return true on success, adding any new statements to SEQ.  */
 328
 329 static bool
 330 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 331                                rgroup_controls *src_rgm)
 332 {
 333   tree src_masktype = src_rgm->type;
 334   tree dest_masktype = dest_rgm->type;
 335   machine_mode src_mode = TYPE_MODE (src_masktype);
 336   insn_code icode1, icode2;
 337   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 338       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 339                                   src_mode)) != CODE_FOR_nothing
 340       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 341                                   src_mode)) != CODE_FOR_nothing)
 342     {
 343       /* Unpacking the source masks gives at least as many mask bits as
 344          we need.  We can then VIEW_CONVERT any excess bits away.  */
 345       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 346       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 347       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 348       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 349         {
 350           tree src = src_rgm->controls[i / 2];
 351           tree dest = dest_rgm->controls[i];
 352           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 353                             ? VEC_UNPACK_HI_EXPR
 354                             : VEC_UNPACK_LO_EXPR);
 355           gassign *stmt;
 356           if (dest_masktype == unpack_masktype)
 357             stmt = gimple_build_assign (dest, code, src);
 358           else
 359             {
 360               tree temp = make_ssa_name (unpack_masktype);
 361               stmt = gimple_build_assign (temp, code, src);
 362               gimple_seq_add_stmt (seq, stmt);
 363               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 364                                           build1 (VIEW_CONVERT_EXPR,
 365                                                   dest_masktype, temp));
 366             }
 367           gimple_seq_add_stmt (seq, stmt);
 368         }
 369       return true;
 370     }
 371   vec_perm_indices indices[2];
 372   if (dest_masktype == src_masktype
 373       && interleave_supported_p (&indices[0], src_masktype, 0)
 374       && interleave_supported_p (&indices[1], src_masktype, 1))
 375     {
 376       /* The destination requires twice as many mask bits as the source, so
 377          we can use interleaving permutes to double up the number of bits.  */
 378       tree masks[2];
 379       for (unsigned int i = 0; i < 2; ++i)
 380         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 381       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 382         {
 383           tree src = src_rgm->controls[i / 2];
 384           tree dest = dest_rgm->controls[i];
 385           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 386                                               src, src, masks[i & 1]);
 387           gimple_seq_add_stmt (seq, stmt);
 388         }
 389       return true;
 390     }
 391   return false;
 392 }
 393
 394 /* Populate DEST_RGM->controls, given that they should add up to STEP.
 395
 396      STEP = MIN_EXPR <ivtmp_34, VF>;
 397
 398      First length (MIN (X, VF/N)):
 399        loop_len_15 = MIN_EXPR <STEP, VF/N>;
 400
 401      Second length:
 402        tmp = STEP - loop_len_15;
 403        loop_len_16 = MIN (tmp, VF/N);
 404
 405      Third length:
 406        tmp2 = tmp - loop_len_16;
 407        loop_len_17 = MIN (tmp2, VF/N);
 408
 409      Last length:
 410        loop_len_18 = tmp2 - loop_len_17;
 411 */
 412
 413 static void
 414 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
 415                                rgroup_controls *dest_rgm, tree step)
 416 {
 417   tree ctrl_type = dest_rgm->type;
 418   poly_uint64 nitems_per_ctrl
 419     = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
 420   tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
 421
 422   for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 423     {
 424       tree ctrl = dest_rgm->controls[i];
 425       if (i == 0)
 426         {
 427           /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
 428           gassign *assign
 429             = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
 430           gimple_seq_add_stmt (seq, assign);
 431         }
 432       else if (i == dest_rgm->controls.length () - 1)
 433         {
 434           /* Last iteration: Remain capped to the range [0, VF/N].  */
 435           gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
 436                                                  dest_rgm->controls[i - 1]);
 437           gimple_seq_add_stmt (seq, assign);
 438         }
 439       else
 440         {
 441           /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
 442           step = gimple_build (seq, MINUS_EXPR, iv_type, step,
 443                                dest_rgm->controls[i - 1]);
 444           gassign *assign
 445             = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
 446           gimple_seq_add_stmt (seq, assign);
 447         }
 448     }
 449 }
 450
 451 /* Stores the standard position for induction variable increment in belonging to
 452    LOOP_EXIT (just before the exit condition of the given exit to BSI.
 453    INSERT_AFTER is set to true if the increment should be inserted after
 454    *BSI.  */
 455
 456 void
 457 vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
 458                             bool *insert_after)
 459 {
 460   basic_block bb = loop_exit->src;
 461   *bsi = gsi_last_bb (bb);
 462   *insert_after = false;
 463 }
 464
 465 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 466    for all the rgroup controls in RGC and return a control that is nonzero
 467    when the loop needs to iterate.  Add any new preheader statements to
 468    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
 469
 470    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 471    times and has been vectorized according to LOOP_VINFO.
 472
 473    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 474    starts with NITERS_SKIP dummy iterations of the scalar loop before
 475    the real work starts.  The mask elements for these dummy iterations
 476    must be 0, to ensure that the extra iterations do not have an effect.
 477
 478    It is known that:
 479
 480      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 481
 482    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 483    variable that starts at 0 and has step:
 484
 485      VF * RGC->max_nscalars_per_iter * RGC->factor
 486
 487    might overflow before hitting a value above:
 488
 489      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 490
 491    This means that we cannot guarantee that such an induction variable
 492    would ever hit a value that produces a set of all-false masks or zero
 493    lengths for RGC.
 494
 495    Note: the cost of the code generated by this function is modeled
 496    by vect_estimate_min_profitable_iters, so changes here may need
 497    corresponding changes there.  */
 498
 499 static tree
 500 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 501                                  gimple_seq *preheader_seq,
 502                                  gimple_seq *header_seq,
 503                                  gimple_stmt_iterator loop_cond_gsi,
 504                                  rgroup_controls *rgc, tree niters,
 505                                  tree niters_skip, bool might_wrap_p,
 506                                  tree *iv_step, tree *compare_step)
 507 {
 508   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 509   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 510   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 511
 512   tree ctrl_type = rgc->type;
 513   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 514   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 515   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 516   tree length_limit = NULL_TREE;
 517   /* For length, we need length_limit to ensure length in range.  */
 518   if (!use_masks_p)
 519     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 520
 521   /* Calculate the maximum number of item values that the rgroup
 522      handles in total, the number that it handles for each iteration
 523      of the vector loop, and the number that it should skip during the
 524      first iteration of the vector loop.  */
 525   tree nitems_total = niters;
 526   tree nitems_step = build_int_cst (iv_type, vf);
 527   tree nitems_skip = niters_skip;
 528   if (nitems_per_iter != 1)
 529     {
 530       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 531          these multiplications don't overflow.  */
 532       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 533       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 534       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 535                                    nitems_total, compare_factor);
 536       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 537                                   nitems_step, iv_factor);
 538       if (nitems_skip)
 539         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 540                                     nitems_skip, compare_factor);
 541     }
 542
 543   /* Create an induction variable that counts the number of items
 544      processed.  */
 545   tree index_before_incr, index_after_incr;
 546   gimple_stmt_iterator incr_gsi;
 547   bool insert_after;
 548   edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
 549   vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
 550   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
 551     {
 552       /* Create an IV that counts down from niters_total and whose step
 553          is the (variable) amount processed in the current iteration:
 554            ...
 555            _10 = (unsigned long) count_12(D);
 556            ...
 557            # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
 558            _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
 559            ...
 560            vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
 561            ...
 562            ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
 563            ...
 564            if (ivtmp_9 > POLY_INT_CST [4, 4])
 565              goto <bb 4>; [83.33%]
 566            else
 567              goto <bb 5>; [16.67%]
 568       */
 569       nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
 570       tree step = rgc->controls.length () == 1 ? rgc->controls[0]
 571                                                : make_ssa_name (iv_type);
 572       /* Create decrement IV.  */
 573       if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
 574         {
 575           create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
 576                      insert_after, &index_before_incr, &index_after_incr);
 577           tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
 578                                    index_before_incr, nitems_step);
 579           gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
 580         }
 581       else
 582         {
 583           create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
 584                      &incr_gsi, insert_after, &index_before_incr,
 585                      &index_after_incr);
 586           gimple_seq_add_stmt (header_seq,
 587                                gimple_build_assign (step, MIN_EXPR,
 588                                                     index_before_incr,
 589                                                     nitems_step));
 590         }
 591       *iv_step = step;
 592       *compare_step = nitems_step;
 593       return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
 594                                                        : index_before_incr;
 595     }
 596
 597   /* Create increment IV.  */
 598   create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
 599              loop, &incr_gsi, insert_after, &index_before_incr,
 600              &index_after_incr);
 601
 602   tree zero_index = build_int_cst (compare_type, 0);
 603   tree test_index, test_limit, first_limit;
 604   gimple_stmt_iterator *test_gsi;
 605   if (might_wrap_p)
 606     {
 607       /* In principle the loop should stop iterating once the incremented
 608          IV reaches a value greater than or equal to:
 609
 610            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 611
 612          However, there's no guarantee that this addition doesn't overflow
 613          the comparison type, or that the IV hits a value above it before
 614          wrapping around.  We therefore adjust the limit down by one
 615          IV step:
 616
 617            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 618            -[infinite-prec] NITEMS_STEP
 619
 620          and compare the IV against this limit _before_ incrementing it.
 621          Since the comparison type is unsigned, we actually want the
 622          subtraction to saturate at zero:
 623
 624            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 625            -[sat] NITEMS_STEP
 626
 627          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 628
 629            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 630
 631          where the rightmost subtraction can be done directly in
 632          COMPARE_TYPE.  */
 633       test_index = index_before_incr;
 634       tree adjust = gimple_convert (preheader_seq, compare_type,
 635                                     nitems_step);
 636       if (nitems_skip)
 637         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 638                                adjust, nitems_skip);
 639       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 640                                  nitems_total, adjust);
 641       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 642                                  test_limit, adjust);
 643       test_gsi = &incr_gsi;
 644
 645       /* Get a safe limit for the first iteration.  */
 646       if (nitems_skip)
 647         {
 648           /* The first vector iteration can handle at most NITEMS_STEP
 649              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 650              NITEMS_SKIP to that cannot overflow.  */
 651           tree const_limit = build_int_cst (compare_type,
 652                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 653                                             * nitems_per_iter);
 654           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 655                                       nitems_total, const_limit);
 656           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 657                                       first_limit, nitems_skip);
 658         }
 659       else
 660         /* For the first iteration it doesn't matter whether the IV hits
 661            a value above NITEMS_TOTAL.  That only matters for the latch
 662            condition.  */
 663         first_limit = nitems_total;
 664     }
 665   else
 666     {
 667       /* Test the incremented IV, which will always hit a value above
 668          the bound before wrapping.  */
 669       test_index = index_after_incr;
 670       test_limit = nitems_total;
 671       if (nitems_skip)
 672         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 673                                    test_limit, nitems_skip);
 674       test_gsi = &loop_cond_gsi;
 675
 676       first_limit = test_limit;
 677     }
 678
 679   /* Convert the IV value to the comparison type (either a no-op or
 680      a demotion).  */
 681   gimple_seq test_seq = NULL;
 682   test_index = gimple_convert (&test_seq, compare_type, test_index);
 683   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 684
 685   /* Provide a definition of each control in the group.  */
 686   tree next_ctrl = NULL_TREE;
 687   tree ctrl;
 688   unsigned int i;
 689   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 690     {
 691       /* Previous controls will cover BIAS items.  This control covers the
 692          next batch.  */
 693       poly_uint64 bias = nitems_per_ctrl * i;
 694       tree bias_tree = build_int_cst (compare_type, bias);
 695
 696       /* See whether the first iteration of the vector loop is known
 697          to have a full control.  */
 698       poly_uint64 const_limit;
 699       bool first_iteration_full
 700         = (poly_int_tree_p (first_limit, &const_limit)
 701            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 702
 703       /* Rather than have a new IV that starts at BIAS and goes up to
 704          TEST_LIMIT, prefer to use the same 0-based IV for each control
 705          and adjust the bound down by BIAS.  */
 706       tree this_test_limit = test_limit;
 707       if (i != 0)
 708         {
 709           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 710                                           compare_type, this_test_limit,
 711                                           bias_tree);
 712           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 713                                           compare_type, this_test_limit,
 714                                           bias_tree);
 715         }
 716
 717       /* Create the initial control.  First include all items that
 718          are within the loop limit.  */
 719       tree init_ctrl = NULL_TREE;
 720       if (!first_iteration_full)
 721         {
 722           tree start, end;
 723           if (first_limit == test_limit)
 724             {
 725               /* Use a natural test between zero (the initial IV value)
 726                  and the loop limit.  The "else" block would be valid too,
 727                  but this choice can avoid the need to load BIAS_TREE into
 728                  a register.  */
 729               start = zero_index;
 730               end = this_test_limit;
 731             }
 732           else
 733             {
 734               /* FIRST_LIMIT is the maximum number of items handled by the
 735                  first iteration of the vector loop.  Test the portion
 736                  associated with this control.  */
 737               start = bias_tree;
 738               end = first_limit;
 739             }
 740
 741           if (use_masks_p)
 742             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 743                                         start, end, "max_mask");
 744           else
 745             {
 746               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 747               gimple_seq seq = vect_gen_len (init_ctrl, start,
 748                                              end, length_limit);
 749               gimple_seq_add_seq (preheader_seq, seq);
 750             }
 751         }
 752
 753       /* Now AND out the bits that are within the number of skipped
 754          items.  */
 755       poly_uint64 const_skip;
 756       if (nitems_skip
 757           && !(poly_int_tree_p (nitems_skip, &const_skip)
 758                && known_le (const_skip, bias)))
 759         {
 760           gcc_assert (use_masks_p);
 761           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 762                                                     bias_tree, nitems_skip);
 763           if (init_ctrl)
 764             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 765                                       init_ctrl, unskipped_mask);
 766           else
 767             init_ctrl = unskipped_mask;
 768         }
 769
 770       if (!init_ctrl)
 771         {
 772           /* First iteration is full.  */
 773           if (use_masks_p)
 774             init_ctrl = build_minus_one_cst (ctrl_type);
 775           else
 776             init_ctrl = length_limit;
 777         }
 778
 779       /* Get the control value for the next iteration of the loop.  */
 780       if (use_masks_p)
 781         {
 782           gimple_seq stmts = NULL;
 783           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 784                                       this_test_limit, "next_mask");
 785           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 786         }
 787       else
 788         {
 789           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 790           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 791                                          length_limit);
 792           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 793         }
 794
 795       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 796     }
 797
 798   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 799   if (partial_load_bias != 0)
 800     {
 801       tree adjusted_len = rgc->bias_adjusted_ctrl;
 802       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
 803                                             rgc->controls[0],
 804                                             build_int_cst
 805                                             (TREE_TYPE (rgc->controls[0]),
 806                                              partial_load_bias));
 807       gimple_seq_add_stmt (header_seq, minus);
 808     }
 809
 810   return next_ctrl;
 811 }
 812
 813 /* Set up the iteration condition and rgroup controls for LOOP, given
 814    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 815    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 816    the number of iterations of the original scalar loop that should be
 817    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 818    for vect_set_loop_condition.  EXIT_EDGE is the loop exit being
        controlled: its EDGE_TRUE_VALUE flag selects the sense of the new
        comparison, and the assignment to FINAL_IV (if FINAL_IV is nonnull)
        is inserted on it.
 819
 820    Insert the branch-back condition before LOOP_COND_GSI and return the
 821    final gcond.  */
 822
 823 static gcond *
 824 vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
 825                                          loop_vec_info loop_vinfo, tree niters,
 826                                          tree final_iv, bool niters_maybe_zero,
 827                                          gimple_stmt_iterator loop_cond_gsi)
 828 {
       /* Statements accumulated here are handed to add_preheader_seq and
          add_header_seq once every rgroup has been processed.  */
 829   gimple_seq preheader_seq = NULL;
 830   gimple_seq header_seq = NULL;
 831
 832   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 833   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 834   unsigned int compare_precision = TYPE_PRECISION (compare_type);
       /* Keep the unconverted count: FINAL_IV is set from it below.  */
 835   tree orig_niters = niters;
 836
 837   /* Type of the initial value of NITERS.  */
 838   tree ni_actual_type = TREE_TYPE (niters);
 839   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
       /* Bring the skip count, if there is one, into COMPARE_TYPE.  */
 840   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 841   if (niters_skip)
 842     niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
 843
 844   /* Convert NITERS to the same size as the compare.  */
 845   if (compare_precision > ni_actual_precision
 846       && niters_maybe_zero)
 847     {
 848       /* We know that there is always at least one iteration, so if the
 849          count is zero then it must have wrapped.  Cope with this by
 850          subtracting 1 before the conversion and adding 1 to the result.  */
 851       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 852       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 853                              niters, build_minus_one_cst (ni_actual_type));
 854       niters = gimple_convert (&preheader_seq, compare_type, niters);
 855       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 856                              niters, build_one_cst (compare_type));
 857     }
 858   else
 859     niters = gimple_convert (&preheader_seq, compare_type, niters);
 860
 861   /* Iterate over all the rgroups and fill in their controls.  We could use
 862      the first control from any rgroup for the loop condition; here we
 863      arbitrarily pick the last.  */
 864   tree test_ctrl = NULL_TREE;
 865   tree iv_step = NULL_TREE;
 866   tree compare_step = NULL_TREE;
 867   rgroup_controls *rgc;
       /* The rgroup most recently passed to vect_set_loop_controls_directly,
          or null if there has not been one yet.  */
 868   rgroup_controls *iv_rgc = nullptr;
 869   unsigned int i;
       /* Masks and lengths live in different containers; pick the one that
          matches the partial-vector scheme in use.  */
 870   auto_vec<rgroup_controls> *controls = use_masks_p
 871                                           ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
 872                                           : &LOOP_VINFO_LENS (loop_vinfo);
 873   FOR_EACH_VEC_ELT (*controls, i, rgc)
 874     if (!rgc->controls.is_empty ())
 875       {
 876         /* First try using permutes.  This adds a single vector
 877            instruction to the loop for each mask, but needs no extra
 878            loop invariants or IVs.  If it succeeds there is nothing more
                to do for this rgroup.  */
 879         unsigned int nmasks = i + 1;
 880         if (use_masks_p && (nmasks & 1) == 0)
 881           {
 882             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 883             if (!half_rgc->controls.is_empty ()
 884                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 885               continue;
 886           }
 887
             /* Set the controls up directly unless a decrementing IV is in
                use and the rgroup that was last set up directly has the same
                max_nscalars_per_iter * factor as this one.  */
 888         if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 889             || !iv_rgc
 890             || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
 891                 != rgc->max_nscalars_per_iter * rgc->factor))
 892           {
 893             /* See whether zero-based IV would ever generate all-false masks
 894                or zero length before wrapping around.  */
 895             bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 896
 897             /* Set up all controls for this group.  */
 898             test_ctrl
 899               = vect_set_loop_controls_directly (loop, loop_vinfo,
 900                                                  &preheader_seq, &header_seq,
 901                                                  loop_cond_gsi, rgc, niters,
 902                                                  niters_skip, might_wrap_p,
 903                                                  &iv_step, &compare_step);
 904
 905             iv_rgc = rgc;
 906           }
 907
 908         if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 909             && rgc->controls.length () > 1)
 910           {
 911             /* vect_set_loop_controls_directly creates an IV whose step
 912                is equal to the expected sum of RGC->controls.  Use that
 913                information to populate RGC->controls.  */
 914             tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 915             gcc_assert (iv_step);
 916             vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
 917           }
 918       }
 919
 920   /* Emit all accumulated statements.  */
 921   add_preheader_seq (loop, preheader_seq);
 922   add_header_seq (loop, header_seq);
 923
 924   /* Get a boolean result that tells us whether to iterate.  */
 925   gcond *cond_stmt;
 926   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 927       && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
 928     {
           /* Leave the loop once TEST_CTRL <= COMPARE_STEP, i.e. keep
              iterating while TEST_CTRL > COMPARE_STEP.  */
 929       gcc_assert (compare_step);
 930       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
 931       cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
 932                                      NULL_TREE);
 933     }
 934   else
 935     {
           /* Keep iterating while TEST_CTRL is nonzero.  */
 936       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 937       tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 938       cond_stmt
 939         = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
 940     }
 941   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 942
 943   /* The loop iterates (NITERS - 1) / VF + 1 times.
 944      Subtract one from this to get the latch count.  */
 945   tree step = build_int_cst (compare_type,
 946                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 947   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 948                                        build_minus_one_cst (compare_type));
 949   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 950                                      niters_minus_one, step);
 951
 952   if (final_iv)
 953     {
 954       gassign *assign;
 955       /* If vectorizing an inverted early break loop we have to restart the
 956          scalar loop at niters - vf.  This matches what we do in
 957          vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
 958       if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
 959         {
 960           tree ftype = TREE_TYPE (orig_niters);
 961           tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 962           assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
 963         }
 964        else
             /* Otherwise FINAL_IV is simply the original iteration count.  */
 965         assign = gimple_build_assign (final_iv, orig_niters);
 966       gsi_insert_on_edge_immediate (exit_edge, assign);
 967     }
 968
 969   return cond_stmt;
 970 }
 971
 972 /* Set up the iteration condition and rgroup controls for LOOP in AVX512
 973    style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
 974    vectorized loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 975    the number of iterations of the original scalar loop that should be
 976    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 977    for vect_set_loop_condition.  EXIT_EDGE is the loop exit being
        controlled: its EDGE_TRUE_VALUE flag selects the sense of the new
        comparison, and the assignment to FINAL_IV (if FINAL_IV is nonnull)
        is inserted on it.
 978
 979    Insert the branch-back condition before LOOP_COND_GSI and return the
 980    final gcond.  */
 981
 982 static gcond *
 983 vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
 984                                          edge exit_edge,
 985                                          loop_vec_info loop_vinfo, tree niters,
 986                                          tree final_iv,
 987                                          bool niters_maybe_zero,
 988                                          gimple_stmt_iterator loop_cond_gsi)
 989 {
 990   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 991   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 992   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* Keep the unconverted count: the latch count and FINAL_IV are
          derived from it below.  */
 993   tree orig_niters = niters;
 994   gimple_seq preheader_seq = NULL;
 995
 996   /* Create an IV that counts down from niters and whose step
 997      is the number of iterations processed in the current iteration.
 998      Produce the controls with compares like the following.
 999
1000        # iv_2 = PHI <niters, iv_3>
1001        rem_4 = MIN <iv_2, VF>;
1002        remv_6 = { rem_4, rem_4, rem_4, ... }
1003        mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
1004        iv_3 = iv_2 - VF;
1005        if (iv_2 > VF)
1006          continue;
1007
1008      Where the constant is built with elements at most VF - 1 and
1009      repetitions according to max_nscalars_per_iter which is guaranteed
1010      to be the same within a group.  */
1011
1012   /* Convert NITERS to the determined IV type.  */
1013   if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1014       && niters_maybe_zero)
1015     {
1016       /* We know that there is always at least one iteration, so if the
1017          count is zero then it must have wrapped.  Cope with this by
1018          subtracting 1 before the conversion and adding 1 to the result.  */
1019       gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1020       niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1021                              niters, build_minus_one_cst (TREE_TYPE (niters)));
1022       niters = gimple_convert (&preheader_seq, iv_type, niters);
1023       niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1024                              niters, build_one_cst (iv_type));
1025     }
1026   else
1027     niters = gimple_convert (&preheader_seq, iv_type, niters);
1028
1029   /* Bias the initial value of the IV in case we need to skip iterations
1030      at the beginning.  */
1031   tree niters_adj = niters;
1032   if (niters_skip)
1033     {
1034       tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1035       niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1036                                  iv_type, niters, skip);
1037     }
1038
1039   /* The iteration step is the vectorization factor.  */
1040   tree iv_step = build_int_cst (iv_type, vf);
1041
1042   /* Create the decrement IV.  */
1043   tree index_before_incr, index_after_incr;
1044   gimple_stmt_iterator incr_gsi;
1045   bool insert_after;
1046   vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1047   create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1048              &incr_gsi, insert_after, &index_before_incr,
1049              &index_after_incr);
1050
1051   /* Iterate over all the rgroups and fill in their controls.  */
1052   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1053     {
1054       if (rgc.controls.is_empty ())
1055         continue;
1056
1057       tree ctrl_type = rgc.type;
1058       poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1059
           /* Vector type in which the comparison producing the mask is
              carried out.  */
1060       tree vectype = rgc.compare_type;
1061
1062       /* index_after_incr is the IV specifying the remaining iterations in
1063          the next iteration.  */
1064       tree rem = index_after_incr;
1065       /* When the data type for the compare to produce the mask is
1066          smaller than the IV type we need to saturate.  Saturate to
1067          the smallest possible value (IV_TYPE) so we only have to
1068          saturate once (CSE will catch redundant ones we add).  */
1069       if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1070         rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1071                             UNKNOWN_LOCATION,
1072                             MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1073       rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1074                             UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1075
1076       /* Build a data vector composed of the remaining iterations.  */
1077       rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078                                           UNKNOWN_LOCATION, vectype, rem);
1079
1080       /* Provide a definition of each vector in the control group.  */
1081       tree next_ctrl = NULL_TREE;
           /* Splat of the initial remaining count, built lazily and shared
              by all controls of this rgroup that need it.  */
1082       tree first_rem = NULL_TREE;
1083       tree ctrl;
1084       unsigned int i;
1085       FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1086         {
1087           /* Previous controls will cover BIAS items.  This control covers the
1088              next batch.  */
1089           poly_uint64 bias = nitems_per_ctrl * i;
1090
1091           /* Build the constant to compare the remaining iters against,
1092              this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... }
                  appropriately
1093              split into pieces.  */
1094           unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1095           tree_vector_builder builder (vectype, n, 1);
1096           for (unsigned i = 0; i < n; ++i)
1097             {
1098               unsigned HOST_WIDE_INT val
1099                 = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1100               gcc_assert (val < vf.to_constant ());
1101               builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1102             }
1103           tree cmp_series = builder.build ();
1104
1105           /* Create the initial control.  First include all items that
1106              are within the loop limit.  */
1107           tree init_ctrl = NULL_TREE;
1108           poly_uint64 const_limit;
1109           /* See whether the first iteration of the vector loop is known
1110              to have a full control.  */
1111           if (poly_int_tree_p (niters, &const_limit)
1112               && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1113             init_ctrl = build_minus_one_cst (ctrl_type);
1114           else
1115             {
1116               /* The remaining work items initially are niters.  Saturate,
1117                  splat and compare.  */
1118               if (!first_rem)
1119                 {
1120                   first_rem = niters;
1121                   if (TYPE_PRECISION (TREE_TYPE (vectype))
1122                       < TYPE_PRECISION (iv_type))
1123                     first_rem = gimple_build (&preheader_seq,
1124                                               MIN_EXPR, TREE_TYPE (first_rem),
1125                                               first_rem, iv_step);
1126                   first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1127                                               first_rem);
1128                   first_rem = gimple_build_vector_from_val (&preheader_seq,
1129                                                             vectype, first_rem);
1130                 }
1131               init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1132                                         cmp_series, first_rem);
1133             }
1134
1135           /* Now AND out the bits that are within the number of skipped
1136              items.  */
1137           poly_uint64 const_skip;
1138           if (niters_skip
1139               && !(poly_int_tree_p (niters_skip, &const_skip)
1140                    && known_le (const_skip, bias)))
1141             {
1142               /* For integer mode masks it's cheaper to shift out the bits
1143                  since that avoids loading a constant.  */
1144               gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1145               init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1146                                         lang_hooks.types.type_for_mode
1147                                           (TYPE_MODE (ctrl_type), 1),
1148                                         init_ctrl);
1149               /* ???  But when the shift amount isn't constant this requires
1150                  a round-trip to GPRs.  We could apply the bias to either
1151                  side of the compare instead.  */
1152               tree shift = gimple_build (&preheader_seq, MULT_EXPR,
1153                                          TREE_TYPE (niters_skip), niters_skip,
1154                                          build_int_cst (TREE_TYPE (niters_skip),
1155                                                         rgc.max_nscalars_per_iter));
1156               init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1157                                         TREE_TYPE (init_ctrl),
1158                                         init_ctrl, shift);
1159               init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1160                                         ctrl_type, init_ctrl);
1161             }
1162
1163           /* Get the control value for the next iteration of the loop.  */
1164           next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1165                                     UNKNOWN_LOCATION,
1166                                     LT_EXPR, ctrl_type, cmp_series, rem);
1167
1168           vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1169         }
1170     }
1171
1172   /* Emit all accumulated statements.  */
1173   add_preheader_seq (loop, preheader_seq);
1174
1175   /* Adjust the exit test using the decrementing IV.  */
1176   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1177   /* When we peel for alignment with niter_skip != 0 this can
1178      cause niter + niter_skip to wrap and since we are comparing the
1179      value before the decrement here we get a false early exit.
1180      We can't compare the value after decrement either because that
1181      decrement could wrap as well as we're not doing a saturating
1182      decrement.  To avoid this situation we force a larger
1183      iv_type.  */
1184   gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1185                                         NULL_TREE, NULL_TREE);
1186   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1187
1188   /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1189      Subtract one from this to get the latch count.  NITERS - 1 is
          computed in the type of ORIG_NITERS and only then converted to
          IV_TYPE; the skip has to be added to that converted value so that
          both operands of the PLUS_EXPR really have type IV_TYPE.  */
1190   tree niters_minus_one
1191     = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1192                    build_minus_one_cst (TREE_TYPE (orig_niters)));
1193   tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1194   if (niters_skip)
1195     niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_adj2,
1196                                fold_convert (iv_type, niters_skip));
1197   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1198                                      niters_adj2, iv_step);
1199
1200   if (final_iv)
1201     {
1202       gassign *assign;
1203       /* If vectorizing an inverted early break loop we have to restart the
1204          scalar loop at niters - vf.  This matches what we do in
1205          vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
1206       if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1207         {
1208           tree ftype = TREE_TYPE (orig_niters);
1209           tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1210           assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1211         }
1212        else
             /* Otherwise FINAL_IV is simply the original iteration count.  */
1213         assign = gimple_build_assign (final_iv, orig_niters);
1214       gsi_insert_on_edge_immediate (exit_edge, assign);
1215     }
1216
1217   return cond_stmt;
1218 }
1219
1220
1221 /* Like vect_set_loop_condition, but handle the case in which the vector
1222    loop handles exactly VF scalars per iteration.

        EXIT_EDGE is the exit of LOOP whose condition is being replaced; its
        EDGE_TRUE_VALUE flag selects the sense of the new comparison.  The
        first (loop_vinfo) parameter is unnamed and unused.  The incoming
        LOOP_COND_GSI is not used either: it is recomputed from the existing
        exit condition of EXIT_EDGE.  The new condition is inserted before
        the old one and returned; LOOP->nb_iterations is updated to the new
        latch count.  */
1223
1224 static gcond *
1225 vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1226                                 class loop *loop, tree niters, tree step,
1227                                 tree final_iv, bool niters_maybe_zero,
1228                                 gimple_stmt_iterator loop_cond_gsi)
1229 {
1230   tree indx_before_incr, indx_after_incr;
1231   gcond *cond_stmt;
1232   gcond *orig_cond;
1233   edge pe = loop_preheader_edge (loop);
1234   gimple_stmt_iterator incr_gsi;
1235   bool insert_after;
1236   enum tree_code code;
1237   tree niters_type = TREE_TYPE (niters);
1238
       /* Locate the existing exit test; the new one is inserted before it.  */
1239   orig_cond = get_loop_exit_condition (exit_edge);
1240   gcc_assert (orig_cond);
1241   loop_cond_gsi = gsi_for_stmt (orig_cond);
1242
       /* INIT is the start value of the new IV and LIMIT the value its
          incremented form is compared against.  */
1243   tree init, limit;
1244   if (!niters_maybe_zero && integer_onep (step))
1245     {
1246       /* In this case we can use a simple 0-based IV:
1247
1248          A:
1249            x = 0;
1250            do
1251              {
1252                ...
1253                x += 1;
1254              }
1255            while (x < NITERS);  */
1256       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1257       init = build_zero_cst (niters_type);
1258       limit = niters;
1259     }
1260   else
1261     {
1262       /* The following works for all values of NITERS except 0:
1263
1264          B:
1265            x = 0;
1266            do
1267              {
1268                ...
1269                x += STEP;
1270              }
1271            while (x <= NITERS - STEP);
1272
1273          so that the loop continues to iterate if x + STEP - 1 < NITERS
1274          but stops if x + STEP - 1 >= NITERS.
1275
1276          However, if NITERS is zero, x never hits a value above NITERS - STEP
1277          before wrapping around.  There are two obvious ways of dealing with
1278          this:
1279
1280          - start at STEP - 1 and compare x before incrementing it
1281          - start at -1 and compare x after incrementing it
1282
1283          The latter is simpler and is what we use.  The loop in this case
1284          looks like:
1285
1286          C:
1287            x = -1;
1288            do
1289              {
1290                ...
1291                x += STEP;
1292              }
1293            while (x < NITERS - STEP);
1294
1295          In both cases the loop limit is NITERS - STEP.  */
1296       gimple_seq seq = NULL;
1297       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1298       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1299       if (seq)
1300         {
               /* The limit computation goes on the preheader edge, which must
                  not need splitting to take it.  */
1301           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1302           gcc_assert (!new_bb);
1303         }
1304       if (niters_maybe_zero)
1305         {
1306           /* Case C.  */
1307           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1308           init = build_all_ones_cst (niters_type);
1309         }
1310       else
1311         {
1312           /* Case B.  */
1313           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1314           init = build_zero_cst (niters_type);
1315         }
1316     }
1317
       /* Create the IV and compare its incremented value against LIMIT.  */
1318   vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1319   create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1320              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1321   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1322                                               true, NULL_TREE, true,
1323                                               GSI_SAME_STMT);
1324   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1325                                      true, GSI_SAME_STMT);
1326
1327   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1328                                  NULL_TREE);
1329
1330   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1331
1332   /* Record the number of latch iterations.  */
1333   if (limit == niters)
1334     /* Case A: the loop iterates NITERS times.  Subtract one to get the
1335        latch count.  */
1336     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1337                                        build_int_cst (niters_type, 1));
1338   else
1339     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1340        Subtract one from this to get the latch count.  */
1341     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1342                                        limit, step);
1343
1344   if (final_iv)
1345     {
1346       gassign *assign;
1347       gcc_assert (single_pred_p (exit_edge->dest));
           /* With a zero INIT the PHI result can be FINAL_IV itself;
              otherwise go through a fresh name and subtract INIT below.  */
1348       tree phi_dest
1349         = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1350       /* Make sure to maintain LC SSA form here and elide the subtraction
1351          if the value is zero.  */
1352       gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1353       add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1354       if (!integer_zerop (init))
1355         {
1356           assign = gimple_build_assign (final_iv, MINUS_EXPR,
1357                                         phi_dest, init);
1358           gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1359           gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1360         }
1361     }
1362
1363   return cond_stmt;
1364 }
1365
1366 /* If we're using fully-masked loops, make LOOP iterate:
1367
1368       N == (NITERS - 1) / STEP + 1
1369
1370    times.  When NITERS is zero, this is equivalent to making the loop
1371    execute (1 << M) / STEP times, where M is the precision of NITERS.
1372    NITERS_MAYBE_ZERO is true if this last case might occur.
1373
1374    If we're not using fully-masked loops, make LOOP iterate:
1375
1376       N == (NITERS - STEP) / STEP + 1
1377
1378    times, where NITERS is known to be outside the range [1, STEP - 1].
1379    This is equivalent to making the loop execute NITERS / STEP times
1380    when NITERS is nonzero and (1 << M) / STEP times otherwise.
1381    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1382
1383    If FINAL_IV is nonnull, it is an SSA name that should be set to
1384    N * STEP on exit from the loop.
1385
        LOOP_E is the exit edge of LOOP whose condition is replaced.
        LOOP_VINFO may be null; in that case the scheme for loops without
        partial vectors is used and the old condition is removed directly
        through its statement iterator.

1386    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
1387
1388 void
1389 vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1390                          tree niters, tree step, tree final_iv,
1391                          bool niters_maybe_zero)
1392 {
1393   gcond *cond_stmt;
       /* The existing exit test; the new condition is inserted before it.  */
1394   gcond *orig_cond = get_loop_exit_condition (loop_e);
1395   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1396
       /* Dispatch on the kind of loop control: AVX512-style partial vectors,
          other partial vectors, or none.  */
1397   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1398     {
1399       if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1400         cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1401                                                                     loop_vinfo,
1402                                                                     niters, final_iv,
1403                                                                     niters_maybe_zero,
1404                                                                     loop_cond_gsi);
1405       else
1406         cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1407                                                              loop_vinfo,
1408                                                              niters, final_iv,
1409                                                              niters_maybe_zero,
1410                                                              loop_cond_gsi);
1411     }
1412   else
1413     cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1414                                                 niters,
1415                                                 step, final_iv,
1416                                                 niters_maybe_zero,
1417                                                 loop_cond_gsi);
1418
1419   /* Remove old loop exit test.  If LOOP_VINFO has a record for it, remove
          it through LOOP_VINFO; otherwise remove the statement directly.  */
1420   stmt_vec_info orig_cond_info;
1421   if (loop_vinfo
1422       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1423     loop_vinfo->remove_stmt (orig_cond_info);
1424   else
1425     gsi_remove (&loop_cond_gsi, true);
1426
1427   if (dump_enabled_p ())
1428     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1429                      (gimple *) cond_stmt);
1430 }
1431
1432 /* Given LOOP this function generates a new copy of it and puts it
1433    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1434    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1435    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1436    entry or exit of LOOP.  If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1437    continuation.  This is correct for cases where one loop continues from the
1438    other like in the vectorizer, but not true for uses in e.g. loop distribution
1439    where the contents of the loop body are split but the iteration space of both
1440    copies remains the same.
1441
1442    If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
1443    dominators were updated during the peeling.  When doing early break vectorization
1444    then LOOP_VINFO needs to be provided and is used to keep track of any newly created
1445    memory references that need to be updated should we decide to vectorize.  */
1446
1447 class loop *
1448 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1449                                         class loop *scalar_loop,
1450                                         edge scalar_exit, edge e, edge *new_e,
1451                                         bool flow_loops,
1452                                         vec<basic_block> *updated_doms)
1453 {
1454   class loop *new_loop;
1455   basic_block *new_bbs, *bbs, *pbbs;
1456   bool at_exit;
1457   bool was_imm_dom;
1458   basic_block exit_dest;
1459   edge exit, new_exit;
1460   bool duplicate_outer_loop = false;
1461
1462   exit = loop_exit;
1463   at_exit = (e == exit);
1464   if (!at_exit && e != loop_preheader_edge (loop))
1465     return NULL;
1466
1467   if (scalar_loop == NULL)
1468     {
1469       scalar_loop = loop;
1470       scalar_exit = loop_exit;
1471     }
1472   else if (scalar_loop == loop)
1473     scalar_exit = loop_exit;
1474   else
1475     {
1476       /* Loop has been version, match exits up using the aux index.  */
1477       for (edge exit : get_loop_exit_edges (scalar_loop))
1478         if (exit->aux == loop_exit->aux)
1479           {
1480             scalar_exit = exit;
1481             break;
1482           }
1483
1484       gcc_assert (scalar_exit);
1485     }
1486
1487   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1488   pbbs = bbs + 1;
1489   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1490   /* Allow duplication of outer loops.  */
1491   if (scalar_loop->inner)
1492     duplicate_outer_loop = true;
1493
1494   /* Generate new loop structure.  */
1495   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1496   duplicate_subloops (scalar_loop, new_loop);
1497
1498   exit_dest = exit->dest;
1499   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1500                                           exit_dest) == loop->header ?
1501                  true : false);
1502
1503   /* Also copy the pre-header, this avoids jumping through hoops to
1504      duplicate the loop entry PHI arguments.  Create an empty
1505      pre-header unconditionally for this.  */
1506   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1507   edge entry_e = single_pred_edge (preheader);
1508   bbs[0] = preheader;
1509   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1510
1511   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1512             &scalar_exit, 1, &new_exit, NULL,
1513             at_exit ? loop->latch : e->src, true);
1514   exit = loop_exit;
1515   basic_block new_preheader = new_bbs[0];
1516
1517   gcc_assert (new_exit);
1518
1519   /* Record the new loop exit information.  new_loop doesn't have SCEV data and
1520      so we must initialize the exit information.  */
1521   if (new_e)
1522     *new_e = new_exit;
1523
1524   /* Before installing PHI arguments make sure that the edges
1525      into them match that of the scalar loop we analyzed.  This
1526      makes sure the SLP tree matches up between the main vectorized
1527      loop and the epilogue vectorized copies.  */
1528   if (single_succ_edge (preheader)->dest_idx
1529       != single_succ_edge (new_bbs[0])->dest_idx)
1530     {
1531       basic_block swap_bb = new_bbs[1];
1532       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1533       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1534       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1535       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1536     }
1537   if (duplicate_outer_loop)
1538     {
1539       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1540       if (loop_preheader_edge (scalar_loop)->dest_idx
1541           != loop_preheader_edge (new_inner_loop)->dest_idx)
1542         {
1543           basic_block swap_bb = new_inner_loop->header;
1544           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1545           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1546           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1547           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1548         }
1549     }
1550
1551   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1552
1553   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1554   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1555     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1556
1557   /* Rename the exit uses.  */
1558   for (edge exit : get_loop_exit_edges (new_loop))
1559     for (auto gsi = gsi_start_phis (exit->dest);
1560          !gsi_end_p (gsi); gsi_next (&gsi))
1561       {
1562         tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1563         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1564         if (MAY_HAVE_DEBUG_BIND_STMTS)
1565           adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1566       }
1567
1568   auto loop_exits = get_loop_exit_edges (loop);
1569   bool multiple_exits_p = loop_exits.length () > 1;
1570   auto_vec<basic_block> doms;
1571   class loop *update_loop = NULL;
1572
1573   if (at_exit) /* Add the loop copy at exit.  */
1574     {
1575       if (scalar_loop != loop && new_exit->dest != exit_dest)
1576         {
1577           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1578           flush_pending_stmts (new_exit);
1579         }
1580
1581       bool need_virtual_phi = get_virtual_phi (loop->header);
1582
1583       /* For the main loop exit preserve the LC PHI nodes.  For vectorization
1584          we need them to continue or finalize reductions.  Since we do not
1585          copy the loop exit blocks we have to materialize PHIs at the
1586          new destination before redirecting edges.  */
1587       for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1588            !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1589         {
1590           tree res = gimple_phi_result (*gsi_from);
1591           create_phi_node (copy_ssa_name (res), new_preheader);
1592         }
1593       edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1594       gcc_assert (e == loop_exit);
1595       flush_pending_stmts (loop_exit);
1596       set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1597
1598       bool multiple_exits_p = loop_exits.length () > 1;
1599       basic_block main_loop_exit_block = new_preheader;
1600       basic_block alt_loop_exit_block = NULL;
1601       /* Create the CFG for multiple exits.
1602                    | loop_exit               | alt1   | altN
1603                    v                         v   ...  v
1604             main_loop_exit_block:       alt_loop_exit_block:
1605                    |                      /
1606                    v                     v
1607             new_preheader:
1608          where in the new preheader we need merge PHIs for
1609          the continuation values into the epilogue header.
1610          Do not bother with exit PHIs for the early exits but
1611          their live virtual operand.  We'll fix up things below.  */
1612       if (multiple_exits_p)
1613         {
1614           edge loop_e = single_succ_edge (new_preheader);
1615           new_preheader = split_edge (loop_e);
1616
1617           gphi *vphi = NULL;
1618           alt_loop_exit_block = new_preheader;
1619           for (auto exit : loop_exits)
1620             if (exit != loop_exit)
1621               {
1622                 tree vphi_def = NULL_TREE;
1623                 if (gphi *evphi = get_virtual_phi (exit->dest))
1624                   vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1625                 edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1626                 gcc_assert (res == exit);
1627                 redirect_edge_var_map_clear (exit);
1628                 if (alt_loop_exit_block == new_preheader)
1629                   alt_loop_exit_block = split_edge (exit);
1630                 if (!need_virtual_phi)
1631                   continue;
1632                 if (vphi_def)
1633                   {
1634                     if (!vphi)
1635                       vphi = create_phi_node (copy_ssa_name (vphi_def),
1636                                               alt_loop_exit_block);
1637                     else
1638                       /* Edge redirection might re-allocate the PHI node
1639                          so we have to rediscover it.  */
1640                       vphi = get_virtual_phi (alt_loop_exit_block);
1641                     add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1642                   }
1643               }
1644
1645           set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1646                                    loop->header);
1647         }
1648
1649       /* Adjust the epilog loop PHI entry values to continue iteration.
1650          This adds remaining necessary LC PHI nodes to the main exit
1651          and creates merge PHIs when we have multiple exits with
1652          their appropriate continuation.  */
1653       if (flow_loops)
1654         {
1655           edge loop_entry = single_succ_edge (new_preheader);
1656           bool peeled_iters = single_pred (loop->latch) != loop_exit->src;
1657
1658           /* Record the new SSA names in the cache so that we can skip
1659              materializing them again when we fill in the rest of the LC SSA
1660              variables.  */
1661           hash_map <tree, tree> new_phi_args;
1662           for (auto psi = gsi_start_phis (main_loop_exit_block);
1663                !gsi_end_p (psi); gsi_next (&psi))
1664             {
1665               gphi *phi = *psi;
1666               tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1667               if (TREE_CODE (new_arg) != SSA_NAME)
1668                 continue;
1669
1670               /* If the loop doesn't have a virtual def then only possibly keep
1671                  the epilog LC PHI for it and avoid creating new defs.  */
1672               if (virtual_operand_p (new_arg) && !need_virtual_phi)
1673                 {
1674                   auto gsi = gsi_for_stmt (phi);
1675                   remove_phi_node (&gsi, true);
1676                   continue;
1677                 }
1678
1679               /* If we decided not to remove the PHI node we should also not
1680                  rematerialize it later on.  */
1681               new_phi_args.put (new_arg, gimple_phi_result (phi));
1682             }
1683
1684           /* Create the merge PHI nodes in new_preheader and populate the
1685              arguments for the exits.  */
1686           if (multiple_exits_p)
1687             {
1688               for (auto gsi_from = gsi_start_phis (loop->header),
1689                    gsi_to = gsi_start_phis (new_loop->header);
1690                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1691                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1692                 {
1693                   gimple *from_phi = gsi_stmt (gsi_from);
1694                   gimple *to_phi = gsi_stmt (gsi_to);
1695
1696                   /* When the vector loop is peeled then we need to use the
1697                      value at start of the loop, otherwise the main loop exit
1698                      should use the final iter value.  */
1699                   tree new_arg;
1700                   if (peeled_iters)
1701                     new_arg = gimple_phi_result (from_phi);
1702                   else
1703                     new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1704                                                      loop_latch_edge (loop));
1705
1706                   /* Check if we've already created a new phi node during edge
1707                      redirection and re-use it if so.  Otherwise create a
1708                      LC PHI node to feed the merge PHI.  */
1709                   tree *res;
1710                   if (virtual_operand_p (new_arg))
1711                     /* Use the existing virtual LC SSA from exit block.  */
1712                     new_arg = gimple_phi_result
1713                                 (get_virtual_phi (main_loop_exit_block));
1714                   else if ((res = new_phi_args.get (new_arg)))
1715                     new_arg = *res;
1716                   else
1717                     {
1718                       /* Create the LC PHI node for the exit.  */
1719                       tree new_def = copy_ssa_name (new_arg);
1720                       gphi *lc_phi
1721                           = create_phi_node (new_def, main_loop_exit_block);
1722                       SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1723                       new_arg = new_def;
1724                     }
1725
1726                   /* Create the PHI node in the merge block merging the
1727                      main and early exit values.  */
1728                   tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1729                   gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1730                   edge main_e = single_succ_edge (main_loop_exit_block);
1731                   SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1732
1733                   /* And adjust the epilog entry value.  */
1734                   adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1735                 }
1736
1737               /* After creating the merge PHIs handle the early exits those
1738                  should use the values at the start of the loop.  */
1739               for (auto gsi_from = gsi_start_phis (loop->header),
1740                    gsi_to = gsi_start_phis (new_preheader);
1741                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1742                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1743                 {
1744                   gimple *from_phi = gsi_stmt (gsi_from);
1745                   gimple *to_phi = gsi_stmt (gsi_to);
1746
1747                   /* Now update the virtual PHI nodes with the right value.  */
1748                   tree alt_arg = gimple_phi_result (from_phi);
1749                   if (virtual_operand_p (alt_arg))
1750                     {
1751                       gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1752                       /* ???  When the exit yields to a path without
1753                          any virtual use we can miss a LC PHI for the
1754                          live virtual operand.  Simply choosing the
1755                          one live at the start of the loop header isn't
1756                          correct, but we should get here only with
1757                          early-exit vectorization which will move all
1758                          defs after the main exit, so leave a temporarily
1759                          wrong virtual operand in place.  This happens
1760                          for gcc.c-torture/execute/20150611-1.c  */
1761                       if (vphi)
1762                         alt_arg = gimple_phi_result (vphi);
1763                     }
1764                   /* For other live args we didn't create LC PHI nodes.
1765                      Do so here.  */
1766                   else
1767                     {
1768                       tree alt_def = copy_ssa_name (alt_arg);
1769                       gphi *lc_phi
1770                         = create_phi_node (alt_def, alt_loop_exit_block);
1771                       for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1772                            ++i)
1773                         SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1774                       alt_arg = alt_def;
1775                     }
1776                   edge alt_e = single_succ_edge (alt_loop_exit_block);
1777                   SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1778                 }
1779             }
1780           /* For the single exit case only create the missing LC PHI nodes
1781              for the continuation of the loop IVs that are not also already
1782              reductions and thus had LC PHI nodes on the exit already.  */
1783           else
1784             {
1785               for (auto gsi_from = gsi_start_phis (loop->header),
1786                    gsi_to = gsi_start_phis (new_loop->header);
1787                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1788                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1789                 {
1790                   gimple *from_phi = gsi_stmt (gsi_from);
1791                   gimple *to_phi = gsi_stmt (gsi_to);
1792                   tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1793                                                         loop_latch_edge (loop));
1794
1795                   /* Check if we've already created a new phi node during edge
1796                      redirection.  If we have, only propagate the value
1797                      downwards.  */
1798                   if (tree *res = new_phi_args.get (new_arg))
1799                     {
1800                       adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1801                       continue;
1802                     }
1803
1804                   tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1805                   gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1806                   SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1807                   adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1808                 }
1809             }
1810         }
1811
1812       if (was_imm_dom || duplicate_outer_loop)
1813         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1814
1815       /* And remove the non-necessary forwarder again.  Keep the other
1816          one so we have a proper pre-header for the loop at the exit edge.  */
1817       redirect_edge_pred (single_succ_edge (preheader),
1818                           single_pred (preheader));
1819       delete_basic_block (preheader);
1820       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1821                                loop_preheader_edge (scalar_loop)->src);
1822
1823       /* Finally after wiring the new epilogue we need to update its main exit
1824          to the original function exit we recorded.  Other exits are already
1825          correct.  */
1826       if (multiple_exits_p)
1827         {
1828           update_loop = new_loop;
1829           doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1830           for (unsigned i = 0; i < doms.length (); ++i)
1831             if (flow_bb_inside_loop_p (loop, doms[i]))
1832               doms.unordered_remove (i);
1833         }
1834     }
1835   else /* Add the copy at entry.  */
1836     {
1837       /* Copy the current loop LC PHI nodes between the original loop exit
1838          block and the new loop header.  This allows us to later split the
1839          preheader block and still find the right LC nodes.  */
1840       if (flow_loops)
1841         for (auto gsi_from = gsi_start_phis (new_loop->header),
1842              gsi_to = gsi_start_phis (loop->header);
1843              !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1844              gsi_next (&gsi_from), gsi_next (&gsi_to))
1845           {
1846             gimple *from_phi = gsi_stmt (gsi_from);
1847             gimple *to_phi = gsi_stmt (gsi_to);
1848             tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1849                                                   loop_latch_edge (new_loop));
1850             adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1851                                         new_arg);
1852           }
1853
1854       if (scalar_loop != loop)
1855         {
1856           /* Remove the non-necessary forwarder of scalar_loop again.  */
1857           redirect_edge_pred (single_succ_edge (preheader),
1858                               single_pred (preheader));
1859           delete_basic_block (preheader);
1860           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1861                                    loop_preheader_edge (scalar_loop)->src);
1862           preheader = split_edge (loop_preheader_edge (loop));
1863           entry_e = single_pred_edge (preheader);
1864         }
1865
1866       redirect_edge_and_branch_force (entry_e, new_preheader);
1867       flush_pending_stmts (entry_e);
1868       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1869
1870       redirect_edge_and_branch_force (new_exit, preheader);
1871       flush_pending_stmts (new_exit);
1872       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1873
1874       /* And remove the non-necessary forwarder again.  Keep the other
1875          one so we have a proper pre-header for the loop at the exit edge.  */
1876       redirect_edge_pred (single_succ_edge (new_preheader),
1877                           single_pred (new_preheader));
1878       delete_basic_block (new_preheader);
1879       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1880                                loop_preheader_edge (new_loop)->src);
1881
1882       if (multiple_exits_p)
1883         update_loop = loop;
1884     }
1885
1886   if (multiple_exits_p)
1887     {
1888       for (edge e : get_loop_exit_edges (update_loop))
1889         {
1890           edge ex;
1891           edge_iterator ei;
1892           FOR_EACH_EDGE (ex, ei, e->dest->succs)
1893             {
1894               /* Find the first non-fallthrough block as fall-throughs can't
1895                  dominate other blocks.  */
1896               if (single_succ_p (ex->dest))
1897                 {
1898                   doms.safe_push (ex->dest);
1899                   ex = single_succ_edge (ex->dest);
1900                 }
1901               doms.safe_push (ex->dest);
1902             }
1903           doms.safe_push (e->dest);
1904         }
1905
1906       iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1907       if (updated_doms)
1908         updated_doms->safe_splice (doms);
1909     }
1910
1911   free (new_bbs);
1912   free (bbs);
1913
1914   checking_verify_dominators (CDI_DOMINATORS);
1915
1916   return new_loop;
1917 }
1918
1919
1920 /* Given the condition expression COND, put it as the last statement of
1921    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1922    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1923    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1924    new edge as irreducible if IRREDUCIBLE_P is true.  */
1925
1926 static edge
1927 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1928                        basic_block guard_to, basic_block dom_bb,
1929                        profile_probability probability, bool irreducible_p)
1930 {
1931   gimple_stmt_iterator gsi;
1932   edge new_e, enter_e;
1933   gcond *cond_stmt;
1934   gimple_seq gimplify_stmt_list = NULL;
1935
1936   enter_e = EDGE_SUCC (guard_bb, 0);
1937   enter_e->flags &= ~EDGE_FALLTHRU;
1938   enter_e->flags |= EDGE_FALSE_VALUE;
1939   gsi = gsi_last_bb (guard_bb);
1940
1941   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1942                                  is_gimple_condexpr_for_cond, NULL_TREE);
1943   if (gimplify_stmt_list)
1944     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1945
1946   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1947   gsi = gsi_last_bb (guard_bb);
1948   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1949
1950   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1951   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1952
1953   new_e->probability = probability;
1954   if (irreducible_p)
1955     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1956
1957   enter_e->probability = probability.invert ();
1958   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1959
1960   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1961   if (enter_e->dest->loop_father->header == enter_e->dest)
1962     split_edge (enter_e);
1963
1964   return new_e;
1965 }
1966
1967
1968 /* This function verifies that the following restrictions apply to LOOP:
1969    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1970        for innermost loop and 5 basic blocks for outer-loop.
1971    (2) it is single entry, single exit
1972    (3) its exit condition is the last stmt in the header
1973    (4) E is the entry/exit edge of LOOP.
1974  */
1975
1976 bool
1977 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
1978                              const_edge e)
1979 {
1980   edge entry_e = loop_preheader_edge (loop);
1981   gcond *orig_cond = get_loop_exit_condition (exit_e);
1982   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1983
1984   /* All loops have an outer scope; the only case loop->outer is NULL is for
1985      the function itself.  */
1986   if (!loop_outer (loop)
1987       || !empty_block_p (loop->latch)
1988       || !exit_e
1989       /* Verify that new loop exit condition can be trivially modified.  */
1990       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1991       || (e != exit_e && e != entry_e))
1992     return false;
1993
1994   basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1995   get_loop_body_with_size (loop, bbs, loop->num_nodes);
1996   bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1997   free (bbs);
1998   return ret;
1999 }
2000
2001 /* Function find_loop_location.
2002
2003    Extract the location of the loop in the source code.
2004    If the loop is not well formed for vectorization, an estimated
2005    location is calculated.
2006    Return the loop location if succeed and NULL if not.  */
2007
2008 dump_user_location_t
2009 find_loop_location (class loop *loop)
2010 {
2011   gimple *stmt = NULL;
2012   basic_block bb;
2013   gimple_stmt_iterator si;
2014
2015   if (!loop)
2016     return dump_user_location_t ();
2017
2018   /* For the root of the loop tree return the function location.  */
2019   if (!loop_outer (loop))
2020     return dump_user_location_t::from_function_decl (cfun->decl);
2021
2022   if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2023     {
2024       /* We only care about the loop location, so use any exit with location
2025          information.  */
2026       for (edge e : get_loop_exit_edges (loop))
2027         {
2028           stmt = get_loop_exit_condition (e);
2029
2030           if (stmt
2031               && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2032             return stmt;
2033         }
2034     }
2035
2036   /* If we got here the loop is probably not "well formed",
2037      try to estimate the loop location */
2038
2039   if (!loop->header)
2040     return dump_user_location_t ();
2041
2042   bb = loop->header;
2043
2044   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2045     {
2046       stmt = gsi_stmt (si);
2047       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2048         return stmt;
2049     }
2050
2051   return dump_user_location_t ();
2052 }
2053
2054 /* Return true if the phi described by STMT_INFO defines an IV of the
2055    loop to be vectorized.  */
2056
2057 static bool
2058 iv_phi_p (stmt_vec_info stmt_info)
2059 {
2060   gphi *phi = as_a <gphi *> (stmt_info->stmt);
2061   if (virtual_operand_p (PHI_RESULT (phi)))
2062     return false;
2063
2064   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2065       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2066     return false;
2067
2068   return true;
2069 }
2070
2071 /* Return true if vectorizer can peel for nonlinear iv.  */
2072 static bool
2073 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2074                               stmt_vec_info stmt_info)
2075 {
2076   enum vect_induction_op_type induction_type
2077     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2078   tree niters_skip;
2079   /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2080      if niters or vf is unkown:
2081      For shift, when shift mount >= precision, there would be UD.
2082      For mult, don't known how to generate
2083      init_expr * pow (step, niters) for variable niters.
2084      For neg, it should be ok, since niters of vectorized main loop
2085      will always be multiple of 2.  */
2086   if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2087        || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
2088       && induction_type != vect_step_op_neg)
2089     {
2090       if (dump_enabled_p ())
2091         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2092                          "Peeling for epilogue is not supported"
2093                          " for nonlinear induction except neg"
2094                          " when iteration count is unknown.\n");
2095       return false;
2096     }
2097
2098   /* Avoid compile time hog on vect_peel_nonlinear_iv_init.  */
2099   if (induction_type == vect_step_op_mul)
2100     {
2101       tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2102       tree type = TREE_TYPE (step_expr);
2103
2104       if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2105           && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2106         {
2107           if (dump_enabled_p ())
2108             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2109                              "Avoid compile time hog on"
2110                              " vect_peel_nonlinear_iv_init"
2111                              " for nonlinear induction vec_step_op_mul"
2112                              " when iteration count is too big.\n");
2113           return false;
2114         }
2115     }
2116
2117   /* Also doens't support peel for neg when niter is variable.
2118      ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
2119   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2120   if ((niters_skip != NULL_TREE
2121        && (TREE_CODE (niters_skip) != INTEGER_CST
2122            || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2123       || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2124           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2125     {
2126       if (dump_enabled_p ())
2127         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128                          "Peeling for alignement is not supported"
2129                          " for nonlinear induction when niters_skip"
2130                          " is not constant.\n");
2131       return false;
2132     }
2133
2134   /* We can't support partial vectors and early breaks with an induction
2135      type other than add or neg since we require the epilog and can't
2136      perform the peeling.  The below condition mirrors that of
2137      vect_gen_vector_loop_niters  where niters_vector_mult_vf_var then sets
2138      step_vector to VF rather than 1.  This is what creates the nonlinear
2139      IV.  PR113163.  */
2140   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2141       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2142       && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2143       && induction_type != vect_step_op_neg)
2144     {
2145       if (dump_enabled_p ())
2146         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                          "Peeling for epilogue is not supported"
2148                          " for nonlinear induction except neg"
2149                          " when VF is known and early breaks.\n");
2150       return false;
2151     }
2152
2153   return true;
2154 }
2155
2156 /* Function vect_can_advance_ivs_p
2157
2158    In case the number of iterations that LOOP iterates is unknown at compile
2159    time, an epilog loop will be generated, and the loop induction variables
2160    (IVs) will be "advanced" to the value they are supposed to take just before
2161    the epilog loop.  Here we check that the access function of the loop IVs
2162    and the expression that represents the loop bound are simple enough.
2163    These restrictions will be relaxed in the future.  */
2164
2165 bool
2166 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2167 {
2168   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2169   basic_block bb = loop->header;
2170   gphi_iterator gsi;
2171
2172   /* Analyze phi functions of the loop header.  */
2173
2174   if (dump_enabled_p ())
2175     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2176   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2177     {
2178       tree evolution_part;
2179       enum vect_induction_op_type induction_type;
2180
2181       gphi *phi = gsi.phi ();
2182       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2183       if (dump_enabled_p ())
2184         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2185                          phi_info->stmt);
2186
2187       /* Skip virtual phi's. The data dependences that are associated with
2188          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2189
2190          Skip reduction phis.  */
2191       if (!iv_phi_p (phi_info))
2192         {
2193           if (dump_enabled_p ())
2194             dump_printf_loc (MSG_NOTE, vect_location,
2195                              "reduc or virtual phi. skip.\n");
2196           continue;
2197         }
2198
2199       induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2200       if (induction_type != vect_step_op_add)
2201         {
2202           if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2203             return false;
2204
2205           continue;
2206         }
2207
2208       /* Analyze the evolution function.  */
2209
2210       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2211       if (evolution_part == NULL_TREE)
2212         {
2213           if (dump_enabled_p ())
2214             dump_printf (MSG_MISSED_OPTIMIZATION,
2215                          "No access function or evolution.\n");
2216           return false;
2217         }
2218
2219       /* FORNOW: We do not transform initial conditions of IVs
2220          which evolution functions are not invariants in the loop.  */
2221
2222       if (!expr_invariant_in_loop_p (loop, evolution_part))
2223         {
2224           if (dump_enabled_p ())
2225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226                              "evolution not invariant in loop.\n");
2227           return false;
2228         }
2229
2230       /* FORNOW: We do not transform initial conditions of IVs
2231          which evolution functions are a polynomial of degree >= 2.  */
2232
2233       if (tree_is_chrec (evolution_part))
2234         {
2235           if (dump_enabled_p ())
2236             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2237                              "evolution is chrec.\n");
2238           return false;
2239         }
2240     }
2241
2242   return true;
2243 }
2244
2245
2246 /*   Function vect_update_ivs_after_vectorizer.
2247
2248      "Advance" the induction variables of LOOP to the value they should take
2249      after the execution of LOOP.  This is currently necessary because the
2250      vectorizer does not handle induction variables that are used after the
2251      loop.  Such a situation occurs when the last iterations of LOOP are
2252      peeled, because:
2253      1. We introduced new uses after LOOP for IVs that were not originally used
2254         after LOOP: the IVs of LOOP are now used by an epilog loop.
2255      2. LOOP is going to be vectorized; this means that it will iterate N/VF
2256         times, whereas the loop IVs should be bumped N times.
2257
2258      Input:
2259      - LOOP - a loop that is going to be vectorized. The last few iterations
2260               of LOOP were peeled.
2261      - NITERS - the number of iterations that LOOP executes (before it is
2262                 vectorized). i.e, the number of times the ivs should be bumped.
2263      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2264                   coming out from LOOP on which there are uses of the LOOP ivs
2265                   (this is the path from LOOP->exit to epilog_loop->preheader).
2266
2267                   The new definitions of the ivs are placed in LOOP->exit.
2268                   The phi args associated with the edge UPDATE_E in the bb
2269                   UPDATE_E->dest are updated accordingly.
2270
2271      Assumption 1: Like the rest of the vectorizer, this function assumes
2272      a single loop exit that has a single predecessor.
2273
2274      Assumption 2: The phi nodes in the LOOP header and in update_bb are
2275      organized in the same order.
2276
2277      Assumption 3: The access function of the ivs is simple enough (see
2278      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
2279
2280      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2281      coming out of LOOP on which the ivs of LOOP are used (this is the path
2282      that leads to the epilog loop; other paths skip the epilog loop).  This
2283      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2284      needs to have its phis updated.
2285  */
2286
2287 static void
2288 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2289                                   tree niters, edge update_e)
2290 {
2291   gphi_iterator gsi, gsi1;
2292   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2293   basic_block update_bb = update_e->dest;
2294   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2295   gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2296
2297   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2298        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2299        gsi_next (&gsi), gsi_next (&gsi1))
2300     {
2301       tree init_expr;
2302       tree step_expr, off;
2303       tree type;
2304       tree var, ni, ni_name;
2305
2306       gphi *phi = gsi.phi ();
2307       gphi *phi1 = gsi1.phi ();
2308       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2309       if (dump_enabled_p ())
2310         dump_printf_loc (MSG_NOTE, vect_location,
2311                          "vect_update_ivs_after_vectorizer: phi: %G",
2312                          (gimple *) phi);
2313
2314       /* Skip reduction and virtual phis.  */
2315       if (!iv_phi_p (phi_info))
2316         {
2317           if (dump_enabled_p ())
2318             dump_printf_loc (MSG_NOTE, vect_location,
2319                              "reduc or virtual phi. skip.\n");
2320           continue;
2321         }
2322
2323       type = TREE_TYPE (gimple_phi_result (phi));
2324       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2325       step_expr = unshare_expr (step_expr);
2326
2327       /* FORNOW: We do not support IVs whose evolution function is a polynomial
2328          of degree >= 2 or exponential.  */
2329       gcc_assert (!tree_is_chrec (step_expr));
2330
2331       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2332       gimple_seq stmts = NULL;
2333       enum vect_induction_op_type induction_type
2334         = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2335
2336       if (induction_type == vect_step_op_add)
2337         {
2338           tree stype = TREE_TYPE (step_expr);
2339           off = fold_build2 (MULT_EXPR, stype,
2340                                fold_convert (stype, niters), step_expr);
2341
2342           if (POINTER_TYPE_P (type))
2343             ni = fold_build_pointer_plus (init_expr, off);
2344           else
2345             ni = fold_convert (type,
2346                                fold_build2 (PLUS_EXPR, stype,
2347                                             fold_convert (stype, init_expr),
2348                                             off));
2349         }
2350       /* Don't bother call vect_peel_nonlinear_iv_init.  */
2351       else if (induction_type == vect_step_op_neg)
2352         ni = init_expr;
2353       else
2354         ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2355                                           niters, step_expr,
2356                                           induction_type);
2357
2358       var = create_tmp_var (type, "tmp");
2359
2360       gimple_seq new_stmts = NULL;
2361       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2362
2363       /* Exit_bb shouldn't be empty.  */
2364       if (!gsi_end_p (last_gsi))
2365         {
2366           gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2367           gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2368         }
2369       else
2370         {
2371           gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2372           gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2373         }
2374
2375       /* Fix phi expressions in the successor bb.  */
2376       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2377     }
2378 }
2379
2380 /* Return a gimple value containing the misalignment (measured in vector
2381    elements) for the loop described by LOOP_VINFO, i.e. how many elements
2382    it is away from a perfectly aligned address.  Add any new statements
2383    to SEQ.  */
2384
2385 static tree
2386 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2387 {
2388   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2389   stmt_vec_info stmt_info = dr_info->stmt;
2390   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2391
2392   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2393   unsigned HOST_WIDE_INT target_align_c;
2394   tree target_align_minus_1;
2395
2396   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2397                                         size_zero_node) < 0;
2398   tree offset = (negative
2399                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2400                              * TREE_INT_CST_LOW
2401                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2402                  : size_zero_node);
2403   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2404                                                           stmt_info, seq,
2405                                                           offset);
2406   tree type = unsigned_type_for (TREE_TYPE (start_addr));
2407   if (target_align.is_constant (&target_align_c))
2408     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2409   else
2410     {
2411       tree vla = build_int_cst (type, target_align);
2412       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2413                                     fold_build2 (MINUS_EXPR, type,
2414                                                  build_int_cst (type, 0), vla));
2415       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2416                                           build_int_cst (type, 1));
2417     }
2418
2419   HOST_WIDE_INT elem_size
2420     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2421   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2422
2423   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
2424   tree int_start_addr = fold_convert (type, start_addr);
2425   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2426                                         target_align_minus_1);
2427
2428   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
2429   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2430                                         elem_size_log);
2431
2432   return misalign_in_elems;
2433 }
2434
2435 /* Function vect_gen_prolog_loop_niters
2436
2437    Generate the number of iterations which should be peeled as prolog for the
2438    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
2439    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2440    As a result, after the execution of this loop, the data reference DR will
2441    refer to an aligned location.  The following computation is generated:
2442
2443    If the misalignment of DR is known at compile time:
2444      addr_mis = int mis = DR_MISALIGNMENT (dr);
2445    Else, compute address misalignment in bytes:
2446      addr_mis = addr & (target_align - 1)
2447
2448    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2449
2450    (elem_size = element type size; an element is the scalar element whose type
2451    is the inner type of the vectype)
2452
2453    The computations will be emitted at the end of BB.  We also compute and
2454    store upper bound (included) of the result in BOUND.
2455
2456    When the step of the data-ref in the loop is not 1 (as in interleaved data
2457    and SLP), the number of iterations of the prolog must be divided by the step
2458    (which is equal to the size of interleaved group).
2459
2460    The above formulas assume that VF == number of elements in the vector. This
2461    may not hold when there are multiple-types in the loop.
2462    In this case, for some data-references in the loop the VF does not represent
2463    the number of elements that fit in the vector.  Therefore, instead of VF we
2464    use TYPE_VECTOR_SUBPARTS.  */
2465
2466 static tree
2467 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2468                              basic_block bb, int *bound)
2469 {
2470   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2471   tree var;
2472   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2473   gimple_seq stmts = NULL, new_stmts = NULL;
2474   tree iters, iters_name;
2475   stmt_vec_info stmt_info = dr_info->stmt;
2476   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2477   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2478
2479   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2480     {
2481       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2482
2483       if (dump_enabled_p ())
2484         dump_printf_loc (MSG_NOTE, vect_location,
2485                          "known peeling = %d.\n", npeel);
2486
2487       iters = build_int_cst (niters_type, npeel);
2488       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2489     }
2490   else
2491     {
2492       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2493       tree type = TREE_TYPE (misalign_in_elems);
2494       HOST_WIDE_INT elem_size
2495         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2496       /* We only do prolog peeling if the target alignment is known at compile
2497          time.  */
2498       poly_uint64 align_in_elems =
2499         exact_div (target_align, elem_size);
2500       tree align_in_elems_minus_1 =
2501         build_int_cst (type, align_in_elems - 1);
2502       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2503
2504       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
2505                                  & (align_in_elems - 1)).  */
2506       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2507                                             size_zero_node) < 0;
2508       if (negative)
2509         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2510                              align_in_elems_tree);
2511       else
2512         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2513                              misalign_in_elems);
2514       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2515       iters = fold_convert (niters_type, iters);
2516       unsigned HOST_WIDE_INT align_in_elems_c;
2517       if (align_in_elems.is_constant (&align_in_elems_c))
2518         *bound = align_in_elems_c - 1;
2519       else
2520         *bound = -1;
2521     }
2522
2523   if (dump_enabled_p ())
2524     dump_printf_loc (MSG_NOTE, vect_location,
2525                      "niters for prolog loop: %T\n", iters);
2526
2527   var = create_tmp_var (niters_type, "prolog_loop_niters");
2528   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2529
2530   if (new_stmts)
2531     gimple_seq_add_seq (&stmts, new_stmts);
2532   if (stmts)
2533     {
2534       gcc_assert (single_succ_p (bb));
2535       gimple_stmt_iterator gsi = gsi_last_bb (bb);
2536       if (gsi_end_p (gsi))
2537         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2538       else
2539         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2540     }
2541   return iters_name;
2542 }
2543
2544
2545 /* Function vect_update_init_of_dr
2546
2547    If CODE is PLUS, the vector loop starts NITERS iterations after the
2548    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2549    iterations before the scalar one (using masking to skip inactive
2550    elements).  This function updates the information recorded in DR to
2551    account for the difference.  Specifically, it updates the OFFSET
2552    field of DR_INFO.  */
2553
2554 static void
2555 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2556 {
2557   struct data_reference *dr = dr_info->dr;
2558   tree offset = dr_info->offset;
2559   if (!offset)
2560     offset = build_zero_cst (sizetype);
2561
2562   niters = fold_build2 (MULT_EXPR, sizetype,
2563                         fold_convert (sizetype, niters),
2564                         fold_convert (sizetype, DR_STEP (dr)));
2565   offset = fold_build2 (code, sizetype,
2566                         fold_convert (sizetype, offset), niters);
2567   dr_info->offset = offset;
2568 }
2569
2570
2571 /* Function vect_update_inits_of_drs
2572
2573    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
2574    CODE and NITERS are as for vect_update_inits_of_dr.  */
2575
2576 void
2577 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2578                           tree_code code)
2579 {
2580   unsigned int i;
2581   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2582   struct data_reference *dr;
2583
2584   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2585
2586   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
2587      here, but since we might use these niters to update the epilogues niters
2588      and data references we can't insert them here as this definition might not
2589      always dominate its uses.  */
2590   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2591     niters = fold_convert (sizetype, niters);
2592
2593   FOR_EACH_VEC_ELT (datarefs, i, dr)
2594     {
2595       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2596       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2597           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2598         vect_update_init_of_dr (dr_info, niters, code);
2599     }
2600 }
2601
2602 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2603    by masking.  This involves calculating the number of iterations to
2604    be peeled and then aligning all memory references appropriately.  */
2605
2606 void
2607 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2608 {
2609   tree misalign_in_elems;
2610   tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2611
2612   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2613
2614   /* From the information recorded in LOOP_VINFO get the number of iterations
2615      that need to be skipped via masking.  */
2616   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2617     {
2618       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2619                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2620       misalign_in_elems = build_int_cst (type, misalign);
2621     }
2622   else
2623     {
2624       gimple_seq seq1 = NULL, seq2 = NULL;
2625       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2626       misalign_in_elems = fold_convert (type, misalign_in_elems);
2627       misalign_in_elems = force_gimple_operand (misalign_in_elems,
2628                                                 &seq2, true, NULL_TREE);
2629       gimple_seq_add_seq (&seq1, seq2);
2630       if (seq1)
2631         {
2632           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2633           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2634           gcc_assert (!new_bb);
2635         }
2636     }
2637
2638   if (dump_enabled_p ())
2639     dump_printf_loc (MSG_NOTE, vect_location,
2640                      "misalignment for fully-masked loop: %T\n",
2641                      misalign_in_elems);
2642
2643   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2644
2645   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2646 }
2647
2648 /* This function builds ni_name = number of iterations.  Statements
2649    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
2650    it to TRUE if new ssa_var is generated.  */
2651
2652 tree
2653 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2654 {
2655   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2656   if (TREE_CODE (ni) == INTEGER_CST)
2657     return ni;
2658   else
2659     {
2660       tree ni_name, var;
2661       gimple_seq stmts = NULL;
2662       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2663
2664       var = create_tmp_var (TREE_TYPE (ni), "niters");
2665       ni_name = force_gimple_operand (ni, &stmts, false, var);
2666       if (stmts)
2667         {
2668           gsi_insert_seq_on_edge_immediate (pe, stmts);
2669           if (new_var_p != NULL)
2670             *new_var_p = true;
2671         }
2672
2673       return ni_name;
2674     }
2675 }
2676
2677 /* Calculate the number of iterations above which vectorized loop will be
2678    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
2679    of prolog loop.  If it's integer const, the integer number is also passed
2680    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
2681    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
2682    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
2683    threshold below which the scalar (rather than vectorized) loop will be
2684    executed.  This function stores the upper bound (inclusive) of the result
2685    in BOUND_SCALAR.  */
2686
2687 static tree
2688 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2689                              int bound_prolog, poly_int64 bound_epilog, int th,
2690                              poly_uint64 *bound_scalar,
2691                              bool check_profitability)
2692 {
2693   tree type = TREE_TYPE (niters_prolog);
2694   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2695                              build_int_cst (type, bound_epilog));
2696
2697   *bound_scalar = bound_prolog + bound_epilog;
2698   if (check_profitability)
2699     {
2700       /* TH indicates the minimum niters of vectorized loop, while we
2701          compute the maximum niters of scalar loop.  */
2702       th--;
2703       /* Peeling for constant times.  */
2704       if (int_niters_prolog >= 0)
2705         {
2706           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2707           return build_int_cst (type, *bound_scalar);
2708         }
2709       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
2710          and BOUND_EPILOG are inclusive upper bounds.  */
2711       if (known_ge (th, bound_prolog + bound_epilog))
2712         {
2713           *bound_scalar = th;
2714           return build_int_cst (type, th);
2715         }
2716       /* Need to do runtime comparison.  */
2717       else if (maybe_gt (th, bound_epilog))
2718         {
2719           *bound_scalar = upper_bound (*bound_scalar, th);
2720           return fold_build2 (MAX_EXPR, type,
2721                               build_int_cst (type, th), niters);
2722         }
2723     }
2724   return niters;
2725 }
2726
2727 /* NITERS is the number of times that the original scalar loop executes
2728    after peeling.  Work out the maximum number of iterations N that can
2729    be handled by the vectorized form of the loop and then either:
2730
2731    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2732
2733         niters_vector = N
2734
2735    b) set *STEP_VECTOR_PTR to one and generate:
2736
2737         niters_vector = N / vf
2738
2739    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2740    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
2741    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
2742
2743 void
2744 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2745                              tree *niters_vector_ptr, tree *step_vector_ptr,
2746                              bool niters_no_overflow)
2747 {
2748   tree ni_minus_gap, var;
2749   tree niters_vector, step_vector, type = TREE_TYPE (niters);
2750   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2751   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2752   tree log_vf = NULL_TREE;
2753
2754   /* If epilogue loop is required because of data accesses with gaps, we
2755      subtract one iteration from the total number of iterations here for
2756      correct calculation of RATIO.  */
2757   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2758     {
2759       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2760                                   build_one_cst (type));
2761       if (!is_gimple_val (ni_minus_gap))
2762         {
2763           var = create_tmp_var (type, "ni_gap");
2764           gimple *stmts = NULL;
2765           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2766                                                true, var);
2767           gsi_insert_seq_on_edge_immediate (pe, stmts);
2768         }
2769     }
2770   else
2771     ni_minus_gap = niters;
2772
2773   /* To silence some unexpected warnings, simply initialize to 0. */
2774   unsigned HOST_WIDE_INT const_vf = 0;
2775   if (vf.is_constant (&const_vf)
2776       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2777     {
2778       /* Create: niters >> log2(vf) */
2779       /* If it's known that niters == number of latch executions + 1 doesn't
2780          overflow, we can generate niters >> log2(vf); otherwise we generate
2781          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2782          will be at least one.  */
2783       log_vf = build_int_cst (type, exact_log2 (const_vf));
2784       if (niters_no_overflow)
2785         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2786       else
2787         niters_vector
2788           = fold_build2 (PLUS_EXPR, type,
2789                          fold_build2 (RSHIFT_EXPR, type,
2790                                       fold_build2 (MINUS_EXPR, type,
2791                                                    ni_minus_gap,
2792                                                    build_int_cst (type, vf)),
2793                                       log_vf),
2794                          build_int_cst (type, 1));
2795       step_vector = build_one_cst (type);
2796     }
2797   else
2798     {
2799       niters_vector = ni_minus_gap;
2800       step_vector = build_int_cst (type, vf);
2801     }
2802
2803   if (!is_gimple_val (niters_vector))
2804     {
2805       var = create_tmp_var (type, "bnd");
2806       gimple_seq stmts = NULL;
2807       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2808       gsi_insert_seq_on_edge_immediate (pe, stmts);
2809       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2810          we set range information to make niters analyzer's life easier.
2811          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2812          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2813       if (stmts != NULL && log_vf)
2814         {
2815           if (niters_no_overflow)
2816             {
2817               value_range vr (type,
2818                               wi::one (TYPE_PRECISION (type)),
2819                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2820                                                          TYPE_SIGN (type)),
2821                                           exact_log2 (const_vf),
2822                                           TYPE_SIGN (type)));
2823               set_range_info (niters_vector, vr);
2824             }
2825           /* For VF == 1 the vector IV might also overflow so we cannot
2826              assert a minimum value of 1.  */
2827           else if (const_vf > 1)
2828             {
2829               value_range vr (type,
2830                               wi::one (TYPE_PRECISION (type)),
2831                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2832                                                          TYPE_SIGN (type))
2833                                           - (const_vf - 1),
2834                                           exact_log2 (const_vf), TYPE_SIGN (type))
2835                               + 1);
2836               set_range_info (niters_vector, vr);
2837             }
2838         }
2839     }
2840   *niters_vector_ptr = niters_vector;
2841   *step_vector_ptr = step_vector;
2842
2843   return;
2844 }
2845
2846 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2847    loop specified by LOOP_VINFO after vectorization, compute the number
2848    of iterations before vectorization (niters_vector * vf) and store it
2849    to NITERS_VECTOR_MULT_VF_PTR.  */
2850
2851 static void
2852 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2853                                      tree niters_vector,
2854                                      tree *niters_vector_mult_vf_ptr)
2855 {
2856   /* We should be using a step_vector of VF if VF is variable.  */
2857   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2858   tree type = TREE_TYPE (niters_vector);
2859   tree log_vf = build_int_cst (type, exact_log2 (vf));
2860   tree tree_vf = build_int_cst (type, vf);
2861   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2862
2863   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2864   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2865                                             niters_vector, log_vf);
2866
2867   /* If we've peeled a vector iteration then subtract one full vector
2868      iteration.  */
2869   if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2870     niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
2871                                          niters_vector_mult_vf, tree_vf);
2872
2873   if (!is_gimple_val (niters_vector_mult_vf))
2874     {
2875       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2876       gimple_seq stmts = NULL;
2877       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2878                                                     &stmts, true, var);
2879       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2880       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2881     }
2882   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2883 }
2884
2885 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2886    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2887    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2888    appear like below:
2889
2890        guard_bb:
2891          if (cond)
2892            goto merge_bb;
2893          else
2894            goto skip_loop;
2895
2896      skip_loop:
2897        header_a:
2898          i_1 = PHI<i_0, i_2>;
2899          ...
2900          i_2 = i_1 + 1;
2901          if (cond_a)
2902            goto latch_a;
2903          else
2904            goto exit_a;
2905        latch_a:
2906          goto header_a;
2907
2908        exit_a:
2909          i_5 = PHI<i_2>;
2910
2911        merge_bb:
2912          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2913
2914      update_loop:
2915        header_b:
2916          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2917          ...
2918          i_4 = i_3 + 1;
2919          if (cond_b)
2920            goto latch_b;
2921          else
2922            goto exit_bb;
2923        latch_b:
2924          goto header_b;
2925
2926        exit_bb:
2927
2928    This function creates PHI nodes at merge_bb and replaces the use of i_5
2929    in the update_loop's PHI node with the result of the new PHI.  */
2930
2931 static void
2932 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2933                                     class loop *update_loop,
2934                                     edge guard_edge, edge merge_edge)
2935 {
2936   edge skip_entry = loop_preheader_edge (skip_loop);
2937   edge update_entry = loop_preheader_edge (update_loop);
2938   gphi_iterator skip_gsi = gsi_start_phis (skip_loop->header);
2939   gphi_iterator update_gsi = gsi_start_phis (update_loop->header);
2940   /* Pair up the header PHIs of both loops; stop once either runs out.  */
2941   while (!gsi_end_p (skip_gsi) && !gsi_end_p (update_gsi))
2942     {
2943       gphi *skip_phi = skip_gsi.phi ();
2944       gphi *update_phi = update_gsi.phi ();
2945
2946       /* The merge PHI is created in the destination of GUARD_EDGE.  */
2947       tree merged = copy_ssa_name (PHI_RESULT (skip_phi));
2948       gphi *merge_phi = create_phi_node (merged, guard_edge->dest);
2949
2950       /* MERGE_EDGE gets UPDATE_PHI's preheader arg, GUARD_EDGE SKIP_PHI's.  */
2951       tree from_merge = PHI_ARG_DEF_FROM_EDGE (update_phi, update_entry);
2952       location_t merge_loc
2953         = gimple_phi_arg_location_from_edge (update_phi, update_entry);
2954       add_phi_arg (merge_phi, from_merge, merge_edge, merge_loc);
2955       tree from_guard = PHI_ARG_DEF_FROM_EDGE (skip_phi, skip_entry);
2956       location_t guard_loc
2957         = gimple_phi_arg_location_from_edge (skip_phi, skip_entry);
2958       add_phi_arg (merge_phi, from_guard, guard_edge, guard_loc);
2959
2960       /* Redirect UPDATE_PHI's preheader argument to the merge PHI.  */
2961       adjust_phi_and_debug_stmts (update_phi, update_entry, merged);
2962       gsi_next (&skip_gsi);
2963       gsi_next (&update_gsi);
2964     }
2965 }
2966
2967 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2968    Return a value that equals:
2969
2970    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2971    - SKIP_VALUE when the main loop is skipped.  */
2972
2973 tree
2974 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2975                            tree skip_value)
2976 {
2977   edge from_main = loop_vinfo->main_loop_edge;
2978   gcc_assert (from_main);
2979
2980   /* Merge both values with a fresh PHI in the block FROM_MAIN enters.  */
2981   tree merged = make_ssa_name (TREE_TYPE (main_loop_value));
2982   gphi *merge_phi = create_phi_node (merged, from_main->dest);
2983   add_phi_arg (merge_phi, main_loop_value, from_main, UNKNOWN_LOCATION);
2984   add_phi_arg (merge_phi, skip_value, loop_vinfo->skip_main_loop_edge,
2985                UNKNOWN_LOCATION);
2986   return merged;
2987 }
2988
2989 /* Function vect_do_peeling.
2990
2991    Input:
2992    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2993
2994        preheader:
2995      LOOP:
2996        header_bb:
2997          loop_body
2998          if (exit_loop_cond) goto exit_bb
2999          else                goto header_bb
3000        exit_bb:
3001
3002    - NITERS: The number of iterations of the loop.
3003    - NITERSM1: The number of iterations of the loop's latch.
3004    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3005    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3006                               CHECK_PROFITABILITY is true.
3007    Output:
3008    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3009      iterate after vectorization; see vect_set_loop_condition for details.
3010    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3011      should be set to the number of scalar iterations handled by the
3012      vector loop.  The SSA name is only used on exit from the loop.
3013
3014    This function peels prolog and epilog from the loop, adds guards skipping
3015    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
3016    would look like:
3017
3018        guard_bb_1:
3019          if (prefer_scalar_loop) goto merge_bb_1
3020          else                    goto guard_bb_2
3021
3022        guard_bb_2:
3023          if (skip_prolog) goto merge_bb_2
3024          else             goto prolog_preheader
3025
3026        prolog_preheader:
3027      PROLOG:
3028        prolog_header_bb:
3029          prolog_body
3030          if (exit_prolog_cond) goto prolog_exit_bb
3031          else                  goto prolog_header_bb
3032        prolog_exit_bb:
3033
3034        merge_bb_2:
3035
3036        vector_preheader:
3037      VECTOR LOOP:
3038        vector_header_bb:
3039          vector_body
3040          if (exit_vector_cond) goto vector_exit_bb
3041          else                  goto vector_header_bb
3042        vector_exit_bb:
3043
3044        guard_bb_3:
3045          if (skip_epilog) goto merge_bb_3
3046          else             goto epilog_preheader
3047
3048        merge_bb_1:
3049
3050        epilog_preheader:
3051      EPILOG:
3052        epilog_header_bb:
3053          epilog_body
3054          if (exit_epilog_cond) goto merge_bb_3
3055          else                  goto epilog_header_bb
3056
3057        merge_bb_3:
3058
3059    Note this function peels prolog and epilog only if it's necessary,
3060    as well as guards.
3061    This function returns the epilogue loop if a decision was made to vectorize
3062    it, otherwise NULL.
3063
3064    The analysis resulting in this epilogue loop's loop_vec_info was performed
3065    in the same vect_analyze_loop call as the main loop's.  At that time
3066    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3067    vectorization factors than the main loop.  This list is stored in the main
3068    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
3069    vectorize the epilogue loop for a lower vectorization factor, the
3070    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
3071    updated and linked to the epilogue loop.  This is later used to vectorize
3072    the epilogue.  The reason the loop_vec_info needs updating is that it was
3073    constructed based on the original main loop, and the epilogue loop is a
3074    copy of this loop, so all links pointing to statements in the original loop
3075    need updating.  Furthermore, these loop_vec_infos share the
3076    data_reference's records, which will also need to be updated.
3077
3078    TODO: Guard for prefer_scalar_loop should be emitted along with
3079    versioning conditions if loop versioning is needed.  */
3080
3081
3082 class loop *
3083 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3084                  tree *niters_vector, tree *step_vector,
3085                  tree *niters_vector_mult_vf_var, int th,
3086                  bool check_profitability, bool niters_no_overflow,
3087                  tree *advance)
3088 {
3089   edge e, guard_e;
3090   tree type = TREE_TYPE (niters), guard_cond;
3091   basic_block guard_bb, guard_to;
3092   profile_probability prob_prolog, prob_vector, prob_epilog;
3093   int estimated_vf;
3094   int prolog_peeling = 0;
3095   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
3096   /* We currently do not support prolog peeling if the target alignment is not
3097      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
3098      target alignment being constant.  */
3099   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3100   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
3101     return NULL;
3102
3103   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3104     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3105
3106   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3107   poly_uint64 bound_epilog = 0;
3108   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3109       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3110     bound_epilog += vf - 1;
3111   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3112     bound_epilog += 1;
3113
3114   /* For early breaks the scalar loop needs to execute at most VF times
3115      to find the element that caused the break.  */
3116   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3117     bound_epilog = vf;
3118
3119   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3120   poly_uint64 bound_scalar = bound_epilog;
3121
3122   if (!prolog_peeling && !epilog_peeling)
3123     return NULL;
3124
3125   /* Before doing any peeling make sure to reset debug binds outside of
3126      the loop referring to defs not in LC SSA.  */
3127   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3128   for (unsigned i = 0; i < loop->num_nodes; ++i)
3129     {
3130       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3131       imm_use_iterator ui;
3132       gimple *use_stmt;
3133       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3134            gsi_next (&gsi))
3135         {
3136           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3137             if (gimple_debug_bind_p (use_stmt)
3138                 && loop != gimple_bb (use_stmt)->loop_father
3139                 && !flow_loop_nested_p (loop,
3140                                         gimple_bb (use_stmt)->loop_father))
3141               {
3142                 gimple_debug_bind_reset_value (use_stmt);
3143                 update_stmt (use_stmt);
3144               }
3145         }
3146       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3147            gsi_next (&gsi))
3148         {
3149           ssa_op_iter op_iter;
3150           def_operand_p def_p;
3151           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3152             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3153               if (gimple_debug_bind_p (use_stmt)
3154                   && loop != gimple_bb (use_stmt)->loop_father
3155                   && !flow_loop_nested_p (loop,
3156                                           gimple_bb (use_stmt)->loop_father))
3157                 {
3158                   gimple_debug_bind_reset_value (use_stmt);
3159                   update_stmt (use_stmt);
3160                 }
3161         }
3162     }
3163
3164   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3165   estimated_vf = vect_vf_for_cost (loop_vinfo);
3166   if (estimated_vf == 2)
3167     estimated_vf = 3;
3168   prob_prolog = prob_epilog = profile_probability::guessed_always ()
3169                         .apply_scale (estimated_vf - 1, estimated_vf);
3170
3171   class loop *prolog, *epilog = NULL;
3172   class loop *first_loop = loop;
3173   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3174
3175   /* SSA form needs to be up-to-date since we are going to manually
3176      update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3177      update SSA state after that, so we have to make sure to not lose any
3178      pending update needs.  */
3179   gcc_assert (!need_ssa_update_p (cfun));
3180
3181   /* If we're vectorizing an epilogue loop, we have ensured that the
3182      virtual operand is in SSA form throughout the vectorized main loop.
3183      Normally it is possible to trace the updated
3184      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3185      back to scalar-stmt vuses, meaning that the effect of the SSA update
3186      remains local to the main loop.  However, there are rare cases in
3187      which the vectorized loop should have vdefs even when the original scalar
3188      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
3189      introduces clobbers of the temporary vector array, which in turn
3190      needs new vdefs.  If the scalar loop doesn't write to memory, these
3191      new vdefs will be the only ones in the vector loop.
3192      We are currently deferring updating virtual SSA form and creating
3193      of a virtual PHI for this case so we do not have to make sure the
3194      newly introduced virtual def is in LCSSA form.  */
3195
3196   if (MAY_HAVE_DEBUG_BIND_STMTS)
3197     {
3198       gcc_assert (!adjust_vec.exists ());
3199       adjust_vec.create (32);
3200     }
3201   initialize_original_copy_tables ();
3202
3203   /* Record the anchor bb at which the guard should be placed if the scalar
3204      loop might be preferred.  */
3205   basic_block anchor = loop_preheader_edge (loop)->src;
3206
3207   /* Generate the number of iterations for the prolog loop.  We do this here
3208      so that we can also get the upper bound on the number of iterations.  */
3209   tree niters_prolog;
3210   int bound_prolog = 0;
3211   if (prolog_peeling)
3212     {
3213       niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3214                                                     &bound_prolog);
3215       /* If alignment peeling is known, we will always execute prolog.  */
3216       if (TREE_CODE (niters_prolog) == INTEGER_CST)
3217         prob_prolog = profile_probability::always ();
3218     }
3219   else
3220     niters_prolog = build_int_cst (type, 0);
3221
3222   loop_vec_info epilogue_vinfo = NULL;
3223   if (vect_epilogues)
3224     {
3225       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3226       loop_vinfo->epilogue_vinfos.ordered_remove (0);
3227     }
3228
3229   tree niters_vector_mult_vf = NULL_TREE;
3230   /* Saving NITERs before the loop, as this may be changed by prologue.  */
3231   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3232   edge update_e = NULL, skip_e = NULL;
3233   unsigned int lowest_vf = constant_lower_bound (vf);
3234   /* Prolog loop may be skipped.  */
3235   bool skip_prolog = (prolog_peeling != 0);
3236   /* Skip this loop to epilog when there are not enough iterations to enter this
3237      vectorized loop.  If true we should perform runtime checks on the NITERS
3238      to check whether we should skip the current vectorized loop.  If we know
3239      the number of scalar iterations, we may choose to add a runtime check
3240      when that number may be smaller than the number of iterations
3241      required to enter this loop; to decide this we compare it against the
3242      sum of the upper bounds on the prolog and epilog peeling
3243      (BOUND_PROLOG + BOUND_EPILOG).  When we
3244      don't know the number of iterations and don't require versioning it is
3245      because we have asserted that there are enough scalar iterations to enter
3246      the main loop, so this skip is not necessary.  When we are versioning then
3247      we only add such a skip if we have chosen to vectorize the epilogue.  */
3248   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3249                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3250                                   bound_prolog + bound_epilog)
3251                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3252                          || vect_epilogues));
3253
3254   /* Epilog loop must be executed if the number of iterations for epilog
3255      loop is known at compile time, otherwise we need to add a check at
3256      the end of vector loop and skip to the end of epilog loop.  */
3257   bool skip_epilog = (prolog_peeling < 0
3258                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3259                       || !vf.is_constant ());
3260   /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3261      loop must be executed.  */
3262   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3263       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3264     skip_epilog = false;
3265
3266   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3267   auto_vec<profile_count> original_counts;
3268   basic_block *original_bbs = NULL;
3269
3270   if (skip_vector)
3271     {
3272       split_edge (loop_preheader_edge (loop));
3273
3274       if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3275         {
3276           original_bbs = get_loop_body (loop);
3277           for (unsigned int i = 0; i < loop->num_nodes; i++)
3278             original_counts.safe_push(original_bbs[i]->count);
3279         }
3280
3281       /* Due to the order in which we peel prolog and epilog, we first
3282          propagate probability to the whole loop.  The purpose is to
3283          avoid adjusting probabilities of both prolog and vector loops
3284          separately.  Note in this case, the probability of epilog loop
3285          needs to be scaled back later.  */
3286       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3287       if (prob_vector.initialized_p ())
3288         {
3289           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3290           scale_loop_profile (loop, prob_vector, -1);
3291         }
3292     }
3293
3294   if (vect_epilogues)
3295     {
3296       /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3297          use the original scalar loop as remaining epilogue if necessary.  */
3298       LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3299         = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3300       LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3301         = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3302     }
3303
3304   if (prolog_peeling)
3305     {
3306       e = loop_preheader_edge (loop);
3307       edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3308       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3309                            && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo));
3310
3311       /* Peel prolog and put it on preheader edge of loop.  */
3312       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3313       edge prolog_e = NULL;
3314       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3315                                                        scalar_loop, scalar_e,
3316                                                        e, &prolog_e);
3317       gcc_assert (prolog);
3318       prolog->force_vectorize = false;
3319
3320       first_loop = prolog;
3321       reset_original_copy_tables ();
3322
3323       /* Update the number of iterations for prolog loop.  */
3324       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3325       vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3326                                step_prolog, NULL_TREE, false);
3327
3328       /* Skip the prolog loop.  */
3329       if (skip_prolog)
3330         {
3331           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3332                                     niters_prolog, build_int_cst (type, 0));
3333           guard_bb = loop_preheader_edge (prolog)->src;
3334           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3335           guard_to = split_edge (loop_preheader_edge (loop));
3336           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3337                                            guard_to, guard_bb,
3338                                            prob_prolog.invert (),
3339                                            irred_flag);
3340           e = EDGE_PRED (guard_to, 0);
3341           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3342           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3343
3344           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3345           scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3346         }
3347
3348       /* Update init address of DRs.  */
3349       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3350       /* Update niters for vector loop.  */
3351       LOOP_VINFO_NITERS (loop_vinfo)
3352         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3353       LOOP_VINFO_NITERSM1 (loop_vinfo)
3354         = fold_build2 (MINUS_EXPR, type,
3355                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3356       bool new_var_p = false;
3357       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3358       /* It's guaranteed that vector loop bound before vectorization is at
3359          least VF, so set range information for newly generated var.  */
3360       if (new_var_p)
3361         {
3362           value_range vr (type,
3363                           wi::to_wide (build_int_cst (type, lowest_vf)),
3364                           wi::to_wide (TYPE_MAX_VALUE (type)));
3365           set_range_info (niters, vr);
3366         }
3367
3368       /* Prolog iterates at most bound_prolog times, latch iterates at
3369          most bound_prolog - 1 times.  */
3370       record_niter_bound (prolog, bound_prolog - 1, false, true);
3371       delete_update_ssa ();
3372       adjust_vec_debug_stmts ();
3373       scev_reset ();
3374     }
3375   basic_block bb_before_epilog = NULL;
3376
3377   if (epilog_peeling)
3378     {
3379       e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3380       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3381
3382       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
3383          said epilog then we should use a copy of the main loop as a starting
3384          point.  This loop may have already had some preliminary transformations
3385          to allow for more optimal vectorization, for example if-conversion.
3386          If we are not vectorizing the epilog then we should use the scalar loop
3387          as the transformations mentioned above make less or no sense when not
3388          vectorizing.  */
3389       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3390       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3391       edge epilog_e = vect_epilogues ? e : scalar_e;
3392       edge new_epilog_e = NULL;
3393       auto_vec<basic_block> doms;
3394       epilog
3395         = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3396                                                   &new_epilog_e, true, &doms);
3397
3398       LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3399       gcc_assert (epilog);
3400       gcc_assert (new_epilog_e);
3401       epilog->force_vectorize = false;
3402       bb_before_epilog = loop_preheader_edge (epilog)->src;
3403
3404       /* Scalar version loop may be preferred.  In this case, add guard
3405          and skip to epilog.  Note this only happens when the number of
3406          iterations of loop is unknown at compile time, otherwise this
3407          won't be vectorized.  */
3408       if (skip_vector)
3409         {
3410           /* Additional epilogue iteration is peeled if gap exists.  */
3411           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3412                                                 bound_prolog, bound_epilog,
3413                                                 th, &bound_scalar,
3414                                                 check_profitability);
3415           /* Build guard against NITERSM1 since NITERS may overflow.  */
3416           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3417           guard_bb = anchor;
3418           guard_to = split_edge (loop_preheader_edge (epilog));
3419           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3420                                            guard_to, guard_bb,
3421                                            prob_vector.invert (),
3422                                            irred_flag);
3423           skip_e = guard_e;
3424           e = EDGE_PRED (guard_to, 0);
3425           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3426           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3427
3428           /* Simply propagate profile info from guard_bb to guard_to which is
3429              a merge point of control flow.  */
3430           profile_count old_count = guard_to->count;
3431           guard_to->count = guard_bb->count;
3432
3433           /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3434           if (vect_epilogues || scalar_loop == NULL)
3435             {
3436               gcc_assert(epilog->num_nodes == loop->num_nodes);
3437               basic_block *bbs = get_loop_body (epilog);
3438               for (unsigned int i = 0; i < epilog->num_nodes; i++)
3439                 {
3440                   gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3441                   bbs[i]->count = original_counts[i];
3442                 }
3443               free (bbs);
3444               free (original_bbs);
3445             }
3446           else if (old_count.nonzero_p ())
3447             scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3448
3449           /* Only need to handle basic block before epilog loop if it's not
3450              the guard_bb, which is the case when skip_vector is true.  */
3451           if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3452             bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3453           bb_before_epilog = loop_preheader_edge (epilog)->src;
3454         }
3455
3456       /* If loop is peeled for non-zero constant times, now niters refers to
3457          orig_niters - prolog_peeling, it won't overflow even the orig_niters
3458          overflows.  */
3459       niters_no_overflow |= (prolog_peeling > 0);
3460       vect_gen_vector_loop_niters (loop_vinfo, niters,
3461                                    niters_vector, step_vector,
3462                                    niters_no_overflow);
3463       if (!integer_onep (*step_vector))
3464         {
3465           /* On exit from the loop we will have an easy way of calculating
3466              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
3467              until then.  */
3468           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3469           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3470           *niters_vector_mult_vf_var = niters_vector_mult_vf;
3471         }
3472       else
3473         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3474                                              &niters_vector_mult_vf);
3475       /* Update IVs of original loop as if they were advanced by
3476          niters_vector_mult_vf steps.  */
3477       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3478       update_e = skip_vector ? e : loop_preheader_edge (epilog);
3479       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3480         update_e = single_succ_edge (LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
3481
3482       /* If we have a peeled vector iteration, all exits are the same, and so
3483          the main exit needs to be treated the same as the alternative
3484          exits in that we leave their updates to vectorizable_live_operations.
3485          */
3486       if (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3487         vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3488                                           update_e);
3489
3490       /* If we have a peeled vector iteration we will never skip the epilog loop
3491          and we can simplify the cfg a lot by not doing the edge split.  */
3492       if (skip_epilog || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3493         {
3494           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3495                                     niters, niters_vector_mult_vf);
3496
3497           guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3498           edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3499           guard_to = epilog_e->dest;
3500           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3501                                            skip_vector ? anchor : guard_bb,
3502                                            prob_epilog.invert (),
3503                                            irred_flag);
3504           doms.safe_push (guard_to);
3505           if (vect_epilogues)
3506             epilogue_vinfo->skip_this_loop_edge = guard_e;
3507           edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3508           gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3509           for (gphi_iterator gsi = gsi_start_phis (guard_to);
3510                !gsi_end_p (gsi); gsi_next (&gsi))
3511             {
3512               /* We are expecting all of the PHIs we have on epilog_e
3513                  to be also on the main loop exit.  But sometimes
3514                  a stray virtual definition can appear at epilog_e
3515                  which we can then take as the same on all exits,
3516                  we've removed the LC SSA PHI on the main exit before
3517                  so we wouldn't need to create a loop PHI for it.  */
3518               if (virtual_operand_p (gimple_phi_result (*gsi))
3519                   && (gsi_end_p (gsi2)
3520                       || !virtual_operand_p (gimple_phi_result (*gsi2))))
3521                 add_phi_arg (*gsi,
3522                              gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3523                              guard_e, UNKNOWN_LOCATION);
3524               else
3525                 {
3526                   add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3527                                UNKNOWN_LOCATION);
3528                   gsi_next (&gsi2);
3529                 }
3530             }
3531
3532           /* Only need to handle basic block before epilog loop if it's not
3533              the guard_bb, which is the case when skip_vector is true.  */
3534           if (guard_bb != bb_before_epilog)
3535             {
3536               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3537
3538               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3539             }
3540           scale_loop_profile (epilog, prob_epilog, -1);
3541         }
3542
3543       /* Recalculate the dominators after adding the guard edge.  */
3544       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3545         iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3546
3547       /* When we do not have a loop-around edge to the epilog we know
3548          the vector loop covered at least VF scalar iterations unless
3549          we have early breaks.
3550          Update any known upper bound with this knowledge.  */
3551       if (! skip_vector
3552           && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3553         {
3554           if (epilog->any_upper_bound)
3555             epilog->nb_iterations_upper_bound -= lowest_vf;
3556           if (epilog->any_likely_upper_bound)
3557             epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3558           if (epilog->any_estimate)
3559             epilog->nb_iterations_estimate -= lowest_vf;
3560         }
3561
3562       unsigned HOST_WIDE_INT bound;
3563       if (bound_scalar.is_constant (&bound))
3564         {
3565           gcc_assert (bound != 0);
3566           /* Adjust the upper bound by the extra peeled vector iteration if we
3567              are an epilogue of an peeled vect loop and not VLA.  For VLA the
3568              loop bounds are unknown.  */
3569           if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3570               && vf.is_constant ())
3571             bound += vf.to_constant ();
3572           /* -1 to convert loop iterations to latch iterations.  */
3573           record_niter_bound (epilog, bound - 1, false, true);
3574           scale_loop_profile (epilog, profile_probability::always (),
3575                               bound - 1);
3576         }
3577
3578       delete_update_ssa ();
3579       adjust_vec_debug_stmts ();
3580       scev_reset ();
3581     }
3582
3583   if (vect_epilogues)
3584     {
3585       epilog->aux = epilogue_vinfo;
3586       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3587       LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3588         = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3589
3590       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3591
3592       /* We now must calculate the number of NITERS performed by the previous
3593          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3594       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3595                                  niters_prolog, niters_vector_mult_vf);
3596
3597       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3598          determine whether we are coming from the previous vectorized loop
3599          using the update_e edge or the skip_vector basic block using the
3600          skip_e edge.  */
3601       if (skip_vector)
3602         {
3603           gcc_assert (update_e != NULL && skip_e != NULL);
3604           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3605                                            update_e->dest);
3606           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3607           gimple *stmt = gimple_build_assign (new_ssa, niters);
3608           gimple_stmt_iterator gsi;
3609           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3610               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3611             {
3612               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3613               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3614             }
3615           else
3616             {
3617               gsi = gsi_last_bb (update_e->src);
3618               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3619             }
3620
3621           niters = new_ssa;
3622           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3623           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3624                        UNKNOWN_LOCATION);
3625           niters = PHI_RESULT (new_phi);
3626           epilogue_vinfo->main_loop_edge = update_e;
3627           epilogue_vinfo->skip_main_loop_edge = skip_e;
3628         }
3629
3630       /* Set ADVANCE to the number of iterations performed by the previous
3631          loop and its prologue.  */
3632       *advance = niters;
3633
3634       /* Subtract the number of iterations performed by the vectorized loop
3635          from the number of total iterations.  */
3636       tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3637                                           before_loop_niters,
3638                                           niters);
3639
3640       LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3641       LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3642         = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3643                        epilogue_niters,
3644                        build_one_cst (TREE_TYPE (epilogue_niters)));
3645
3646       /* Decide what to do if the number of epilogue iterations is not
3647          a multiple of the epilogue loop's vectorization factor.
3648          We should have rejected the loop during the analysis phase
3649          if this fails.  */
3650       bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3651       gcc_assert (res);
3652     }
3653
3654   adjust_vec.release ();
3655   free_original_copy_tables ();
3656
3657   return vect_epilogues ? epilog : NULL;
3658 }
3659
3660 /* Function vect_create_cond_for_niters_checks.
3661
3662    Fold the niter assumptions recorded for LOOP_VINFO into the versioning
3663    condition.
3664
3665    Input:
3666    LOOP_VINFO - LOOP_VINFO_NITERS_ASSUMPTIONS (LOOP_VINFO) supplies the
3667                 condition that is added.
3668    COND_EXPR  - points to the condition built so far; a null *COND_EXPR
3669                 is treated as "true".
3670
3671    Output:
3672    COND_EXPR  - *COND_EXPR is the TRUTH_AND_EXPR of its previous value and
3673                 the niter assumptions, or just the niter assumptions when
3674                 it was null on entry.  */
3675
3676 static void
3677 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3678 {
3679   tree niter_assumptions = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3680
3681   /* Nothing to combine with yet: the assumptions are the whole condition.  */
3682   if (!*cond_expr)
3683     {
3684       *cond_expr = niter_assumptions;
3685       return;
3686     }
3687
3688   *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3689                             *cond_expr, niter_assumptions);
3690 }
3691
3692 /* AND PART_COND_EXPR onto the condition stored in *COND_EXPR.  A null
3693    *COND_EXPR stands for "true", so it is simply replaced.  */
3694
3695 static void
3696 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3697 {
3698   tree prev_cond = *cond_expr;
3699   *cond_expr = (prev_cond
3700                 ? fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3701                                prev_cond, part_cond_expr)
3702                 : part_cond_expr);
3703 }
3704
3705 /* Function vect_create_cond_for_align_checks.
3706
3707    Create a conditional expression that represents the alignment checks for
3708    all of data references (array element references) whose alignment must be
3709    checked at runtime.
3710
3711    Input:
3712    COND_EXPR  - input conditional expression.  New conditions will be chained
3713                 with logical AND operation.
3714    LOOP_VINFO - two fields of the loop information are used.
3715                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3716                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3717
3718    Output:
3719    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3720                          expression.
3721    COND_EXPR  - *COND_EXPR is updated with chain_cond_expr; the function
3722                 itself returns nothing.
3723
3724    The algorithm makes two assumptions:
3725      1) The number of bytes "n" in a vector is a power of 2.
3726      2) An address "a" is aligned if a%n is zero and that this
3727         test can be done as a&(n-1) == 0.  For example, for 16
3728         byte vectors the test is a&0xf == 0.  */
3729
3730 static void
3731 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3732                                    tree *cond_expr,
3733                                    gimple_seq *cond_expr_stmt_list)
3734 {
3735   const vec<stmt_vec_info> &may_misalign_stmts
3736     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3737   stmt_vec_info stmt_info;
3738   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3739   tree mask_cst;
3740   unsigned int i;
3741   tree int_ptrsize_type;
3742   char tmp_name[20];  /* Scratch buffer for SSA temporary names.  */
3743   tree or_tmp_name = NULL_TREE;
3744   tree and_tmp_name;
3745   gimple *and_stmt;
3746   tree ptrsize_zero;
3747   tree part_cond_expr;
3748
3749   /* Check that mask is one less than a power of 2, i.e., mask is
3750      all zeros followed by all ones.  */
3751   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3752
3753   int_ptrsize_type = signed_type_for (ptr_type_node);
3754
3755   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3756      of the first vector of the i'th data reference.  */
3757
3758   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3759     {
3760       gimple_seq new_stmt_list = NULL;
3761       tree addr_base, addr_tmp_name, new_or_tmp_name;
3762       gimple *addr_stmt, *or_stmt;
3763       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3764       /* With a negative DR_STEP the address is offset by (1 - subparts)
3765          elements, each of TYPE_SIZE_UNIT of the element type.  */
3766       bool negative = tree_int_cst_compare
3767         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3768       tree offset = negative
3769         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3770                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3771         : size_zero_node;
3772
3773       /* create: addr_tmp = (int)(address_of_first_vector) */
3774       addr_base =
3775         vect_create_addr_base_for_vector_ref (loop_vinfo,
3776                                               stmt_info, &new_stmt_list,
3777                                               offset);
3778       if (new_stmt_list != NULL)
3779         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3780
3781       snprintf (tmp_name, sizeof (tmp_name), "addr2int%u", i);
3782       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3783       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3784       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3785
3786       /* The addresses are ORed together.  */
3787
3788       if (or_tmp_name != NULL_TREE)
3789         {
3790           /* create: or_tmp = or_tmp | addr_tmp */
3791           snprintf (tmp_name, sizeof (tmp_name), "orptrs%u", i);
3792           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3793           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3794                                          or_tmp_name, addr_tmp_name);
3795           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3796           or_tmp_name = new_or_tmp_name;
3797         }
3798       else
3799         or_tmp_name = addr_tmp_name;
3800
3801     } /* end for i */
3802
3803   mask_cst = build_int_cst (int_ptrsize_type, mask);
3804
3805   /* create: and_tmp = or_tmp & mask  */
3806   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3807
3808   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3809                                   or_tmp_name, mask_cst);
3810   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3811
3812   /* Make and_tmp the left operand of the conditional test against zero.
3813      if and_tmp has a nonzero bit then some address is unaligned.  */
3814   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3815   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3816                                 and_tmp_name, ptrsize_zero);
3817   chain_cond_expr (cond_expr, part_cond_expr);
3818 }
3819
3820 /* Require every object pair in LOOP_VINFO_CHECK_UNEQUAL_ADDRS to have
3821    distinct addresses.  For pairs <A1, B1>, ..., <An, Bn> a test
3822    &Ai != &Bi is built for each i and chained onto *COND_EXPR in order
3823    with chain_cond_expr, so a null *COND_EXPR is treated as "true".  */
3824
3825 static void
3826 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3827 {
3828   const vec<vec_object_pair> &addr_pairs
3829     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3830
3831   for (unsigned int idx = 0; idx < addr_pairs.length (); ++idx)
3832     {
3833       const vec_object_pair &objs = addr_pairs[idx];
3834       tree lhs_addr = build_fold_addr_expr (objs.first);
3835       tree rhs_addr = build_fold_addr_expr (objs.second);
3836
3837       chain_cond_expr (cond_expr, fold_build2 (NE_EXPR, boolean_type_node,
3838                                                lhs_addr, rhs_addr));
3839     }
3840 }
3841
3842 /* Chain onto *COND_EXPR one unsigned GE_EXPR test per entry of
3843    LOOP_VINFO_LOWER_BOUNDS, so the result holds only if all of them do.  */
3844
3845 static void
3846 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3847 {
3848   const vec<vec_lower_bound> &bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3849   unsigned int idx;
3850   vec_lower_bound *lb;
3851   FOR_EACH_VEC_ELT (bounds, idx, lb)
3852     {
3853       tree utype = unsigned_type_for (TREE_TYPE (lb->expr));
3854       tree value = fold_convert (utype, lb->expr);
3855       poly_uint64 limit = lb->min_value;
3856       if (!lb->unsigned_p)  /* Bias VALUE by LIMIT - 1 first.  */
3857         {
3858           tree bias = build_int_cstu (utype, limit - 1);
3859           value = fold_build2 (PLUS_EXPR, utype, value, bias);
3860           limit += limit - 1;  /* I.e. 2 * LIMIT - 1.  */
3861         }
3862       tree limit_cst = build_int_cstu (utype, limit);
3863       tree test = fold_build2 (GE_EXPR, boolean_type_node, value, limit_cst);
3864       chain_cond_expr (cond_expr, test);
3865     }
3866 }
3867
3868 /* Function vect_create_cond_for_alias_checks.
3869
3870    Add the run-time alias checks for LOOP_VINFO to the versioning
3871    condition.
3872
3873    Input:
3874    LOOP_VINFO - LOOP_VINFO_COMP_ALIAS_DDRS (LOOP_VINFO) is the list of
3875                 dr_with_seg_len_pair_t entries to check;
3876                 LOOP_VINFO_LOOP (LOOP_VINFO) is handed to
3877                 create_runtime_alias_checks together with that list.
3878    COND_EXPR  - points to the condition built so far.
3879
3880    Output:
3881    COND_EXPR  - passed on to create_runtime_alias_checks, which is
3882                 expected to extend *COND_EXPR with the new checks.  It is
3883                 left untouched when the list is empty.
3884
3885    When dumping is enabled, a note with the number of entries is also
3886    emitted.  */
3887
3888 void
3889 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3890 {
3891   const vec<dr_with_seg_len_pair_t> &alias_pairs
3892     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3893
3894   if (!alias_pairs.is_empty ())
3895     {
3896       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3897       create_runtime_alias_checks (loop, &alias_pairs, cond_expr);
3898       if (dump_enabled_p ())
3899         dump_printf_loc (MSG_NOTE, vect_location,
3900                          "created %u versioning for alias checks.\n",
3901                          alias_pairs.length ());
3902     }
3903 }
3904
3905
3906 /* Function vect_loop_versioning.
3907
3908    If the loop has data references that may or may not be aligned or/and
3909    has data reference relations whose independence was not proven then
3910    two versions of the loop need to be generated, one which is vectorized
3911    and one which isn't.  A test is then generated to control which of the
3912    loops is executed.  The test combines, as requested by LOOP_VINFO, the
3913    cost-model and versioning thresholds on the iteration count, the niter
3914    assumptions, the alignment checks, the alias checks and the simd
3915    if-condition.
3916
3917    LOOP_VECTORIZED_CALL, if non-null, is the IFN_LOOP_VECTORIZED call left
3918    by if-conversion; where possible its scalar loop copy and guarding
3919    condition are re-used instead of creating a new loop version.
3920
3921    Return the loop that serves as the non-vectorized version: either the
3922    re-used LOOP_VINFO_SCALAR_LOOP or the copy of LOOP made by
3923    loop_version.  */
3924
3925 class loop *
3926 vect_loop_versioning (loop_vec_info loop_vinfo,
3927                       gimple *loop_vectorized_call)
3928 {
3929   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3930   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3931   basic_block condition_bb;  /* Block ending in the versioning test.  */
3932   gphi_iterator gsi;
3933   gimple_stmt_iterator cond_exp_gsi;
3934   basic_block merge_bb;
3935   basic_block new_exit_bb;
3936   edge new_exit_e, e;
3937   gphi *orig_phi, *new_phi;
3938   tree cond_expr = NULL_TREE;
3939   gimple_seq cond_expr_stmt_list = NULL;
3940   tree arg;
3941   profile_probability prob = profile_probability::likely ();
3942   gimple_seq gimplify_stmt_list = NULL;
3943   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3944   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3945   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3946   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3947   poly_uint64 versioning_threshold
3948     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3949   tree version_simd_if_cond
3950     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3951   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3952   /* Start the condition with the iteration-count thresholds.  */
3953   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3954       && !ordered_p (th, versioning_threshold))
3955     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3956                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3957                                             th - 1));
3958   if (maybe_ne (versioning_threshold, 0U))
3959     {
3960       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3961                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3962                                               versioning_threshold - 1));
3963       if (cond_expr)
3964         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3965                                  expr, cond_expr);
3966       else
3967         cond_expr = expr;
3968     }
3969   /* If more checks follow, evaluate the threshold test into COST_NAME.  */
3970   tree cost_name = NULL_TREE;
3971   profile_probability prob2 = profile_probability::always ();
3972   if (cond_expr
3973       && EXPR_P (cond_expr)
3974       && (version_niter
3975           || version_align
3976           || version_alias
3977           || version_simd_if_cond))
3978     {
3979       cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3980                                                       &cond_expr_stmt_list,
3981                                                       is_gimple_val, NULL_TREE);
3982       /* Split prob () into two so that the overall probability of passing
3983          both the cost-model and versioning checks is the orig prob.  */
3984       prob2 = prob = prob.sqrt ();
3985     }
3986
3987   if (version_niter)
3988     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3989   /* Gimplify what has been built so far before chaining further checks.  */
3990   if (cond_expr)
3991     {
3992       gimple_seq tem = NULL;
3993       cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3994                                           &tem, is_gimple_condexpr_for_cond,
3995                                           NULL_TREE);
3996       gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3997     }
3998
3999   if (version_align)
4000     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
4001                                        &cond_expr_stmt_list);
4002
4003   if (version_alias)
4004     {
4005       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
4006       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
4007       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
4008     }
4009
4010   if (version_simd_if_cond)
4011     {
4012       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
4013       if (flag_checking)
4014         if (basic_block bb
4015             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
4016           gcc_assert (bb != loop->header
4017                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
4018                       && (scalar_loop == NULL
4019                           || (bb != scalar_loop->header
4020                               && dominated_by_p (CDI_DOMINATORS,
4021                                                  scalar_loop->header, bb))));
4022       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
4023       tree c = fold_build2 (NE_EXPR, boolean_type_node,
4024                             version_simd_if_cond, zero);
4025       if (cond_expr)
4026         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4027                                  c, cond_expr);
4028       else
4029         cond_expr = c;
4030       if (dump_enabled_p ())
4031         dump_printf_loc (MSG_NOTE, vect_location,
4032                          "created versioning for simd if condition check.\n");
4033     }
4034   /* Gimplify the complete condition and collect its statements.  */
4035   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4036                                       &gimplify_stmt_list,
4037                                       is_gimple_condexpr_for_cond, NULL_TREE);
4038   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
4039
4040   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
4041      invariant in.  */
4042   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
4043   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4044        !gsi_end_p (gsi); gsi_next (&gsi))
4045     {
4046       gimple *stmt = gsi_stmt (gsi);
4047       update_stmt (stmt);
4048       ssa_op_iter iter;
4049       use_operand_p use_p;
4050       basic_block def_bb;
4051       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4052         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4053             && flow_bb_inside_loop_p (outermost, def_bb))
4054           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4055     }
4056
4057   /* Search for the outermost loop we can version.  Avoid versioning of
4058      non-perfect nests but allow if-conversion versioned loops inside.  */
4059   class loop *loop_to_version = loop;
4060   if (flow_loop_nested_p (outermost, loop))
4061     {
4062       if (dump_enabled_p ())
4063         dump_printf_loc (MSG_NOTE, vect_location,
4064                          "trying to apply versioning to outer loop %d\n",
4065                          outermost->num);
4066       if (outermost->num == 0)
4067         outermost = superloop_at_depth (loop, 1);
4068       /* And avoid applying versioning on non-perfect nests.  */
4069       while (loop_to_version != outermost
4070              && (e = single_exit (loop_outer (loop_to_version)))
4071              && !(e->flags & EDGE_COMPLEX)
4072              && (!loop_outer (loop_to_version)->inner->next
4073                  || vect_loop_vectorized_call (loop_to_version))
4074              && (!loop_outer (loop_to_version)->inner->next
4075                  || !loop_outer (loop_to_version)->inner->next->next))
4076         loop_to_version = loop_outer (loop_to_version);
4077     }
4078
4079   /* Apply versioning.  If there is already a scalar version created by
4080      if-conversion re-use that.  Note we cannot re-use the copy of
4081      an if-converted outer-loop when vectorizing the inner loop only.  */
4082   gcond *cond;
4083   if ((!loop_to_version->inner || loop == loop_to_version)
4084       && loop_vectorized_call)
4085     {
4086       gcc_assert (scalar_loop);
4087       condition_bb = gimple_bb (loop_vectorized_call);
4088       cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4089       gimple_cond_set_condition_from_tree (cond, cond_expr);
4090       update_stmt (cond);
4091
4092       if (cond_expr_stmt_list)
4093         {
4094           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4095           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4096                                  GSI_SAME_STMT);
4097         }
4098
4099       /* if-conversion uses profile_probability::always () for both paths,
4100          reset the paths probabilities appropriately.  */
4101       edge te, fe;
4102       extract_true_false_edges_from_block (condition_bb, &te, &fe);
4103       te->probability = prob;
4104       fe->probability = prob.invert ();
4105       /* We can scale loop counts immediately but have to postpone
4106          scaling the scalar loop because we re-use it during peeling.
4107
4108          Ifcvt duplicates loop preheader, loop body and produces a basic
4109          block after loop exit.  We need to scale all that.  */
4110       basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4111       preheader->count = preheader->count.apply_probability (prob * prob2);
4112       scale_loop_frequencies (loop_to_version, prob * prob2);
4113       /* When the loop has multiple exits then we can only version itself.
4114          This is denoted by loop_to_version == loop.  In this case we can
4115          do the versioning by selecting the exit edge the vectorizer is
4116          currently using.  */
4117       edge exit_edge;
4118       if (loop_to_version == loop)
4119        exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4120       else
4121        exit_edge = single_exit (loop_to_version);
4122       exit_edge->dest->count = preheader->count;
4123       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4124
4125       nloop = scalar_loop;
4126       if (dump_enabled_p ())
4127         dump_printf_loc (MSG_NOTE, vect_location,
4128                          "reusing %sloop version created by if conversion\n",
4129                          loop_to_version != loop ? "outer " : "");
4130     }
4131   else
4132     {
4133       if (loop_to_version != loop
4134           && dump_enabled_p ())
4135         dump_printf_loc (MSG_NOTE, vect_location,
4136                          "applying loop versioning to outer loop %d\n",
4137                          loop_to_version->num);
4138       /* Saved so the preheader edge's dest_idx can be restored below.  */
4139       unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4140
4141       initialize_original_copy_tables ();
4142       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4143                             prob * prob2, (prob * prob2).invert (),
4144                             prob * prob2, (prob * prob2).invert (),
4145                             true);
4146       /* We will later insert second conditional so overall outcome of
4147          both is prob * prob2.  */
4148       edge true_e, false_e;
4149       extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4150       true_e->probability = prob;
4151       false_e->probability = prob.invert ();
4152       gcc_assert (nloop);
4153       nloop = get_loop_copy (loop);  /* The copy of LOOP itself.  */
4154
4155       /* For cycle vectorization with SLP we rely on the PHI arguments
4156          appearing in the same order as the SLP node operands which for the
4157          loop PHI nodes means the preheader edge dest index needs to remain
4158          the same for the analyzed loop which also becomes the vectorized one.
4159          Make it so in case the state after versioning differs by redirecting
4160          the first edge into the header to the same destination which moves
4161          it last.  */
4162       if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4163         {
4164           edge e = EDGE_PRED (loop->header, 0);
4165           ssa_redirect_edge (e, e->dest);
4166           flush_pending_stmts (e);
4167         }
4168       gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4169
4170       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4171          reap those otherwise;  they also refer to the original
4172          loops.  */
4173       class loop *l = loop;
4174       while (gimple *call = vect_loop_vectorized_call (l))
4175         {
4176           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4177           fold_loop_internal_call (call, boolean_false_node);
4178           l = loop_outer (l);
4179         }
4180       free_original_copy_tables ();
4181
4182       if (cond_expr_stmt_list)
4183         {
4184           cond_exp_gsi = gsi_last_bb (condition_bb);
4185           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4186                                  GSI_SAME_STMT);
4187         }
4188
4189       /* Loop versioning violates an assumption we try to maintain during
4190          vectorization - that the loop exit block has a single predecessor.
4191          After versioning, the exit block of both loop versions is the same
4192          basic block (i.e. it has two predecessors). Just in order to simplify
4193          following transformations in the vectorizer, we fix this situation
4194          here by adding a new (empty) block on the exit-edge of the loop,
4195          with the proper loop-exit phis to maintain loop-closed-form.
4196          If loop versioning wasn't done from loop, but scalar_loop instead,
4197          merge_bb will have already just a single successor.  */
4198
4199       /* When the loop has multiple exits then we can only version itself.
4200          This is denoted by loop_to_version == loop.  In this case we can
4201          do the versioning by selecting the exit edge the vectorizer is
4202          currently using.  */
4203       edge exit_edge;
4204       if (loop_to_version == loop)
4205         exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4206       else
4207         exit_edge = single_exit (loop_to_version);
4208
4209       gcc_assert (exit_edge);
4210       merge_bb = exit_edge->dest;
4211       if (EDGE_COUNT (merge_bb->preds) >= 2)
4212         {
4213           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4214           new_exit_bb = split_edge (exit_edge);
4215           new_exit_e = exit_edge;
4216           e = EDGE_SUCC (new_exit_bb, 0);
4217
4218           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4219                gsi_next (&gsi))
4220             {
4221               tree new_res;
4222               orig_phi = gsi.phi ();
4223               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4224               new_phi = create_phi_node (new_res, new_exit_bb);
4225               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4226               add_phi_arg (new_phi, arg, new_exit_e,
4227                            gimple_phi_arg_location_from_edge (orig_phi, e));
4228               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4229             }
4230         }
4231
4232       update_ssa (TODO_update_ssa_no_phi);
4233     }
4234
4235   /* Split the cost model check off to a separate BB.  Costing assumes
4236      this is the only thing we perform when we enter the scalar loop
4237      from a failed cost decision.  */
4238   if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4239     {
4240       gimple *def = SSA_NAME_DEF_STMT (cost_name);
4241       gcc_assert (gimple_bb (def) == condition_bb);
4242       /* All uses of the cost check are 'true' after the check we
4243          are going to insert.  */
4244       replace_uses_by (cost_name, boolean_true_node);
4245       /* And we're going to build the new single use of it.  */
4246       gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4247                                        NULL_TREE, NULL_TREE);
4248       edge e = split_block (gimple_bb (def), def);
4249       gimple_stmt_iterator gsi = gsi_for_stmt (def);
4250       gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4251       edge true_e, false_e;
4252       extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4253       e->flags &= ~EDGE_FALLTHRU;
4254       e->flags |= EDGE_TRUE_VALUE;
4255       edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4256       e->probability = prob2;
4257       e2->probability = prob2.invert ();
4258       e->dest->count = e->count ();
4259       set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4260       auto_vec<basic_block, 3> adj;
4261       for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4262            son;
4263            son = next_dom_son (CDI_DOMINATORS, son))
4264         if (EDGE_COUNT (son->preds) > 1)
4265           adj.safe_push (son);
4266       for (auto son : adj)
4267         set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4268       //debug_bb (condition_bb);
4269       //debug_bb (e->src);
4270     }
4271
4272   if (version_niter)
4273     {
4274       /* The versioned loop could be infinite, we need to clear existing
4275          niter information which is copied from the original loop.  */
4276       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4277       vect_free_loop_info_assumptions (nloop);
4278     }
4279
4280   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4281       && dump_enabled_p ())
4282     {
4283       if (version_alias)
4284         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4285                          vect_location,
4286                          "loop versioned for vectorization because of "
4287                          "possible aliasing\n");
4288       if (version_align)
4289         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4290                          vect_location,
4291                          "loop versioned for vectorization to enhance "
4292                          "alignment\n");
4293
4294     }
4295
4296   return nloop;
4297 }