gcc/tree-vect-loop-manip.cc

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50 #include "insn-config.h"
  51 #include "rtl.h"
  52 #include "recog.h"
  53 #include "langhooks.h"
  54 #include "tree-vector-builder.h"
  55 #include "optabs-tree.h"
  56
  57 /*************************************************************************
  58   Simple Loop Peeling Utilities
  59
  60   Utilities to support loop peeling for vectorization purposes.
  61  *************************************************************************/
  62
  63
  64 /* Renames the use *OP_P.  If the operand is an SSA name for which
        get_current_def returns a name, the use is redirected to that name;
        otherwise the use is left unchanged.  */
  65
  66 static void
  67 rename_use_op (use_operand_p op_p)
  68 {
  69   tree new_name;
  70   /* Operands that are not SSA names are never renamed.  */
  71   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
  72     return;
  73
  74   new_name = get_current_def (USE_FROM_PTR (op_p));
  75
  76   /* No current definition recorded: something defined outside of the loop.  */
  77   if (!new_name)
  78     return;
  79
  80   /* An ordinary ssa name defined in the loop.  */
  81
  82   SET_USE (op_p, new_name);
  83 }
  84
  85
  86 /* Renames the variables in basic block BB: every SSA use operand of the
  87    statements in BB, and the PHI arguments of BB on each incoming edge
  88    whose source is inside BB's loop.  If RENAME_FROM_OUTER_LOOP is true,
        PHI arguments on edges coming from outside that loop are renamed too,
        except when the outer loop has more than one inner loop and the edge
        source is neither the outer loop's header nor a block whose single
        predecessor is that header.  */
  89
  90 static void
  91 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  92 {
  93   gimple *stmt;
  94   use_operand_p use_p;
  95   ssa_op_iter iter;
  96   edge e;
  97   edge_iterator ei;
  98   class loop *loop = bb->loop_father;
  99   class loop *outer_loop = NULL;
 100   /* OUTER_LOOP is only needed for the edge checks further down.  */
 101   if (rename_from_outer_loop)
 102     {
 103       gcc_assert (loop);
 104       outer_loop = loop_outer (loop);
 105     }
 106   /* Rename all SSA use operands of the statements in BB.  */
 107   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
 108        gsi_next (&gsi))
 109     {
 110       stmt = gsi_stmt (gsi);
 111       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 112         rename_use_op (use_p);
 113     }
 114   /* Rename PHI arguments, one incoming edge at a time.  */
 115   FOR_EACH_EDGE (e, ei, bb->preds)
 116     {
 117       if (!flow_bb_inside_loop_p (loop, e->src))
 118         {
 119           if (!rename_from_outer_loop)
 120             continue;
 121           if (e->src != outer_loop->header)
 122             {
 123               if (outer_loop->inner->next)
 124                 {
 125                   /* If outer_loop has 2 inner loops, allow there to
 126                      be an extra basic block which decides which of the
 127                      two loops to use using LOOP_VECTORIZED.  */
 128                   if (!single_pred_p (e->src)
 129                       || single_pred (e->src) != outer_loop->header)
 130                     continue;
 131                 }
 132             }
 133         }
 134       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
 135            gsi_next (&gsi))
 136         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 137     }
 138 }
 139
 140
 141 struct adjust_info  /* One debug-stmt adjustment, as built by adjust_debug_stmts.  */
 142 {
 143   tree from, to;      /* Value to replace and its replacement; a null TO resets the bind.  */
 144   basic_block bb;     /* Block that must contain or dominate the adjusted debug uses.  */
 145 };
 146
 147 /* A stack of values to be adjusted in debug stmts.  We have to
 148    process them LIFO, so that the closest substitution applies.  If we
 149    processed them FIFO, without the stack, we might substitute uses
 150    with a PHI DEF that would soon become non-dominant, and when we got
 151    to the suitable one, it wouldn't have anything to substitute any
 152    more.  Entries are pushed by adjust_debug_stmts while the vector
        exists, and popped one by one by adjust_vec_debug_stmts.  */
 153 static vec<adjust_info, va_heap> adjust_vec;
 154
 155 /* Adjust any debug stmts that referenced AI->from values to use the
 156    loop-closed AI->to, if the references are dominated by AI->bb and
 157    not by the definition of AI->from.  If AI->to is null, the value of
        each such debug bind is reset instead.  Dominator information must
        be available.  */
 158
 159 static void
 160 adjust_debug_stmts_now (adjust_info *ai)
 161 {
 162   basic_block bbphi = ai->bb;
 163   tree orig_def = ai->from;
 164   tree new_def = ai->to;
 165   imm_use_iterator imm_iter;
 166   gimple *stmt;
 167   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 168
 169   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 170
 171   /* Adjust any debug stmts that held onto non-loop-closed
 172      references.  */
 173   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 174     {
 175       use_operand_p use_p;
 176       basic_block bbuse;
 177       /* Non-debug uses of ORIG_DEF are left alone.  */
 178       if (!is_gimple_debug (stmt))
 179         continue;
 180
 181       gcc_assert (gimple_debug_bind_p (stmt));
 182
 183       bbuse = gimple_bb (stmt);
 184       /* Act only on uses in or dominated by BBPHI but not in or dominated by BBDEF.  */
 185       if ((bbuse == bbphi
 186            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 187           && !(bbuse == bbdef
 188                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 189         {
 190           if (new_def)
 191             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 192               SET_USE (use_p, new_def);
 193           else
 194             {
                   /* No replacement value: reset the debug bind.  */
 195               gimple_debug_bind_reset_value (stmt);
 196               update_stmt (stmt);
 197             }
 198         }
 199     }
 200 }
 201
 202 /* Adjust debug stmts as scheduled before: apply every adjustment queued in
        adjust_vec, most recently pushed first, leaving the vector empty.  Does
        nothing when debug bind stmts cannot exist; otherwise adjust_vec must
        exist.  */
 203
 204 static void
 205 adjust_vec_debug_stmts (void)
 206 {
 207   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 208     return;
 209
 210   gcc_assert (adjust_vec.exists ());
 211   /* Process the last entry first (LIFO), as described at adjust_vec.  */
 212   while (!adjust_vec.is_empty ())
 213     {
 214       adjust_debug_stmts_now (&adjust_vec.last ());
 215       adjust_vec.pop ();
 216     }
 217 }
 218
 219 /* Adjust any debug stmts that referenced FROM values to use the
 220    loop-closed TO, if the references are dominated by BB and not by
 221    the definition of FROM.  If adjust_vec exists, the adjustment is
 222    queued there until adjust_vec_debug_stmts is called; otherwise it
        is applied immediately.  Nothing is done unless debug bind stmts
        may exist and FROM is a non-virtual SSA name that is not a default
        definition.  */
 223
 224 static void
 225 adjust_debug_stmts (tree from, tree to, basic_block bb)
 226 {
 227   adjust_info ai;
 228
 229   if (MAY_HAVE_DEBUG_BIND_STMTS
 230       && TREE_CODE (from) == SSA_NAME
 231       && ! SSA_NAME_IS_DEFAULT_DEF (from)
 232       && ! virtual_operand_p (from))
 233     {
 234       ai.from = from;
 235       ai.to = to;
 236       ai.bb = bb;
 237       /* Queue the adjustment if a queue exists, else apply it right away.  */
 238       if (adjust_vec.exists ())
 239         adjust_vec.safe_push (ai);
 240       else
 241         adjust_debug_stmts_now (&ai);
 242     }
 243 }
 244
 245 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
 246    to adjust any debug stmts that referenced the old phi arg,
 247    presumably non-loop-closed references left over from other
 248    transformations.  The debug stmts are redirected to the result of
        UPDATE_PHI, using the block of UPDATE_PHI as the dominating block.  */
 249
 250 static void
 251 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 252 {
 253   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 254   /* An SSA-name argument must not be replaced by itself.  */
 255   gcc_assert (TREE_CODE (orig_def) != SSA_NAME
 256               || orig_def != new_def);
 257
 258   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 259   /* Debug uses of the old argument should now refer to the PHI result.  */
 260   if (MAY_HAVE_DEBUG_BIND_STMTS)
 261     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
 262                         gimple_bb (update_phi));
 263 }
 264
 265 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
 266    that the control should have during the first iteration and NEXT_CTRL is the
 267    value that it should have on subsequent iterations.  This is done by
        creating a PHI node for CTRL in LOOP's header, with INIT_CTRL as the
        argument on the preheader edge and NEXT_CTRL on the latch edge.  */
 268
 269 static void
 270 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
 271                        tree next_ctrl)
 272 {
 273   gphi *phi = create_phi_node (ctrl, loop->header);
 274   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
 275   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
 276 }
 277
 278 /* Add SEQ to the end of LOOP's preheader block.  Does nothing if SEQ is
        null.  The statements are inserted on the preheader edge, and the
        insertion is asserted not to create a new basic block.  */
 279
 280 static void
 281 add_preheader_seq (class loop *loop, gimple_seq seq)
 282 {
 283   if (seq)
 284     {
 285       edge pe = loop_preheader_edge (loop);
 286       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 287       gcc_assert (!new_bb);
 288     }
 289 }
 290
 291 /* Add SEQ to the beginning of LOOP's header block, after any labels.
        Does nothing if SEQ is null.  */
 292
 293 static void
 294 add_header_seq (class loop *loop, gimple_seq seq)
 295 {
 296   if (seq)
 297     {
 298       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
 299       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 300     }
 301 }
 302
 303 /* Return true if the target can interleave elements of two vectors.
 304    OFFSET is 0 if the first half of the vectors should be interleaved
 305    or 1 if the second half should.  When returning true, store the
 306    associated permutation in INDICES.  VECTYPE is the vector type whose
        mode is queried.  Note that INDICES is overwritten even when the
        function returns false.  */
 307
 308 static bool
 309 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 310                         unsigned int offset)
 311 {
 312   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
 313   poly_uint64 base = exact_div (nelts, 2) * offset;  /* 0 or NELTS / 2.  */
 314   vec_perm_builder sel (nelts, 2, 3);
       /* Push BASE + I followed by the same index offset by NELTS, for
          I = 0, 1, 2.  */
 315   for (unsigned int i = 0; i < 3; ++i)
 316     {
 317       sel.quick_push (base + i);
 318       sel.quick_push (base + i + nelts);
 319     }
 320   indices->new_vector (sel, 2, nelts);
 321   return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
 322                                *indices);
 323 }
 324
 325 /* Try to use permutes to define the masks in DEST_RGM using the masks
 326    in SRC_RGM, given that the former has twice as many masks as the
 327    latter.  Return true on success, adding any new statements to SEQ.
        Two strategies are tried in order: unpacking the low and high halves
        of each source mask, then interleaving each source mask with itself.
        If neither applies, return false without adding anything to SEQ.  */
 328
 329 static bool
 330 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 331                                rgroup_controls *src_rgm)
 332 {
 333   tree src_masktype = src_rgm->type;
 334   tree dest_masktype = dest_rgm->type;
 335   machine_mode src_mode = TYPE_MODE (src_masktype);
 336   insn_code icode1, icode2;
 337   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 338       && (icode1 = optab_handler (vec_unpacku_hi_optab,
 339                                   src_mode)) != CODE_FOR_nothing
 340       && (icode2 = optab_handler (vec_unpacku_lo_optab,
 341                                   src_mode)) != CODE_FOR_nothing)
 342     {
 343       /* Unpacking the source masks gives at least as many mask bits as
 344          we need.  We can then VIEW_CONVERT any excess bits away.  */
 345       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
 346       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
 347       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
 348       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 349         {
 350           tree src = src_rgm->controls[i / 2];  /* Two dest masks per source mask.  */
 351           tree dest = dest_rgm->controls[i];
               /* Which half an odd or even I takes depends on BYTES_BIG_ENDIAN.  */
 352           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 353                             ? VEC_UNPACK_HI_EXPR
 354                             : VEC_UNPACK_LO_EXPR);
 355           gassign *stmt;
 356           if (dest_masktype == unpack_masktype)
 357             stmt = gimple_build_assign (dest, code, src);
 358           else
 359             {
                   /* Unpack into a temporary, then view-convert it to the
                      destination mask type.  */
 360               tree temp = make_ssa_name (unpack_masktype);
 361               stmt = gimple_build_assign (temp, code, src);
 362               gimple_seq_add_stmt (seq, stmt);
 363               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 364                                           build1 (VIEW_CONVERT_EXPR,
 365                                                   dest_masktype, temp));
 366             }
 367           gimple_seq_add_stmt (seq, stmt);
 368         }
 369       return true;
 370     }
 371   vec_perm_indices indices[2];
 372   if (dest_masktype == src_masktype
 373       && interleave_supported_p (&indices[0], src_masktype, 0)
 374       && interleave_supported_p (&indices[1], src_masktype, 1))
 375     {
 376       /* The destination requires twice as many mask bits as the source, so
 377          we can use interleaving permutes to double up the number of bits.  */
 378       tree masks[2];
 379       for (unsigned int i = 0; i < 2; ++i)
 380         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 381       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 382         {
 383           tree src = src_rgm->controls[i / 2];  /* Two dest masks per source mask.  */
 384           tree dest = dest_rgm->controls[i];
               /* Even I uses the first-half permute, odd I the second-half one.  */
 385           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 386                                               src, src, masks[i & 1]);
 387           gimple_seq_add_stmt (seq, stmt);
 388         }
 389       return true;
 390     }
 391   return false;
 392 }
 393
 394 /* Populate DEST_RGM->controls, given that they should add up to STEP.
        IV_TYPE is the type used for the length limit and for the intermediate
        remainders; all new statements are added to SEQ.  In the example
        below, VF/N stands for the number of items covered by one control.
 395
 396      STEP = MIN_EXPR <ivtmp_34, VF>;
 397
 398      First length (MIN (X, VF/N)):
 399        loop_len_15 = MIN_EXPR <STEP, VF/N>;
 400
 401      Second length:
 402        tmp = STEP - loop_len_15;
 403        loop_len_16 = MIN (tmp, VF/N);
 404
 405      Third length:
 406        tmp2 = tmp - loop_len_16;
 407        loop_len_17 = MIN (tmp2, VF/N);
 408
 409      Last length:
 410        loop_len_18 = tmp2 - loop_len_17;
 411 */
 412
 413 static void
 414 vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
 415                                rgroup_controls *dest_rgm, tree step)
 416 {
 417   tree ctrl_type = dest_rgm->type;
 418   poly_uint64 nitems_per_ctrl
 419     = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
 420   tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
 421
 422   for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
 423     {
 424       tree ctrl = dest_rgm->controls[i];
 425       if (i == 0)
 426         {
 427           /* First control: MIN (STEP, VF/N), capped to the range [0, VF/N].  */
 428           gassign *assign
 429             = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
 430           gimple_seq_add_stmt (seq, assign);
 431         }
 432       else if (i == dest_rgm->controls.length () - 1)
 433         {
 434           /* Last control: whatever remains after the previous control.  */
 435           gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
 436                                                  dest_rgm->controls[i - 1]);
 437           gimple_seq_add_stmt (seq, assign);
 438         }
 439       else
 440         {
 441           /* Middle control: MIN (remain, VF/N), capped to the range [0, VF/N].
                  STEP is updated to hold the remainder for later controls.  */
 442           step = gimple_build (seq, MINUS_EXPR, iv_type, step,
 443                                dest_rgm->controls[i - 1]);
 444           gassign *assign
 445             = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
 446           gimple_seq_add_stmt (seq, assign);
 447         }
 448     }
 449 }
 450
 451 /* Store in *BSI the standard position for the induction variable increment
 452    belonging to LOOP_EXIT (just before the exit condition of the given exit).
 453    *INSERT_AFTER is set to true if the increment should be inserted after
 454    *BSI.  As implemented here, *BSI is set to the last statement of
        LOOP_EXIT's source block and *INSERT_AFTER is always set to false.  */
 455
 456 void
 457 vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
 458                             bool *insert_after)
 459 {
 460   basic_block bb = loop_exit->src;
 461   *bsi = gsi_last_bb (bb);
 462   *insert_after = false;
 463 }
 464
 465 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
 466    for all the rgroup controls in RGC and return a control that is nonzero
 467    when the loop needs to iterate.  Add any new preheader statements to
 468    PREHEADER_SEQ and any new header statements to HEADER_SEQ.  Use
        LOOP_COND_GSI to insert code before the exit gcond.
 469
 470    RGC belongs to loop LOOP.  The loop originally iterated NITERS
 471    times and has been vectorized according to LOOP_VINFO.
 472
 473    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 474    starts with NITERS_SKIP dummy iterations of the scalar loop before
 475    the real work starts.  The mask elements for these dummy iterations
 476    must be 0, to ensure that the extra iterations do not have an effect.
 477
 478    It is known that:
 479
 480      NITERS * RGC->max_nscalars_per_iter * RGC->factor
 481
 482    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 483    variable that starts at 0 and has step:
 484
 485      VF * RGC->max_nscalars_per_iter * RGC->factor
 486
 487    might overflow before hitting a value above:
 488
 489      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
 490
 491    This means that we cannot guarantee that such an induction variable
 492    would ever hit a value that produces a set of all-false masks or zero
 493    lengths for RGC.
 494
        If LOOP_VINFO_USING_DECREMENTING_IV_P holds for LOOP_VINFO, the
        function instead creates an IV that counts down from the total number
        of items, stores the per-iteration step in *IV_STEP and the full step
        (NITEMS_STEP below) in *COMPARE_STEP, and returns the IV value to be
        compared: the value after the decrement when SELECT_VL is used,
        otherwise the value before it.  *IV_STEP and *COMPARE_STEP are not
        written on the other path.

 495    Note: the cost of the code generated by this function is modeled
 496    by vect_estimate_min_profitable_iters, so changes here may need
 497    corresponding changes there.  */
 498
 499 static tree
 500 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 501                                  gimple_seq *preheader_seq,
 502                                  gimple_seq *header_seq,
 503                                  gimple_stmt_iterator loop_cond_gsi,
 504                                  rgroup_controls *rgc, tree niters,
 505                                  tree niters_skip, bool might_wrap_p,
 506                                  tree *iv_step, tree *compare_step)
 507 {
 508   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 509   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 510   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 511
 512   tree ctrl_type = rgc->type;
 513   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
 514   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
 515   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 516   tree length_limit = NULL_TREE;
 517   /* For length, we need length_limit to ensure length in range.  */
 518   if (!use_masks_p)
 519     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
 520
 521   /* Calculate the maximum number of item values that the rgroup
 522      handles in total, the number that it handles for each iteration
 523      of the vector loop, and the number that it should skip during the
 524      first iteration of the vector loop.  */
 525   tree nitems_total = niters;
 526   tree nitems_step = build_int_cst (iv_type, vf);
 527   tree nitems_skip = niters_skip;
 528   if (nitems_per_iter != 1)
 529     {
 530       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
 531          these multiplications don't overflow.  */
 532       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
 533       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
 534       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 535                                    nitems_total, compare_factor);
 536       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
 537                                   nitems_step, iv_factor);
 538       if (nitems_skip)
 539         nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 540                                     nitems_skip, compare_factor);
 541     }
 542
 543   /* Create an induction variable that counts the number of items
 544      processed.  */
 545   tree index_before_incr, index_after_incr;
 546   gimple_stmt_iterator incr_gsi;
 547   bool insert_after;
 548   edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
 549   vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
 550   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
 551     {
 552       /* Create an IV that counts down from nitems_total and whose step
 553          is the (variable) amount processed in the current iteration:
 554            ...
 555            _10 = (unsigned long) count_12(D);
 556            ...
 557            # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
 558            _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
 559            ...
 560            vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
 561            ...
 562            ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
 563            ...
 564            if (ivtmp_9 > POLY_INT_CST [4, 4])
 565              goto <bb 4>; [83.33%]
 566            else
 567              goto <bb 5>; [16.67%]
 568       */
 569       nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
           /* With a single control, the step is that control itself; otherwise
              a fresh SSA name of IV_TYPE holds it.  */
 570       tree step = rgc->controls.length () == 1 ? rgc->controls[0]
 571                                                : make_ssa_name (iv_type);
 572       /* Create decrement IV.  */
 573       if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
 574         {
               /* The IV decreases by STEP, which SELECT_VL computes in the
                  loop header from the IV value before the decrement.  */
 575           create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
 576                      insert_after, &index_before_incr, &index_after_incr);
 577           tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
 578                                    index_before_incr, nitems_step);
 579           gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
 580         }
 581       else
 582         {
               /* The IV decreases by NITEMS_STEP; STEP is the MIN of the IV
                  value before the decrement and NITEMS_STEP.  */
 583           create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
 584                      &incr_gsi, insert_after, &index_before_incr,
 585                      &index_after_incr);
 586           gimple_seq_add_stmt (header_seq,
 587                                gimple_build_assign (step, MIN_EXPR,
 588                                                     index_before_incr,
 589                                                     nitems_step));
 590         }
 591       *iv_step = step;
 592       *compare_step = nitems_step;
 593       return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
 594                                                        : index_before_incr;
 595     }
 596
 597   /* Create increment IV.  */
 598   create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
 599              loop, &incr_gsi, insert_after, &index_before_incr,
 600              &index_after_incr);
 601
 602   tree zero_index = build_int_cst (compare_type, 0);
 603   tree test_index, test_limit, first_limit;
 604   gimple_stmt_iterator *test_gsi;
 605   if (might_wrap_p)
 606     {
 607       /* In principle the loop should stop iterating once the incremented
 608          IV reaches a value greater than or equal to:
 609
 610            NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
 611
 612          However, there's no guarantee that this addition doesn't overflow
 613          the comparison type, or that the IV hits a value above it before
 614          wrapping around.  We therefore adjust the limit down by one
 615          IV step:
 616
 617            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 618            -[infinite-prec] NITEMS_STEP
 619
 620          and compare the IV against this limit _before_ incrementing it.
 621          Since the comparison type is unsigned, we actually want the
 622          subtraction to saturate at zero:
 623
 624            (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
 625            -[sat] NITEMS_STEP
 626
 627          And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
 628
 629            NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
 630
 631          where the rightmost subtraction can be done directly in
 632          COMPARE_TYPE.  */
 633       test_index = index_before_incr;
 634       tree adjust = gimple_convert (preheader_seq, compare_type,
 635                                     nitems_step);
 636       if (nitems_skip)
 637         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 638                                adjust, nitems_skip);
           /* The saturating subtraction is MAX (NITEMS_TOTAL, ADJUST) - ADJUST.  */
 639       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 640                                  nitems_total, adjust);
 641       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 642                                  test_limit, adjust);
 643       test_gsi = &incr_gsi;
 644
 645       /* Get a safe limit for the first iteration.  */
 646       if (nitems_skip)
 647         {
 648           /* The first vector iteration can handle at most NITEMS_STEP
 649              items.  NITEMS_STEP <= CONST_LIMIT, and adding
 650              NITEMS_SKIP to that cannot overflow.  */
 651           tree const_limit = build_int_cst (compare_type,
 652                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 653                                             * nitems_per_iter);
 654           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 655                                       nitems_total, const_limit);
 656           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 657                                       first_limit, nitems_skip);
 658         }
 659       else
 660         /* For the first iteration it doesn't matter whether the IV hits
 661            a value above NITEMS_TOTAL.  That only matters for the latch
 662            condition.  */
 663         first_limit = nitems_total;
 664     }
 665   else
 666     {
 667       /* Test the incremented IV, which will always hit a value above
 668          the bound before wrapping.  */
 669       test_index = index_after_incr;
 670       test_limit = nitems_total;
 671       if (nitems_skip)
 672         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 673                                    test_limit, nitems_skip);
 674       test_gsi = &loop_cond_gsi;
 675
 676       first_limit = test_limit;
 677     }
 678
 679   /* Convert the IV value to the comparison type (either a no-op or
 680      a demotion).  */
 681   gimple_seq test_seq = NULL;
 682   test_index = gimple_convert (&test_seq, compare_type, test_index);
 683   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
 684
 685   /* Provide a definition of each control in the group.  */
 686   tree next_ctrl = NULL_TREE;
 687   tree ctrl;
 688   unsigned int i;
 689   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
 690     {
 691       /* Previous controls will cover BIAS items.  This control covers the
 692          next batch.  */
 693       poly_uint64 bias = nitems_per_ctrl * i;
 694       tree bias_tree = build_int_cst (compare_type, bias);
 695
 696       /* See whether the first iteration of the vector loop is known
 697          to have a full control.  */
 698       poly_uint64 const_limit;
 699       bool first_iteration_full
 700         = (poly_int_tree_p (first_limit, &const_limit)
 701            && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
 702
 703       /* Rather than have a new IV that starts at BIAS and goes up to
 704          TEST_LIMIT, prefer to use the same 0-based IV for each control
 705          and adjust the bound down by BIAS.  */
 706       tree this_test_limit = test_limit;
 707       if (i != 0)
 708         {
               /* Saturating subtraction: MAX (limit, BIAS) - BIAS.  */
 709           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 710                                           compare_type, this_test_limit,
 711                                           bias_tree);
 712           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 713                                           compare_type, this_test_limit,
 714                                           bias_tree);
 715         }
 716
 717       /* Create the initial control.  First include all items that
 718          are within the loop limit.  */
 719       tree init_ctrl = NULL_TREE;
 720       if (!first_iteration_full)
 721         {
 722           tree start, end;
 723           if (first_limit == test_limit)
 724             {
 725               /* Use a natural test between zero (the initial IV value)
 726                  and the loop limit.  The "else" block would be valid too,
 727                  but this choice can avoid the need to load BIAS_TREE into
 728                  a register.  */
 729               start = zero_index;
 730               end = this_test_limit;
 731             }
 732           else
 733             {
 734               /* FIRST_LIMIT is the maximum number of items handled by the
 735                  first iteration of the vector loop.  Test the portion
 736                  associated with this control.  */
 737               start = bias_tree;
 738               end = first_limit;
 739             }
 740
 741           if (use_masks_p)
 742             init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
 743                                         start, end, "max_mask");
 744           else
 745             {
 746               init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
 747               gimple_seq seq = vect_gen_len (init_ctrl, start,
 748                                              end, length_limit);
 749               gimple_seq_add_seq (preheader_seq, seq);
 750             }
 751         }
 752
 753       /* Now AND out the bits that are within the number of skipped
 754          items.  This is not needed when NITEMS_SKIP is a constant known
              to be no greater than BIAS.  Only masks are supported here.  */
 755       poly_uint64 const_skip;
 756       if (nitems_skip
 757           && !(poly_int_tree_p (nitems_skip, &const_skip)
 758                && known_le (const_skip, bias)))
 759         {
 760           gcc_assert (use_masks_p);
 761           tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
 762                                                     bias_tree, nitems_skip);
 763           if (init_ctrl)
 764             init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
 765                                       init_ctrl, unskipped_mask);
 766           else
 767             init_ctrl = unskipped_mask;
 768         }
 769
 770       if (!init_ctrl)
 771         {
 772           /* First iteration is full.  */
 773           if (use_masks_p)
 774             init_ctrl = build_minus_one_cst (ctrl_type);
 775           else
 776             init_ctrl = length_limit;
 777         }
 778
 779       /* Get the control value for the next iteration of the loop.  */
 780       if (use_masks_p)
 781         {
 782           gimple_seq stmts = NULL;
 783           next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
 784                                       this_test_limit, "next_mask");
 785           gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
 786         }
 787       else
 788         {
 789           next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
 790           gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
 791                                          length_limit);
 792           gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
 793         }
 794
 795       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
 796     }
 797   /* For a nonzero partial load/store bias, also define the bias-adjusted length.  */
 798   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 799   if (partial_load_bias != 0)
 800     {
 801       tree adjusted_len = rgc->bias_adjusted_ctrl;
           /* NOTE: despite its name, MINUS is an addition (PLUS_EXPR) of the
              bias to the first control.  */
 802       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
 803                                             rgc->controls[0],
 804                                             build_int_cst
 805                                             (TREE_TYPE (rgc->controls[0]),
 806                                              partial_load_bias));
 807       gimple_seq_add_stmt (header_seq, minus);
 808     }
 809   /* The loop above runs in reverse, so this is the value built for the last control it visited.  */
 810   return next_ctrl;
 811 }
 812
 813 /* Set up the iteration condition and rgroup controls for LOOP, given
 814    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
 815    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 816    the number of iterations of the original scalar loop that should be
 817    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 818    for vect_set_loop_condition.
 819
        EXIT_EDGE is the loop exit being controlled: its EDGE_TRUE_VALUE
        flag selects the sense of the comparison that is built, and when
        FINAL_IV is nonnull the assignment to it is inserted on EXIT_EDGE.

        As a side effect, LOOP->nb_iterations is set to an expression for
        the latch count of the new loop.

 820    Insert the branch-back condition before LOOP_COND_GSI and return the
 821    final gcond.  */
 822
 823 static gcond *
 824 vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
 825                                          loop_vec_info loop_vinfo, tree niters,
 826                                          tree final_iv, bool niters_maybe_zero,
 827                                          gimple_stmt_iterator loop_cond_gsi)
 828 {
       /* Statements are accumulated in these two sequences and only emitted
          once all rgroups have been processed.  */
 829   gimple_seq preheader_seq = NULL;
 830   gimple_seq header_seq = NULL;
 831
 832   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 833   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 834   unsigned int compare_precision = TYPE_PRECISION (compare_type);
       /* Remember NITERS as passed in; FINAL_IV is computed from this
          unconverted value at the end.  */
 835   tree orig_niters = niters;
 836
 837   /* Type of the initial value of NITERS.  */
 838   tree ni_actual_type = TREE_TYPE (niters);
 839   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 840   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 841   if (niters_skip)
 842     niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
 843
 844   /* Convert NITERS to the same size as the compare.  */
 845   if (compare_precision > ni_actual_precision
 846       && niters_maybe_zero)
 847     {
 848       /* We know that there is always at least one iteration, so if the
 849          count is zero then it must have wrapped.  Cope with this by
 850          subtracting 1 before the conversion and adding 1 to the result.  */
 851       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 852       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 853                              niters, build_minus_one_cst (ni_actual_type));
 854       niters = gimple_convert (&preheader_seq, compare_type, niters);
 855       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 856                              niters, build_one_cst (compare_type));
 857     }
 858   else
 859     niters = gimple_convert (&preheader_seq, compare_type, niters);
 860
 861   /* Iterate over all the rgroups and fill in their controls.  We could use
 862      the first control from any rgroup for the loop condition; here we
 863      arbitrarily pick the last.  */
 864   tree test_ctrl = NULL_TREE;
 865   tree iv_step = NULL_TREE;
 866   tree compare_step = NULL_TREE;
 867   rgroup_controls *rgc;
 868   rgroup_controls *iv_rgc = nullptr;
 869   unsigned int i;
       /* Masks and lengths are kept in different containers; pick the one
          that matches the kind of partial vectors in use.  */
 870   auto_vec<rgroup_controls> *controls = use_masks_p
 871                                           ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
 872                                           : &LOOP_VINFO_LENS (loop_vinfo);
 873   FOR_EACH_VEC_ELT (*controls, i, rgc)
 874     if (!rgc->controls.is_empty ())
 875       {
 876         /* First try using permutes.  This adds a single vector
 877            instruction to the loop for each mask, but needs no extra
 878            loop invariants or IVs.  */
 879         unsigned int nmasks = i + 1;
 880         if (use_masks_p && (nmasks & 1) == 0)
 881           {
 882             rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
 883             if (!half_rgc->controls.is_empty ()
 884                 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
 885               continue;
 886           }
 887
             /* Set the controls up directly unless a decrementing IV is in
                use and IV_RGC, the rgroup for which controls were last set
                up, has the same max_nscalars_per_iter * factor as RGC.  */
 888         if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 889             || !iv_rgc
 890             || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
 891                 != rgc->max_nscalars_per_iter * rgc->factor))
 892           {
 893             /* See whether zero-based IV would ever generate all-false masks
 894                or zero length before wrapping around.  */
 895             bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 896
 897             /* Set up all controls for this group.  */
 898             test_ctrl
 899               = vect_set_loop_controls_directly (loop, loop_vinfo,
 900                                                  &preheader_seq, &header_seq,
 901                                                  loop_cond_gsi, rgc, niters,
 902                                                  niters_skip, might_wrap_p,
 903                                                  &iv_step, &compare_step);
 904
 905             iv_rgc = rgc;
 906           }
 907
 908         if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 909             && rgc->controls.length () > 1)
 910           {
 911             /* vect_set_loop_controls_directly creates an IV whose step
 912                is equal to the expected sum of RGC->controls.  Use that
 913                information to populate RGC->controls.  */
 914             tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 915             gcc_assert (iv_step);
 916             vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
 917           }
 918       }
 919
 920   /* Emit all accumulated statements.  */
 921   add_preheader_seq (loop, preheader_seq);
 922   add_header_seq (loop, header_seq);
 923
 924   /* Get a boolean result that tells us whether to iterate.  */
 925   gcond *cond_stmt;
 926   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
 927       && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
 928     {
           /* Compare TEST_CTRL against COMPARE_STEP: LE_EXPR when the exit
              is taken on the true edge, GT_EXPR otherwise.  */
 929       gcc_assert (compare_step);
 930       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
 931       cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
 932                                      NULL_TREE);
 933     }
 934   else
 935     {
           /* Compare TEST_CTRL against zero: EQ_EXPR when the exit is taken
              on the true edge, NE_EXPR otherwise.  */
 936       tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 937       tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
 938       cond_stmt
 939         = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
 940     }
 941   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 942
 943   /* The loop iterates (NITERS - 1) / VF + 1 times.
 944      Subtract one from this to get the latch count.  */
 945   tree step = build_int_cst (compare_type,
 946                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 947   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 948                                        build_minus_one_cst (compare_type));
 949   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 950                                      niters_minus_one, step);
 951
 952   if (final_iv)
 953     {
 954       gassign *assign;
 955       /* If vectorizing an inverted early break loop we have to restart the
 956          scalar loop at niters - vf.  This matches what we do in
 957          vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
 958       if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
 959         {
 960           tree ftype = TREE_TYPE (orig_niters);
 961           tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 962           assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
 963         }
 964        else
             /* Otherwise FINAL_IV is a plain copy of the original NITERS.  */
 965         assign = gimple_build_assign (final_iv, orig_niters);
 966       gsi_insert_on_edge_immediate (exit_edge, assign);
 967     }
 968
 969   return cond_stmt;
 970 }
 971
 972 /* Set up the iteration condition and rgroup controls for LOOP in AVX512
 973    style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
 974    vectorized loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
 975    the number of iterations of the original scalar loop that should be
 976    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
 977    for vect_set_loop_condition.
 978
        EXIT_EDGE is the loop exit being controlled: its EDGE_TRUE_VALUE
        flag selects the sense of the comparison that is built, it is used
        to position the IV increment, and when FINAL_IV is nonnull the
        assignment to it is inserted on EXIT_EDGE.  Only the mask rgroups
        in LOOP_VINFO_MASKS are filled in.

        As a side effect, LOOP->nb_iterations is set to an expression for
        the latch count of the new loop.

 979    Insert the branch-back condition before LOOP_COND_GSI and return the
 980    final gcond.  */
 981
 982 static gcond *
 983 vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
 984                                          edge exit_edge,
 985                                          loop_vec_info loop_vinfo, tree niters,
 986                                          tree final_iv,
 987                                          bool niters_maybe_zero,
 988                                          gimple_stmt_iterator loop_cond_gsi)
 989 {
 990   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 991   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 992   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* Remember NITERS as passed in; the latch count and FINAL_IV are
          computed from this unconverted value at the end.  */
 993   tree orig_niters = niters;
 994   gimple_seq preheader_seq = NULL;
 995
 996   /* Create an IV that counts down from niters and whose step
 997      is the number of iterations processed in the current iteration.
 998      Produce the controls with compares like the following.
 999
1000        # iv_2 = PHI <niters, iv_3>
1001        rem_4 = MIN <iv_2, VF>;
1002        remv_6 = { rem_4, rem_4, rem_4, ... }
1003        mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
1004        iv_3 = iv_2 - VF;
1005        if (iv_2 > VF)
1006          continue;
1007
1008      Where the constant is built with elements at most VF - 1 and
1009      repetitions according to max_nscalars_per_iter which is guaranteed
1010      to be the same within a group.  */
1011
1012   /* Convert NITERS to the determined IV type.  */
1013   if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1014       && niters_maybe_zero)
1015     {
1016       /* We know that there is always at least one iteration, so if the
1017          count is zero then it must have wrapped.  Cope with this by
1018          subtracting 1 before the conversion and adding 1 to the result.  */
1019       gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1020       niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1021                              niters, build_minus_one_cst (TREE_TYPE (niters)));
1022       niters = gimple_convert (&preheader_seq, iv_type, niters);
1023       niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1024                              niters, build_one_cst (iv_type));
1025     }
1026   else
1027     niters = gimple_convert (&preheader_seq, iv_type, niters);
1028
1029   /* Bias the initial value of the IV in case we need to skip iterations
1030      at the beginning.  */
1031   tree niters_adj = niters;
1032   if (niters_skip)
1033     {
1034       tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1035       niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1036                                  iv_type, niters, skip);
1037     }
1038
1039   /* The iteration step is the vectorization factor.  */
1040   tree iv_step = build_int_cst (iv_type, vf);
1041
1042   /* Create the decrement IV.  */
1043   tree index_before_incr, index_after_incr;
1044   gimple_stmt_iterator incr_gsi;
1045   bool insert_after;
1046   vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1047   create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1048              &incr_gsi, insert_after, &index_before_incr,
1049              &index_after_incr);
1050
1051   /* Iterate over all the rgroups and fill in their controls.  */
1052   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1053     {
1054       if (rgc.controls.is_empty ())
1055         continue;
1056
1057       tree ctrl_type = rgc.type;
1058       poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1059
1060       tree vectype = rgc.compare_type;
1061
1062       /* index_after_incr is the IV specifying the remaining iterations in
1063          the next iteration.  */
1064       tree rem = index_after_incr;
1065       /* When the data type for the compare to produce the mask is
1066          smaller than the IV type we need to saturate.  Saturate to
1067          the smallest possible value (IV_TYPE) so we only have to
1068          saturate once (CSE will catch redundant ones we add).  */
1069       if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1070         rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1071                             UNKNOWN_LOCATION,
1072                             MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1073       rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1074                             UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1075
1076       /* Build a data vector composed of the remaining iterations.  */
1077       rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078                                           UNKNOWN_LOCATION, vectype, rem);
1079
1080       /* Provide a definition of each vector in the control group.  */
1081       tree next_ctrl = NULL_TREE;
           /* FIRST_REM is built lazily and shared by all controls of this
              rgroup that need a computed initial mask.  */
1082       tree first_rem = NULL_TREE;
1083       tree ctrl;
1084       unsigned int i;
1085       FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1086         {
1087           /* Previous controls will cover BIAS items.  This control covers the
1088              next batch.  */
1089           poly_uint64 bias = nitems_per_ctrl * i;
1090
1091           /* Build the constant to compare the remaining iters against,
1092              this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1093              split into pieces.  */
1094           unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1095           tree_vector_builder builder (vectype, n, 1);
               /* Use a distinct counter so that the control index I of the
                  enclosing loop is not shadowed.  */
1096           for (unsigned j = 0; j < n; ++j)
1097             {
1098               unsigned HOST_WIDE_INT val
1099                 = (j + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1100               gcc_assert (val < vf.to_constant ());
1101               builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1102             }
1103           tree cmp_series = builder.build ();
1104
1105           /* Create the initial control.  First include all items that
1106              are within the loop limit.  */
1107           tree init_ctrl = NULL_TREE;
1108           poly_uint64 const_limit;
1109           /* See whether the first iteration of the vector loop is known
1110              to have a full control.  */
1111           if (poly_int_tree_p (niters, &const_limit)
1112               && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1113             init_ctrl = build_minus_one_cst (ctrl_type);
1114           else
1115             {
1116               /* The remaining work items initially are niters.  Saturate,
1117                  splat and compare.  */
1118               if (!first_rem)
1119                 {
1120                   first_rem = niters;
1121                   if (TYPE_PRECISION (TREE_TYPE (vectype))
1122                       < TYPE_PRECISION (iv_type))
1123                     first_rem = gimple_build (&preheader_seq,
1124                                               MIN_EXPR, TREE_TYPE (first_rem),
1125                                               first_rem, iv_step);
1126                   first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1127                                               first_rem);
1128                   first_rem = gimple_build_vector_from_val (&preheader_seq,
1129                                                             vectype, first_rem);
1130                 }
1131               init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1132                                         cmp_series, first_rem);
1133             }
1134
1135           /* Now AND out the bits that are within the number of skipped
1136              items.  */
1137           poly_uint64 const_skip;
1138           if (niters_skip
1139               && !(poly_int_tree_p (niters_skip, &const_skip)
1140                    && known_le (const_skip, bias)))
1141             {
1142               /* For integer mode masks it's cheaper to shift out the bits
1143                  since that avoids loading a constant.  */
1144               gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1145               init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1146                                         lang_hooks.types.type_for_mode
1147                                           (TYPE_MODE (ctrl_type), 1),
1148                                         init_ctrl);
1149               /* ???  But when the shift amount isn't constant this requires
1150                  a round-trip to GPRs.  We could apply the bias to either
1151                  side of the compare instead.  */
1152               tree shift = gimple_build (&preheader_seq, MULT_EXPR,
1153                                          TREE_TYPE (niters_skip), niters_skip,
1154                                          build_int_cst (TREE_TYPE (niters_skip),
1155                                                         rgc.max_nscalars_per_iter));
1156               init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1157                                         TREE_TYPE (init_ctrl),
1158                                         init_ctrl, shift);
1159               init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1160                                         ctrl_type, init_ctrl);
1161             }
1162
1163           /* Get the control value for the next iteration of the loop.  */
1164           next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1165                                     UNKNOWN_LOCATION,
1166                                     LT_EXPR, ctrl_type, cmp_series, rem);
1167
1168           vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1169         }
1170     }
1171
1172   /* Emit all accumulated statements.  */
1173   add_preheader_seq (loop, preheader_seq);
1174
1175   /* Adjust the exit test using the decrementing IV.  */
1176   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1177   /* When we peel for alignment with niter_skip != 0 this can
1178      cause niter + niter_skip to wrap and since we are comparing the
1179      value before the decrement here we get a false early exit.
1180      We can't compare the value after decrement either because that
1181      decrement could wrap as well as we're not doing a saturating
1182      decrement.  To avoid this situation we force a larger
1183      iv_type.  */
1184   gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1185                                         NULL_TREE, NULL_TREE);
1186   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1187
1188   /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1189      Subtract one from this to get the latch count.  */
1190   tree niters_minus_one
1191     = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1192                    build_minus_one_cst (TREE_TYPE (orig_niters)));
1193   tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
       /* Add the skipped iterations to the value already converted to
          IV_TYPE, so that both operands of the PLUS_EXPR have the type the
          expression is built in even when IV_TYPE differs from the type of
          ORIG_NITERS.  */
1194   if (niters_skip)
1195     niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_adj2,
1196                                fold_convert (iv_type, niters_skip));
1197   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1198                                      niters_adj2, iv_step);
1199
1200   if (final_iv)
1201     {
1202       gassign *assign;
1203       /* If vectorizing an inverted early break loop we have to restart the
1204          scalar loop at niters - vf.  This matches what we do in
1205          vect_gen_vector_loop_niters_mult_vf for non-masked loops.  */
1206       if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1207         {
1208           tree ftype = TREE_TYPE (orig_niters);
1209           tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1210           assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1211         }
1212        else
1213         assign = gimple_build_assign (final_iv, orig_niters);
1214       gsi_insert_on_edge_immediate (exit_edge, assign);
1215     }
1216
1217   return cond_stmt;
1218 }
1219
1220
1221 /* Like vect_set_loop_condition, but handle the case in which the vector
1222    loop handles exactly VF scalars per iteration.
     
        A new IV with step STEP is created in LOOP and compared against a
        limit derived from NITERS; EXIT_EDGE selects the sense of that
        comparison.  The first (loop_vinfo) parameter is unused.  The
        LOOP_COND_GSI argument is overwritten with an iterator at the exit
        condition of EXIT_EDGE before it is used.  LOOP->nb_iterations is
        set to an expression for the latch count.  Return the new gcond.  */
1223
1224 static gcond *
1225 vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1226                                 class loop *loop, tree niters, tree step,
1227                                 tree final_iv, bool niters_maybe_zero,
1228                                 gimple_stmt_iterator loop_cond_gsi)
1229 {
1230   tree indx_before_incr, indx_after_incr;
1231   gcond *cond_stmt;
1232   gcond *orig_cond;
1233   edge pe = loop_preheader_edge (loop);
1234   gimple_stmt_iterator incr_gsi;
1235   bool insert_after;
1236   enum tree_code code;
1237   tree niters_type = TREE_TYPE (niters);
1238
       /* Locate the existing exit test ourselves; the new condition is
          inserted immediately before it.  */
1239   orig_cond = get_loop_exit_condition (exit_edge);
1240   gcc_assert (orig_cond);
1241   loop_cond_gsi = gsi_for_stmt (orig_cond);
1242
1243   tree init, limit;
1244   if (!niters_maybe_zero && integer_onep (step))
1245     {
1246       /* In this case we can use a simple 0-based IV:
1247
1248          A:
1249            x = 0;
1250            do
1251              {
1252                ...
1253                x += 1;
1254              }
1255            while (x < NITERS);  */
1256       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1257       init = build_zero_cst (niters_type);
1258       limit = niters;
1259     }
1260   else
1261     {
1262       /* The following works for all values of NITERS except 0:
1263
1264          B:
1265            x = 0;
1266            do
1267              {
1268                ...
1269                x += STEP;
1270              }
1271            while (x <= NITERS - STEP);
1272
1273          so that the loop continues to iterate if x + STEP - 1 < NITERS
1274          but stops if x + STEP - 1 >= NITERS.
1275
1276          However, if NITERS is zero, x never hits a value above NITERS - STEP
1277          before wrapping around.  There are two obvious ways of dealing with
1278          this:
1279
1280          - start at STEP - 1 and compare x before incrementing it
1281          - start at -1 and compare x after incrementing it
1282
1283          The latter is simpler and is what we use.  The loop in this case
1284          looks like:
1285
1286          C:
1287            x = -1;
1288            do
1289              {
1290                ...
1291                x += STEP;
1292              }
1293            while (x < NITERS - STEP);
1294
1295          In both cases the loop limit is NITERS - STEP.  */
1296       gimple_seq seq = NULL;
1297       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1298       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
           /* Any statements needed to compute LIMIT go on the preheader
              edge; the assert checks that inserting them did not create a
              new basic block.  */
1299       if (seq)
1300         {
1301           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1302           gcc_assert (!new_bb);
1303         }
1304       if (niters_maybe_zero)
1305         {
1306           /* Case C.  */
1307           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1308           init = build_all_ones_cst (niters_type);
1309         }
1310       else
1311         {
1312           /* Case B.  */
1313           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1314           init = build_zero_cst (niters_type);
1315         }
1316     }
1317
       /* Create the IV and make sure both operands of the new condition are
          valid gimple operands at the point of the exit test.  */
1318   vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1319   create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1320              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1321   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1322                                               true, NULL_TREE, true,
1323                                               GSI_SAME_STMT);
1324   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1325                                      true, GSI_SAME_STMT);
1326
       /* The test always uses the value of the IV after the increment.  */
1327   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1328                                  NULL_TREE);
1329
1330   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1331
1332   /* Record the number of latch iterations.  */
1333   if (limit == niters)
1334     /* Case A: the loop iterates NITERS times.  Subtract one to get the
1335        latch count.  */
1336     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1337                                        build_int_cst (niters_type, 1));
1338   else
1339     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1340        Subtract one from this to get the latch count.  */
1341     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1342                                        limit, step);
1343
1344   if (final_iv)
1345     {
1346       gassign *assign;
1347       gcc_assert (single_pred_p (exit_edge->dest));
           /* With a zero INIT the exit PHI can define FINAL_IV directly;
              otherwise it defines a copy from which INIT is subtracted
              below.  */
1348       tree phi_dest
1349         = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1350       /* Make sure to maintain LC SSA form here and elide the subtraction
1351          if the value is zero.  */
1352       gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1353       add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1354       if (!integer_zerop (init))
1355         {
               /* FINAL_IV = PHI_DEST - INIT, placed after the labels of the
                  exit destination block.  */
1356           assign = gimple_build_assign (final_iv, MINUS_EXPR,
1357                                         phi_dest, init);
1358           gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1359           gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1360         }
1361     }
1362
1363   return cond_stmt;
1364 }
1365
1366 /* If we're using fully-masked loops, make LOOP iterate:
1367
1368       N == (NITERS - 1) / STEP + 1
1369
1370    times.  When NITERS is zero, this is equivalent to making the loop
1371    execute (1 << M) / STEP times, where M is the precision of NITERS.
1372    NITERS_MAYBE_ZERO is true if this last case might occur.
1373
1374    If we're not using fully-masked loops, make LOOP iterate:
1375
1376       N == (NITERS - STEP) / STEP + 1
1377
1378    times, where NITERS is known to be outside the range [1, STEP - 1].
1379    This is equivalent to making the loop execute NITERS / STEP times
1380    when NITERS is nonzero and (1 << M) / STEP times otherwise.
1381    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1382
1383    If FINAL_IV is nonnull, it is an SSA name that should be set to
1384    N * STEP on exit from the loop.
1385
        LOOP_E is the exit edge of LOOP whose condition is replaced; the old
        exit test is removed once the new one has been inserted before it.
        LOOP_VINFO may be null, in which case the scheme for loops without
        partial vectors is used.

1386    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
1387
1388 void
1389 vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1390                          tree niters, tree step, tree final_iv,
1391                          bool niters_maybe_zero)
1392 {
1393   gcond *cond_stmt;
1394   gcond *orig_cond = get_loop_exit_condition (loop_e);
1395   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1396
       /* Pick the implementation: AVX512-style partial vectors, other
          partial vectors, or the plain scheme.  STEP is only passed to the
          last of these.  */
1397   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1398     {
1399       if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1400         cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1401                                                                     loop_vinfo,
1402                                                                     niters, final_iv,
1403                                                                     niters_maybe_zero,
1404                                                                     loop_cond_gsi);
1405       else
1406         cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1407                                                              loop_vinfo,
1408                                                              niters, final_iv,
1409                                                              niters_maybe_zero,
1410                                                              loop_cond_gsi);
1411     }
1412   else
1413     cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1414                                                 niters,
1415                                                 step, final_iv,
1416                                                 niters_maybe_zero,
1417                                                 loop_cond_gsi);
1418
1419   /* Remove old loop exit test.  */
       /* If LOOP_VINFO has a stmt_vec_info for the old condition, remove it
          through LOOP_VINFO; otherwise remove the statement via the
          iterator.  */
1420   stmt_vec_info orig_cond_info;
1421   if (loop_vinfo
1422       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1423     loop_vinfo->remove_stmt (orig_cond_info);
1424   else
1425     gsi_remove (&loop_cond_gsi, true);
1426
1427   if (dump_enabled_p ())
1428     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1429                      (gimple *) cond_stmt);
1430 }
1431
1432 /* Given LOOP this function generates a new copy of it and puts it
1433    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1434    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1435    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1436    entry or exit of LOOP.  If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1437    continuation.  This is correct for cases where one loop continues from the
1438    other like in the vectorizer, but not true for uses in e.g. loop distribution
1439    where the contents of the loop body are split but the iteration space of both
1440    copies remains the same.
1441
1442    If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
1443    dominators were updated during the peeling.  When doing early break vectorization
1444    then LOOP_VINFO needs to be provided and is used to keep track of any newly created
1445    memory references that need to be updated should we decide to vectorize.  */
1446
1447 class loop *
1448 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1449                                         class loop *scalar_loop,
1450                                         edge scalar_exit, edge e, edge *new_e,
1451                                         bool flow_loops,
1452                                         vec<basic_block> *updated_doms)
1453 {
1454   class loop *new_loop;
1455   basic_block *new_bbs, *bbs, *pbbs;
1456   bool at_exit;
1457   bool was_imm_dom;
1458   basic_block exit_dest;
1459   edge exit, new_exit;
1460   bool duplicate_outer_loop = false;
1461
1462   exit = loop_exit;
1463   at_exit = (e == exit);
1464   if (!at_exit && e != loop_preheader_edge (loop))
1465     return NULL;
1466
1467   if (scalar_loop == NULL)
1468     {
1469       scalar_loop = loop;
1470       scalar_exit = loop_exit;
1471     }
1472   else if (scalar_loop == loop)
1473     scalar_exit = loop_exit;
1474   else
1475     {
1476       /* Loop has been version, match exits up using the aux index.  */
1477       for (edge exit : get_loop_exit_edges (scalar_loop))
1478         if (exit->aux == loop_exit->aux)
1479           {
1480             scalar_exit = exit;
1481             break;
1482           }
1483
1484       gcc_assert (scalar_exit);
1485     }
1486
1487   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1488   pbbs = bbs + 1;
1489   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1490   /* Allow duplication of outer loops.  */
1491   if (scalar_loop->inner)
1492     duplicate_outer_loop = true;
1493
1494   /* Generate new loop structure.  */
1495   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1496   duplicate_subloops (scalar_loop, new_loop);
1497
1498   exit_dest = exit->dest;
1499   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1500                                           exit_dest) == loop->header ?
1501                  true : false);
1502
1503   /* Also copy the pre-header, this avoids jumping through hoops to
1504      duplicate the loop entry PHI arguments.  Create an empty
1505      pre-header unconditionally for this.  */
1506   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1507   edge entry_e = single_pred_edge (preheader);
1508   bbs[0] = preheader;
1509   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1510
1511   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1512             &scalar_exit, 1, &new_exit, NULL,
1513             at_exit ? loop->latch : e->src, true);
1514   exit = loop_exit;
1515   basic_block new_preheader = new_bbs[0];
1516
1517   gcc_assert (new_exit);
1518
1519   /* Record the new loop exit information.  new_loop doesn't have SCEV data and
1520      so we must initialize the exit information.  */
1521   if (new_e)
1522     *new_e = new_exit;
1523
1524   /* Before installing PHI arguments make sure that the edges
1525      into them match that of the scalar loop we analyzed.  This
1526      makes sure the SLP tree matches up between the main vectorized
1527      loop and the epilogue vectorized copies.  */
1528   if (single_succ_edge (preheader)->dest_idx
1529       != single_succ_edge (new_bbs[0])->dest_idx)
1530     {
1531       basic_block swap_bb = new_bbs[1];
1532       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1533       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1534       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1535       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1536     }
1537   if (duplicate_outer_loop)
1538     {
1539       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1540       if (loop_preheader_edge (scalar_loop)->dest_idx
1541           != loop_preheader_edge (new_inner_loop)->dest_idx)
1542         {
1543           basic_block swap_bb = new_inner_loop->header;
1544           gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1545           std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1546           EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1547           EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1548         }
1549     }
1550
1551   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1552
1553   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1554   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1555     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1556
1557   /* Rename the exit uses.  */
1558   for (edge exit : get_loop_exit_edges (new_loop))
1559     for (auto gsi = gsi_start_phis (exit->dest);
1560          !gsi_end_p (gsi); gsi_next (&gsi))
1561       {
1562         tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1563         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1564         if (MAY_HAVE_DEBUG_BIND_STMTS)
1565           adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1566       }
1567
1568   auto loop_exits = get_loop_exit_edges (loop);
1569   bool multiple_exits_p = loop_exits.length () > 1;
1570   auto_vec<basic_block> doms;
1571   class loop *update_loop = NULL;
1572
1573   if (at_exit) /* Add the loop copy at exit.  */
1574     {
1575       if (scalar_loop != loop && new_exit->dest != exit_dest)
1576         {
1577           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1578           flush_pending_stmts (new_exit);
1579         }
1580
1581       bool multiple_exits_p = loop_exits.length () > 1;
1582       basic_block main_loop_exit_block = new_preheader;
1583       basic_block alt_loop_exit_block = NULL;
1584       /* Create intermediate edge for main exit.  But only useful for early
1585          exits.  */
1586       if (multiple_exits_p)
1587         {
1588           edge loop_e = single_succ_edge (new_preheader);
1589           new_preheader = split_edge (loop_e);
1590         }
1591
1592       auto_vec <gimple *> new_phis;
1593       hash_map <tree, tree> new_phi_args;
1594       /* First create the empty phi nodes so that when we flush the
1595          statements they can be filled in.   However because there is no order
1596          between the PHI nodes in the exits and the loop headers we need to
1597          order them base on the order of the two headers.  First record the new
1598          phi nodes. Then redirect the edges and flush the changes.  This writes
1599          out the new SSA names.  */
1600       for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1601            !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1602         {
1603           gimple *from_phi = gsi_stmt (gsi_from);
1604           tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1605           gphi *res = create_phi_node (new_res, main_loop_exit_block);
1606           new_phis.safe_push (res);
1607         }
1608
1609       for (auto exit : loop_exits)
1610         {
1611           basic_block dest = main_loop_exit_block;
1612           if (exit != loop_exit)
1613             {
1614               if (!alt_loop_exit_block)
1615                 {
1616                   alt_loop_exit_block = split_edge (exit);
1617                   edge res = redirect_edge_and_branch (
1618                                 single_succ_edge (alt_loop_exit_block),
1619                                 new_preheader);
1620                   flush_pending_stmts (res);
1621                   continue;
1622                 }
1623               dest = alt_loop_exit_block;
1624             }
1625           edge e = redirect_edge_and_branch (exit, dest);
1626           flush_pending_stmts (e);
1627         }
1628
1629       bool peeled_iters = single_pred (loop->latch) != loop_exit->src;
1630       /* Record the new SSA names in the cache so that we can skip materializing
1631          them again when we fill in the rest of the LCSSA variables.  */
1632       for (auto phi : new_phis)
1633         {
1634           tree new_arg = gimple_phi_arg_def (phi, loop_exit->dest_idx);
1635
1636           if (!SSA_VAR_P (new_arg))
1637             continue;
1638
1639           /* If the PHI MEM node dominates the loop then we shouldn't create
1640              a new LC-SSSA PHI for it in the intermediate block.   */
1641           /* A MEM phi that consitutes a new DEF for the vUSE chain can either
1642              be a .VDEF or a PHI that operates on MEM. And said definition
1643              must not be inside the main loop.  Or we must be a parameter.
1644              In the last two cases we may remove a non-MEM PHI node, but since
1645              they dominate both loops the removal is unlikely to cause trouble
1646              as the exits must already be using them.  */
1647           if (virtual_operand_p (new_arg)
1648               && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
1649                   || !flow_bb_inside_loop_p (loop,
1650                                 gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
1651             {
1652               auto gsi = gsi_for_stmt (phi);
1653               remove_phi_node (&gsi, true);
1654               continue;
1655             }
1656
1657           /* If we decided not to remove the PHI node we should also not
1658              rematerialize it later on.  */
1659           new_phi_args.put (new_arg, gimple_phi_result (phi));
1660
1661           if (TREE_CODE (new_arg) != SSA_NAME)
1662             continue;
1663         }
1664
1665       /* Copy the current loop LC PHI nodes between the original loop exit
1666          block and the new loop header.  This allows us to later split the
1667          preheader block and still find the right LC nodes.  */
1668       edge loop_entry = single_succ_edge (new_preheader);
1669       if (flow_loops)
1670         {
1671           /* Link through the main exit first.  */
1672           for (auto gsi_from = gsi_start_phis (loop->header),
1673                gsi_to = gsi_start_phis (new_loop->header);
1674                !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1675                gsi_next (&gsi_from), gsi_next (&gsi_to))
1676             {
1677               gimple *from_phi = gsi_stmt (gsi_from);
1678               gimple *to_phi = gsi_stmt (gsi_to);
1679               tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1680                                                     loop_latch_edge (loop));
1681
1682               /* Check if we've already created a new phi node during edge
1683                  redirection.  If we have, only propagate the value
1684                  downwards.  */
1685               if (tree *res = new_phi_args.get (new_arg))
1686                 {
1687                   if (multiple_exits_p)
1688                     new_arg = *res;
1689                   else
1690                     {
1691                       adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1692                       continue;
1693                     }
1694                 }
1695               /* If we have multiple exits and the vector loop is peeled then we
1696                  need to use the value at start of loop.  If we're looking at
1697                  virtual operands we have to keep the original link.   Virtual
1698                  operands don't all become the same because we'll corrupt the
1699                  vUSE chains among others.  */
1700               if (peeled_iters)
1701                 {
1702                   tree tmp_arg = gimple_phi_result (from_phi);
1703                   /* Similar to the single exit case, If we have an existing
1704                      LCSSA variable thread through the original value otherwise
1705                      skip it and directly use the final value.  */
1706                   if (tree *res = new_phi_args.get (tmp_arg))
1707                     new_arg = *res;
1708                   else if (!virtual_operand_p (new_arg))
1709                     new_arg = tmp_arg;
1710                 }
1711
1712               tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1713               gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1714
1715               /* Otherwise, main loop exit should use the final iter value.  */
1716               SET_PHI_ARG_DEF (lcssa_phi, loop_exit->dest_idx, new_arg);
1717
1718               adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1719             }
1720
1721           set_immediate_dominator (CDI_DOMINATORS, main_loop_exit_block,
1722                                    loop_exit->src);
1723
1724           /* Now link the alternative exits.  */
1725           if (multiple_exits_p)
1726             {
1727               for (auto gsi_from = gsi_start_phis (loop->header),
1728                    gsi_to = gsi_start_phis (new_preheader);
1729                    !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1730                    gsi_next (&gsi_from), gsi_next (&gsi_to))
1731                 {
1732                   gimple *from_phi = gsi_stmt (gsi_from);
1733                   gimple *to_phi = gsi_stmt (gsi_to);
1734
1735                   tree alt_arg = gimple_phi_result (from_phi);
1736                   edge main_e = single_succ_edge (alt_loop_exit_block);
1737
1738                   /* Now update the virtual PHI nodes with the right value.  */
1739                   if (peeled_iters
1740                       && virtual_operand_p (alt_arg)
1741                       && flow_bb_inside_loop_p (loop,
1742                                 gimple_bb (SSA_NAME_DEF_STMT (alt_arg))))
1743                     {
1744                         /* Link the alternative exit one.  */
1745                         tree def
1746                           = gimple_phi_arg_def (to_phi, loop_exit->dest_idx);
1747                         gphi *def_phi = as_a <gphi *> (SSA_NAME_DEF_STMT (def));
1748                         SET_PHI_ARG_DEF (def_phi, 0, alt_arg);
1749
1750                         /* And now the main merge block.  */
1751                         gphi *iter_phi
1752                           = as_a <gphi *> (SSA_NAME_DEF_STMT (alt_arg));
1753                         unsigned latch_idx
1754                           = single_succ_edge (loop->latch)->dest_idx;
1755                         tree exit_val
1756                           = gimple_phi_arg_def (iter_phi, latch_idx);
1757                         alt_arg = copy_ssa_name (def);
1758                         gphi *l_phi = create_phi_node (alt_arg, main_e->src);
1759                         SET_PHI_ARG_DEF (l_phi, 0, exit_val);
1760                     }
1761                   SET_PHI_ARG_DEF (to_phi, main_e->dest_idx, alt_arg);
1762                 }
1763
1764               set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1765                                        loop->header);
1766             }
1767         }
1768
1769       if (was_imm_dom || duplicate_outer_loop)
1770         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1771
1772       /* And remove the non-necessary forwarder again.  Keep the other
1773          one so we have a proper pre-header for the loop at the exit edge.  */
1774       redirect_edge_pred (single_succ_edge (preheader),
1775                           single_pred (preheader));
1776       delete_basic_block (preheader);
1777       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1778                                loop_preheader_edge (scalar_loop)->src);
1779
1780       /* Finally after wiring the new epilogue we need to update its main exit
1781          to the original function exit we recorded.  Other exits are already
1782          correct.  */
1783       if (multiple_exits_p)
1784         {
1785           update_loop = new_loop;
1786           doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1787           for (unsigned i = 0; i < doms.length (); ++i)
1788             if (flow_bb_inside_loop_p (loop, doms[i]))
1789               doms.unordered_remove (i);
1790         }
1791     }
1792   else /* Add the copy at entry.  */
1793     {
1794       /* Copy the current loop LC PHI nodes between the original loop exit
1795          block and the new loop header.  This allows us to later split the
1796          preheader block and still find the right LC nodes.  */
1797       if (flow_loops)
1798         for (auto gsi_from = gsi_start_phis (new_loop->header),
1799              gsi_to = gsi_start_phis (loop->header);
1800              !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1801              gsi_next (&gsi_from), gsi_next (&gsi_to))
1802           {
1803             gimple *from_phi = gsi_stmt (gsi_from);
1804             gimple *to_phi = gsi_stmt (gsi_to);
1805             tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1806                                                   loop_latch_edge (new_loop));
1807             adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1808                                         new_arg);
1809           }
1810
1811       if (scalar_loop != loop)
1812         {
1813           /* Remove the non-necessary forwarder of scalar_loop again.  */
1814           redirect_edge_pred (single_succ_edge (preheader),
1815                               single_pred (preheader));
1816           delete_basic_block (preheader);
1817           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1818                                    loop_preheader_edge (scalar_loop)->src);
1819           preheader = split_edge (loop_preheader_edge (loop));
1820           entry_e = single_pred_edge (preheader);
1821         }
1822
1823       redirect_edge_and_branch_force (entry_e, new_preheader);
1824       flush_pending_stmts (entry_e);
1825       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1826
1827       redirect_edge_and_branch_force (new_exit, preheader);
1828       flush_pending_stmts (new_exit);
1829       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1830
1831       /* And remove the non-necessary forwarder again.  Keep the other
1832          one so we have a proper pre-header for the loop at the exit edge.  */
1833       redirect_edge_pred (single_succ_edge (new_preheader),
1834                           single_pred (new_preheader));
1835       delete_basic_block (new_preheader);
1836       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1837                                loop_preheader_edge (new_loop)->src);
1838
1839       if (multiple_exits_p)
1840         update_loop = loop;
1841     }
1842
1843   if (multiple_exits_p)
1844     {
1845       for (edge e : get_loop_exit_edges (update_loop))
1846         {
1847           edge ex;
1848           edge_iterator ei;
1849           FOR_EACH_EDGE (ex, ei, e->dest->succs)
1850             {
1851               /* Find the first non-fallthrough block as fall-throughs can't
1852                  dominate other blocks.  */
1853               if (single_succ_p (ex->dest))
1854                 {
1855                   doms.safe_push (ex->dest);
1856                   ex = single_succ_edge (ex->dest);
1857                 }
1858               doms.safe_push (ex->dest);
1859             }
1860           doms.safe_push (e->dest);
1861         }
1862
1863       iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1864       if (updated_doms)
1865         updated_doms->safe_splice (doms);
1866     }
1867
1868   free (new_bbs);
1869   free (bbs);
1870
1871   checking_verify_dominators (CDI_DOMINATORS);
1872
1873   return new_loop;
1874 }
1875
1876
1877 /* Given the condition expression COND, put it as the last statement of
1878    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1879    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1880    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1881    new edge as irreducible if IRREDUCIBLE_P is true.  */
1882
1883 static edge
1884 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1885                        basic_block guard_to, basic_block dom_bb,
1886                        profile_probability probability, bool irreducible_p)
1887 {
1888   gimple_stmt_iterator gsi;
1889   edge new_e, enter_e;
1890   gcond *cond_stmt;
1891   gimple_seq gimplify_stmt_list = NULL;
1892
1893   enter_e = EDGE_SUCC (guard_bb, 0);
1894   enter_e->flags &= ~EDGE_FALLTHRU;
1895   enter_e->flags |= EDGE_FALSE_VALUE;
1896   gsi = gsi_last_bb (guard_bb);
1897
1898   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
1899                                  is_gimple_condexpr_for_cond, NULL_TREE);
1900   if (gimplify_stmt_list)
1901     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1902
1903   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1904   gsi = gsi_last_bb (guard_bb);
1905   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1906
1907   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1908   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1909
1910   new_e->probability = probability;
1911   if (irreducible_p)
1912     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1913
1914   enter_e->probability = probability.invert ();
1915   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1916
1917   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1918   if (enter_e->dest->loop_father->header == enter_e->dest)
1919     split_edge (enter_e);
1920
1921   return new_e;
1922 }
1923
1924
1925 /* This function verifies that the following restrictions apply to LOOP:
1926    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1927        for innermost loop and 5 basic blocks for outer-loop.
1928    (2) it is single entry, single exit
1929    (3) its exit condition is the last stmt in the header
1930    (4) E is the entry/exit edge of LOOP.
1931  */
1932
1933 bool
1934 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
1935                              const_edge e)
1936 {
1937   edge entry_e = loop_preheader_edge (loop);
1938   gcond *orig_cond = get_loop_exit_condition (exit_e);
1939   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1940
1941   /* All loops have an outer scope; the only case loop->outer is NULL is for
1942      the function itself.  */
1943   if (!loop_outer (loop)
1944       || !empty_block_p (loop->latch)
1945       || !exit_e
1946       /* Verify that new loop exit condition can be trivially modified.  */
1947       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1948       || (e != exit_e && e != entry_e))
1949     return false;
1950
1951   basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
1952   get_loop_body_with_size (loop, bbs, loop->num_nodes);
1953   bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
1954   free (bbs);
1955   return ret;
1956 }
1957
1958 /* Function find_loop_location.
1959
1960    Extract the location of the loop in the source code.
1961    If the loop is not well formed for vectorization, an estimated
1962    location is calculated.
1963    Return the loop location if succeed and NULL if not.  */
1964
1965 dump_user_location_t
1966 find_loop_location (class loop *loop)
1967 {
1968   gimple *stmt = NULL;
1969   basic_block bb;
1970   gimple_stmt_iterator si;
1971
1972   if (!loop)
1973     return dump_user_location_t ();
1974
1975   /* For the root of the loop tree return the function location.  */
1976   if (!loop_outer (loop))
1977     return dump_user_location_t::from_function_decl (cfun->decl);
1978
1979   if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
1980     {
1981       /* We only care about the loop location, so use any exit with location
1982          information.  */
1983       for (edge e : get_loop_exit_edges (loop))
1984         {
1985           stmt = get_loop_exit_condition (e);
1986
1987           if (stmt
1988               && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1989             return stmt;
1990         }
1991     }
1992
1993   /* If we got here the loop is probably not "well formed",
1994      try to estimate the loop location */
1995
1996   if (!loop->header)
1997     return dump_user_location_t ();
1998
1999   bb = loop->header;
2000
2001   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2002     {
2003       stmt = gsi_stmt (si);
2004       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2005         return stmt;
2006     }
2007
2008   return dump_user_location_t ();
2009 }
2010
2011 /* Return true if the phi described by STMT_INFO defines an IV of the
2012    loop to be vectorized.  */
2013
2014 static bool
2015 iv_phi_p (stmt_vec_info stmt_info)
2016 {
2017   gphi *phi = as_a <gphi *> (stmt_info->stmt);
2018   if (virtual_operand_p (PHI_RESULT (phi)))
2019     return false;
2020
2021   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2022       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2023     return false;
2024
2025   return true;
2026 }
2027
2028 /* Return true if vectorizer can peel for nonlinear iv.  */
2029 static bool
2030 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2031                               stmt_vec_info stmt_info)
2032 {
2033   enum vect_induction_op_type induction_type
2034     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2035   tree niters_skip;
2036   /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2037      if niters or vf is unkown:
2038      For shift, when shift mount >= precision, there would be UD.
2039      For mult, don't known how to generate
2040      init_expr * pow (step, niters) for variable niters.
2041      For neg, it should be ok, since niters of vectorized main loop
2042      will always be multiple of 2.  */
2043   if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2044        || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
2045       && induction_type != vect_step_op_neg)
2046     {
2047       if (dump_enabled_p ())
2048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2049                          "Peeling for epilogue is not supported"
2050                          " for nonlinear induction except neg"
2051                          " when iteration count is unknown.\n");
2052       return false;
2053     }
2054
2055   /* Avoid compile time hog on vect_peel_nonlinear_iv_init.  */
2056   if (induction_type == vect_step_op_mul)
2057     {
2058       tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2059       tree type = TREE_TYPE (step_expr);
2060
2061       if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2062           && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2063         {
2064           if (dump_enabled_p ())
2065             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2066                              "Avoid compile time hog on"
2067                              " vect_peel_nonlinear_iv_init"
2068                              " for nonlinear induction vec_step_op_mul"
2069                              " when iteration count is too big.\n");
2070           return false;
2071         }
2072     }
2073
2074   /* Also doens't support peel for neg when niter is variable.
2075      ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
2076   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2077   if ((niters_skip != NULL_TREE
2078        && (TREE_CODE (niters_skip) != INTEGER_CST
2079            || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2080       || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2081           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2082     {
2083       if (dump_enabled_p ())
2084         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085                          "Peeling for alignement is not supported"
2086                          " for nonlinear induction when niters_skip"
2087                          " is not constant.\n");
2088       return false;
2089     }
2090
2091   /* We can't support partial vectors and early breaks with an induction
2092      type other than add or neg since we require the epilog and can't
2093      perform the peeling.  The below condition mirrors that of
2094      vect_gen_vector_loop_niters  where niters_vector_mult_vf_var then sets
2095      step_vector to VF rather than 1.  This is what creates the nonlinear
2096      IV.  PR113163.  */
2097   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2098       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2099       && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2100       && induction_type != vect_step_op_neg)
2101     {
2102       if (dump_enabled_p ())
2103         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2104                          "Peeling for epilogue is not supported"
2105                          " for nonlinear induction except neg"
2106                          " when VF is known and early breaks.\n");
2107       return false;
2108     }
2109
2110   return true;
2111 }
2112
2113 /* Function vect_can_advance_ivs_p
2114
2115    In case the number of iterations that LOOP iterates is unknown at compile
2116    time, an epilog loop will be generated, and the loop induction variables
2117    (IVs) will be "advanced" to the value they are supposed to take just before
2118    the epilog loop.  Here we check that the access function of the loop IVs
2119    and the expression that represents the loop bound are simple enough.
2120    These restrictions will be relaxed in the future.  */
2121
2122 bool
2123 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2124 {
2125   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2126   basic_block bb = loop->header;
2127   gphi_iterator gsi;
2128
2129   /* Analyze phi functions of the loop header.  */
2130
2131   if (dump_enabled_p ())
2132     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2133   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2134     {
2135       tree evolution_part;
2136       enum vect_induction_op_type induction_type;
2137
2138       gphi *phi = gsi.phi ();
2139       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2140       if (dump_enabled_p ())
2141         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2142                          phi_info->stmt);
2143
2144       /* Skip virtual phi's. The data dependences that are associated with
2145          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2146
2147          Skip reduction phis.  */
2148       if (!iv_phi_p (phi_info))
2149         {
2150           if (dump_enabled_p ())
2151             dump_printf_loc (MSG_NOTE, vect_location,
2152                              "reduc or virtual phi. skip.\n");
2153           continue;
2154         }
2155
2156       induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2157       if (induction_type != vect_step_op_add)
2158         {
2159           if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2160             return false;
2161
2162           continue;
2163         }
2164
2165       /* Analyze the evolution function.  */
2166
2167       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2168       if (evolution_part == NULL_TREE)
2169         {
2170           if (dump_enabled_p ())
2171             dump_printf (MSG_MISSED_OPTIMIZATION,
2172                          "No access function or evolution.\n");
2173           return false;
2174         }
2175
2176       /* FORNOW: We do not transform initial conditions of IVs
2177          which evolution functions are not invariants in the loop.  */
2178
2179       if (!expr_invariant_in_loop_p (loop, evolution_part))
2180         {
2181           if (dump_enabled_p ())
2182             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2183                              "evolution not invariant in loop.\n");
2184           return false;
2185         }
2186
2187       /* FORNOW: We do not transform initial conditions of IVs
2188          which evolution functions are a polynomial of degree >= 2.  */
2189
2190       if (tree_is_chrec (evolution_part))
2191         {
2192           if (dump_enabled_p ())
2193             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2194                              "evolution is chrec.\n");
2195           return false;
2196         }
2197     }
2198
2199   return true;
2200 }
2201
2202
2203 /*   Function vect_update_ivs_after_vectorizer.
2204
2205      "Advance" the induction variables of LOOP to the value they should take
2206      after the execution of LOOP.  This is currently necessary because the
2207      vectorizer does not handle induction variables that are used after the
2208      loop.  Such a situation occurs when the last iterations of LOOP are
2209      peeled, because:
2210      1. We introduced new uses after LOOP for IVs that were not originally used
2211         after LOOP: the IVs of LOOP are now used by an epilog loop.
2212      2. LOOP is going to be vectorized; this means that it will iterate N/VF
2213         times, whereas the loop IVs should be bumped N times.
2214
2215      Input:
2216      - LOOP_VINFO - describes LOOP, a loop that is going to be vectorized.
2217               The last few iterations of LOOP were peeled.
2218      - NITERS - the number of iterations that LOOP executes (before it is
2219                 vectorized), i.e., the number of times the ivs should be bumped.
2220      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2221                   coming out from LOOP on which there are uses of the LOOP ivs
2222                   (this is the path from LOOP->exit to epilog_loop->preheader).
2223
2224                   The new definitions of the ivs are placed in LOOP->exit.
2225                   The phi args associated with the edge UPDATE_E in the bb
2226                   UPDATE_E->dest are updated accordingly.
2227
2228      Assumption 1: Like the rest of the vectorizer, this function assumes
2229      a single loop exit that has a single predecessor.
2230
2231      Assumption 2: The phi nodes in the LOOP header and in update_bb are
2232      organized in the same order.
2233
2234      Assumption 3: The access function of the ivs is simple enough (see
2235      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
2236
2237      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2238      coming out of LOOP on which the ivs of LOOP are used (this is the path
2239      that leads to the epilog loop; other paths skip the epilog loop).  This
2240      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2241      needs to have its phis updated.

          How each IV is advanced depends on its recorded evolution type:
          - vect_step_op_add: the new value is INIT + NITERS * STEP;
          - vect_step_op_neg: the initial value is reused unchanged;
          - anything else: the value is computed by
            vect_peel_nonlinear_iv_init.
2242  */
2243
2244 static void
2245 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2246                                   tree niters, edge update_e)
2247 {
2248   gphi_iterator gsi, gsi1;
2249   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2250   basic_block update_bb = update_e->dest;
2251   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
       /* All new statements are inserted relative to this position, which is
          computed once before the loop below.  */
2252   gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2253
       /* Walk the header phis of LOOP and the phis of UPDATE_BB in lockstep
          (see Assumption 2), stopping as soon as either list is exhausted.
          A skipped phi (the "continue" below) still advances both
          iterators.  */
2254   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2255        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2256        gsi_next (&gsi), gsi_next (&gsi1))
2257     {
2258       tree init_expr;
2259       tree step_expr, off;
2260       tree type;
2261       tree var, ni, ni_name;
2262
2263       gphi *phi = gsi.phi ();
2264       gphi *phi1 = gsi1.phi ();
2265       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2266       if (dump_enabled_p ())
2267         dump_printf_loc (MSG_NOTE, vect_location,
2268                          "vect_update_ivs_after_vectorizer: phi: %G",
2269                          (gimple *) phi);
2270
2271       /* Skip reduction and virtual phis.  */
2272       if (!iv_phi_p (phi_info))
2273         {
2274           if (dump_enabled_p ())
2275             dump_printf_loc (MSG_NOTE, vect_location,
2276                              "reduc or virtual phi. skip.\n");
2277           continue;
2278         }
2279
2280       type = TREE_TYPE (gimple_phi_result (phi));
2281       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
           /* Work on a private copy of the step so that the expressions built
              below do not share tree nodes with the recorded evolution.  */
2282       step_expr = unshare_expr (step_expr);
2283
2284       /* FORNOW: We do not support IVs whose evolution function is a polynomial
2285          of degree >= 2 or exponential.  */
2286       gcc_assert (!tree_is_chrec (step_expr));
2287
           /* The value the IV has on entry to LOOP.  */
2288       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2289       gimple_seq stmts = NULL;
2290       enum vect_induction_op_type induction_type
2291         = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2292
2293       if (induction_type == vect_step_op_add)
2294         {
               /* Linear IV: ni = init + niters * step, with the arithmetic
                  carried out in the type of the step.  */
2295           tree stype = TREE_TYPE (step_expr);
2296           off = fold_build2 (MULT_EXPR, stype,
2297                                fold_convert (stype, niters), step_expr);
2298
2299           if (POINTER_TYPE_P (type))
2300             ni = fold_build_pointer_plus (init_expr, off);
2301           else
2302             ni = fold_convert (type,
2303                                fold_build2 (PLUS_EXPR, stype,
2304                                             fold_convert (stype, init_expr),
2305                                             off));
2306         }
2307       /* For a negating IV the initial value is reused as is; there is no
              need to call vect_peel_nonlinear_iv_init.  */
2308       else if (induction_type == vect_step_op_neg)
2309         ni = init_expr;
2310       else
2311         ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2312                                           niters, step_expr,
2313                                           induction_type);
2314
2315       var = create_tmp_var (type, "tmp");
2316
           /* Gimplify NI; any statements needed to compute it are collected
              in NEW_STMTS.  */
2317       gimple_seq new_stmts = NULL;
2318       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2319
2320       /* Exit_bb is not expected to be empty, but both cases are handled:
              insert after its last statement when there is one, otherwise
              insert into the empty block.  */
2321       if (!gsi_end_p (last_gsi))
2322         {
2323           gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2324           gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2325         }
2326       else
2327         {
2328           gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2329           gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2330         }
2331
2332       /* Fix phi expressions in the successor bb.  */
2333       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2334     }
2335 }
2336
2337 /* Return a gimple value containing the misalignment (measured in vector
2338    elements) for the loop described by LOOP_VINFO, i.e. how many elements
2339    it is away from a perfectly aligned address.  Add any new statements
2340    to SEQ.

        The data reference examined is LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
        The result is computed in the unsigned integer type corresponding to
        the type of the start address.  */
2341
2342 static tree
2343 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2344 {
2345   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2346   stmt_vec_info stmt_info = dr_info->stmt;
2347   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2348
2349   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2350   unsigned HOST_WIDE_INT target_align_c;
2351   tree target_align_minus_1;
2352
       /* For a negative step, offset the base address by
          (1 - number of vector elements) * element size bytes; otherwise
          use a zero offset.  */
2353   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2354                                         size_zero_node) < 0;
2355   tree offset = (negative
2356                  ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2357                              * TREE_INT_CST_LOW
2358                                  (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2359                  : size_zero_node);
2360   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2361                                                           stmt_info, seq,
2362                                                           offset);
2363   tree type = unsigned_type_for (TREE_TYPE (start_addr));
2364   if (target_align.is_constant (&target_align_c))
2365     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2366   else
2367     {
           /* The target alignment is not a compile-time constant.  Build
              (align & -align) - 1; ALIGN & -ALIGN keeps only the lowest set
              bit of ALIGN.  */
2368       tree vla = build_int_cst (type, target_align);
2369       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2370                                     fold_build2 (MINUS_EXPR, type,
2371                                                  build_int_cst (type, 0), vla));
2372       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2373                                           build_int_cst (type, 1));
2374     }
2375
       /* The division by the element size below is done as a right shift by
          log2 of the element size.  */
2376   HOST_WIDE_INT elem_size
2377     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2378   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2379
2380   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
2381   tree int_start_addr = fold_convert (type, start_addr);
2382   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2383                                         target_align_minus_1);
2384
2385   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
2386   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2387                                         elem_size_log);
2388
2389   return misalign_in_elems;
2390 }
2391
2392 /* Function vect_gen_prolog_loop_niters
2393
2394    Generate the number of iterations which should be peeled as prolog for the
2395    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
2396    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2397    As a result, after the execution of this loop, the data reference DR will
2398    refer to an aligned location.  The following computation is generated:
2399
2400    If the misalignment of DR is known at compile time:
2401      addr_mis = int mis = DR_MISALIGNMENT (dr);
2402    Else, compute address misalignment in bytes:
2403      addr_mis = addr & (target_align - 1)
2404
2405    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2406
2407    (elem_size = element type size; an element is the scalar element whose type
2408    is the inner type of the vectype)
2409
2410    The computations will be emitted at the end of BB.  We also compute and
2411    store upper bound (included) of the result in BOUND.
2412
2413    When the step of the data-ref in the loop is not 1 (as in interleaved data
2414    and SLP), the number of iterations of the prolog must be divided by the step
2415    (which is equal to the size of interleaved group).
2416
2417    The above formulas assume that VF == number of elements in the vector. This
2418    may not hold when there are multiple-types in the loop.
2419    In this case, for some data-references in the loop the VF does not represent
2420    the number of elements that fit in the vector.  Therefore, instead of VF we
2421    use TYPE_VECTOR_SUBPARTS.

        BB must have a single successor whenever statements need to be
        emitted.  *BOUND is set to -1 when the alignment in elements is not a
        compile-time constant.  The return value is a gimple operand of the
        type of LOOP_VINFO_NITERS (LOOP_VINFO).  */
2422
2423 static tree
2424 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2425                              basic_block bb, int *bound)
2426 {
2427   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2428   tree var;
2429   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2430   gimple_seq stmts = NULL, new_stmts = NULL;
2431   tree iters, iters_name;
2432   stmt_vec_info stmt_info = dr_info->stmt;
2433   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2434   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2435
       /* A positive LOOP_VINFO_PEELING_FOR_ALIGNMENT is used directly as the
          peel count and as its upper bound.  */
2436   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2437     {
2438       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2439
2440       if (dump_enabled_p ())
2441         dump_printf_loc (MSG_NOTE, vect_location,
2442                          "known peeling = %d.\n", npeel);
2443
2444       iters = build_int_cst (niters_type, npeel);
2445       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2446     }
2447   else
2448     {
           /* Otherwise derive the peel count from the misalignment (in
              elements) of the unaligned data reference; statements computing
              it are collected in STMTS.  */
2449       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2450       tree type = TREE_TYPE (misalign_in_elems);
2451       HOST_WIDE_INT elem_size
2452         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2453       /* We only do prolog peeling if the target alignment is known at compile
2454          time.  */
2455       poly_uint64 align_in_elems =
2456         exact_div (target_align, elem_size);
2457       tree align_in_elems_minus_1 =
2458         build_int_cst (type, align_in_elems - 1);
2459       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2460
2461       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
2462                                  & (align_in_elems - 1)).
              For a negative step the subtraction is done the other way
              round: (misalign_in_elems - align_in_elems).  */
2463       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2464                                             size_zero_node) < 0;
2465       if (negative)
2466         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2467                              align_in_elems_tree);
2468       else
2469         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2470                              misalign_in_elems);
2471       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2472       iters = fold_convert (niters_type, iters);
           /* The masked result is at most align_in_elems - 1; record -1 when
              that value is not a compile-time constant.  */
2473       unsigned HOST_WIDE_INT align_in_elems_c;
2474       if (align_in_elems.is_constant (&align_in_elems_c))
2475         *bound = align_in_elems_c - 1;
2476       else
2477         *bound = -1;
2478     }
2479
2480   if (dump_enabled_p ())
2481     dump_printf_loc (MSG_NOTE, vect_location,
2482                      "niters for prolog loop: %T\n", iters);
2483
2484   var = create_tmp_var (niters_type, "prolog_loop_niters");
2485   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2486
       /* Emit the misalignment computation first, followed by the statements
          produced by gimplifying ITERS, at the end of BB.  */
2487   if (new_stmts)
2488     gimple_seq_add_seq (&stmts, new_stmts);
2489   if (stmts)
2490     {
2491       gcc_assert (single_succ_p (bb));
2492       gimple_stmt_iterator gsi = gsi_last_bb (bb);
2493       if (gsi_end_p (gsi))
2494         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2495       else
2496         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2497     }
2498   return iters_name;
2499 }
2500
2501
2502 /* Function vect_update_init_of_dr
2503
2504    If CODE is PLUS, the vector loop starts NITERS iterations after the
2505    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2506    iterations before the scalar one (using masking to skip inactive
2507    elements).  This function updates the information recorded in DR to
2508    account for the difference.  Specifically, it updates the OFFSET
2509    field of DR_INFO to OFFSET <CODE> (NITERS * DR_STEP), computed in
        sizetype.  */
2510
2511 static void
2512 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2513 {
2514   struct data_reference *dr = dr_info->dr;
2515   tree offset = dr_info->offset;
       /* A missing offset is treated as zero.  */
2516   if (!offset)
2517     offset = build_zero_cst (sizetype);
2518
       /* Scale the iteration count by the step of the data reference.  */
2519   niters = fold_build2 (MULT_EXPR, sizetype,
2520                         fold_convert (sizetype, niters),
2521                         fold_convert (sizetype, DR_STEP (dr)));
2522   offset = fold_build2 (code, sizetype,
2523                         fold_convert (sizetype, offset), niters);
2524   dr_info->offset = offset;
2525 }
2526
2527
2528 /* Function vect_update_inits_of_drs
2529
2530    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO, except
        gather/scatter and SIMD lane accesses, which are left untouched.
2531    CODE and NITERS are as for vect_update_init_of_dr.  */
2532
2533 void
2534 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2535                           tree_code code)
2536 {
2537   unsigned int i;
2538   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2539   struct data_reference *dr;
2540
2541   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2542
2543   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
2544      here, but since we might use these niters to update the epilogues niters
2545      and data references we can't insert them here as this definition might not
2546      always dominate its uses.  */
2547   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2548     niters = fold_convert (sizetype, niters);
2549
2550   FOR_EACH_VEC_ELT (datarefs, i, dr)
2551     {
2552       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
           /* Only update data references that are neither gather/scatter
              nor SIMD lane accesses.  */
2553       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2554           && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2555         vect_update_init_of_dr (dr_info, niters, code);
2556     }
2557 }
2558
2559 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2560    by masking.  This involves calculating the number of iterations to
2561    be peeled and then aligning all memory references appropriately.

        The computed count is stored in LOOP_VINFO_MASK_SKIP_NITERS
        (LOOP_VINFO) and the data references are moved back by that many
        iterations (MINUS_EXPR).  Must only be called when
        vect_use_loop_mask_for_alignment_p (LOOP_VINFO) holds.  */
2562
2563 void
2564 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2565 {
2566   tree misalign_in_elems;
2567   tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2568
2569   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2570
2571   /* From the information recorded in LOOP_VINFO get the number of iterations
2572      that need to be skipped via masking.  */
2573   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2574     {
           /* Known peeling amount: the number of skipped elements is
              VF - peeling amount.  */
2575       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2576                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2577       misalign_in_elems = build_int_cst (type, misalign);
2578     }
2579   else
2580     {
           /* Otherwise compute the misalignment with generated code, convert
              it to the niters type, and emit the statements on the loop
              preheader edge.  The assert below checks that the insertion did
              not create a new basic block.  */
2581       gimple_seq seq1 = NULL, seq2 = NULL;
2582       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2583       misalign_in_elems = fold_convert (type, misalign_in_elems);
2584       misalign_in_elems = force_gimple_operand (misalign_in_elems,
2585                                                 &seq2, true, NULL_TREE);
2586       gimple_seq_add_seq (&seq1, seq2);
2587       if (seq1)
2588         {
2589           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2590           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2591           gcc_assert (!new_bb);
2592         }
2593     }
2594
2595   if (dump_enabled_p ())
2596     dump_printf_loc (MSG_NOTE, vect_location,
2597                      "misalignment for fully-masked loop: %T\n",
2598                      misalign_in_elems);
2599
2600   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2601
2602   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2603 }
2604
2605 /* This function builds ni_name = number of iterations.  Statements
2606    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
2607    it to TRUE if new ssa_var is generated.  *NEW_VAR_P is never cleared
        here, so callers that rely on it should initialize it themselves.  */
2608
2609 tree
2610 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2611 {
2612   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
       /* A constant iteration count needs no statements and is returned
          directly.  */
2613   if (TREE_CODE (ni) == INTEGER_CST)
2614     return ni;
2615   else
2616     {
2617       tree ni_name, var;
2618       gimple_seq stmts = NULL;
2619       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2620
2621       var = create_tmp_var (TREE_TYPE (ni), "niters");
2622       ni_name = force_gimple_operand (ni, &stmts, false, var);
           /* Only report a new variable when gimplification actually
              produced statements.  */
2623       if (stmts)
2624         {
2625           gsi_insert_seq_on_edge_immediate (pe, stmts);
2626           if (new_var_p != NULL)
2627             *new_var_p = true;
2628         }
2629
2630       return ni_name;
2631     }
2632 }
2633
2634 /* Calculate the number of iterations above which vectorized loop will be
2635    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
2636    of prolog loop.  If it's integer const, the integer number is also passed
2637    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
2638    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
2639    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
2640    threshold below which the scalar (rather than vectorized) loop will be
2641    executed.  This function stores the upper bound (inclusive) of the result
2642    in BOUND_SCALAR.

        A non-negative INT_NITERS_PROLOG is treated as a known constant prolog
        count.  Without a profitability check the result is simply
        NITERS_PROLOG + BOUND_EPILOG.  */
2643
2644 static tree
2645 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2646                              int bound_prolog, poly_int64 bound_epilog, int th,
2647                              poly_uint64 *bound_scalar,
2648                              bool check_profitability)
2649 {
2650   tree type = TREE_TYPE (niters_prolog);
2651   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2652                              build_int_cst (type, bound_epilog));
2653
2654   *bound_scalar = bound_prolog + bound_epilog;
2655   if (check_profitability)
2656     {
2657       /* TH indicates the minimum niters of vectorized loop, while we
2658          compute the maximum niters of scalar loop.  */
2659       th--;
2660       /* Peeling for constant times.  */
2661       if (int_niters_prolog >= 0)
2662         {
2663           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2664           return build_int_cst (type, *bound_scalar);
2665         }
2666       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
2667          and BOUND_EPILOG are inclusive upper bounds.  */
2668       if (known_ge (th, bound_prolog + bound_epilog))
2669         {
               /* The threshold always dominates, so it is the result.  */
2670           *bound_scalar = th;
2671           return build_int_cst (type, th);
2672         }
2673       /* Need to do runtime comparison.  */
2674       else if (maybe_gt (th, bound_epilog))
2675         {
2676           *bound_scalar = upper_bound (*bound_scalar, th);
2677           return fold_build2 (MAX_EXPR, type,
2678                               build_int_cst (type, th), niters);
2679         }
2680     }
       /* Either no profitability check was requested or none of the cases
          above applied.  */
2681   return niters;
2682 }
2683
2684 /* NITERS is the number of times that the original scalar loop executes
2685    after peeling.  Work out the maximum number of iterations N that can
2686    be handled by the vectorized form of the loop and then either:
2687
2688    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2689
2690         niters_vector = N
2691
2692    b) set *STEP_VECTOR_PTR to one and generate:
2693
2694         niters_vector = N / vf
2695
2696    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2697    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
2698    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).

        Form b) is used when the vectorization factor is a compile-time
        constant and the loop does not use partial vectors; form a) is used
        otherwise.  */
2699
2700 void
2701 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2702                              tree *niters_vector_ptr, tree *step_vector_ptr,
2703                              bool niters_no_overflow)
2704 {
2705   tree ni_minus_gap, var;
2706   tree niters_vector, step_vector, type = TREE_TYPE (niters);
2707   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2708   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
       /* Stays NULL_TREE in form a); also used below to decide whether range
          information can be attached.  */
2709   tree log_vf = NULL_TREE;
2710
2711   /* If epilogue loop is required because of data accesses with gaps, we
2712      subtract one iteration from the total number of iterations here for
2713      correct calculation of RATIO.  */
2714   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2715     {
2716       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2717                                   build_one_cst (type));
2718       if (!is_gimple_val (ni_minus_gap))
2719         {
2720           var = create_tmp_var (type, "ni_gap");
2721           gimple *stmts = NULL;
2722           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2723                                                true, var);
2724           gsi_insert_seq_on_edge_immediate (pe, stmts);
2725         }
2726     }
2727   else
2728     ni_minus_gap = niters;
2729
2730   /* To silence some unexpected warnings, simply initialize to 0. */
2731   unsigned HOST_WIDE_INT const_vf = 0;
2732   if (vf.is_constant (&const_vf)
2733       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2734     {
2735       /* Create: niters >> log2(vf) */
2736       /* If it's known that niters == number of latch executions + 1 doesn't
2737          overflow, we can generate niters >> log2(vf); otherwise we generate
2738          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2739          will be at least one.  */
2740       log_vf = build_int_cst (type, exact_log2 (const_vf));
2741       if (niters_no_overflow)
2742         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2743       else
2744         niters_vector
2745           = fold_build2 (PLUS_EXPR, type,
2746                          fold_build2 (RSHIFT_EXPR, type,
2747                                       fold_build2 (MINUS_EXPR, type,
2748                                                    ni_minus_gap,
2749                                                    build_int_cst (type, vf)),
2750                                       log_vf),
2751                          build_int_cst (type, 1));
2752       step_vector = build_one_cst (type);
2753     }
2754   else
2755     {
           /* Form a): keep the scalar count and step by VF instead.  */
2756       niters_vector = ni_minus_gap;
2757       step_vector = build_int_cst (type, vf);
2758     }
2759
2760   if (!is_gimple_val (niters_vector))
2761     {
2762       var = create_tmp_var (type, "bnd");
2763       gimple_seq stmts = NULL;
2764       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2765       gsi_insert_seq_on_edge_immediate (pe, stmts);
2766       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2767          we set range information to make niters analyzer's life easier.
2768          Note the number of latch iteration value can be TYPE_MAX_VALUE so
2769          we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
2770       if (stmts != NULL && log_vf)
2771         {
2772           if (niters_no_overflow)
2773             {
                   /* Range: [1, TYPE_MAX >> log2 (vf)].  */
2774               value_range vr (type,
2775                               wi::one (TYPE_PRECISION (type)),
2776                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2777                                                          TYPE_SIGN (type)),
2778                                           exact_log2 (const_vf),
2779                                           TYPE_SIGN (type)));
2780               set_range_info (niters_vector, vr);
2781             }
2782           /* For VF == 1 the vector IV might also overflow so we cannot
2783              assert a minimum value of 1.  */
2784           else if (const_vf > 1)
2785             {
                   /* Range: [1, ((TYPE_MAX - (vf - 1)) >> log2 (vf)) + 1].  */
2786               value_range vr (type,
2787                               wi::one (TYPE_PRECISION (type)),
2788                               wi::rshift (wi::max_value (TYPE_PRECISION (type),
2789                                                          TYPE_SIGN (type))
2790                                           - (const_vf - 1),
2791                                           exact_log2 (const_vf), TYPE_SIGN (type))
2792                               + 1);
2793               set_range_info (niters_vector, vr);
2794             }
2795         }
2796     }
2797   *niters_vector_ptr = niters_vector;
2798   *step_vector_ptr = step_vector;
2799
2800   return;
2801 }
2802
2803 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2804    loop specified by LOOP_VINFO after vectorization, compute the number
2805    of iterations before vectorization (niters_vector * vf) and store it
2806    to NITERS_VECTOR_MULT_VF_PTR.  The multiplication is done as a left
        shift by log2 (vf), and one full VF is subtracted when
        LOOP_VINFO_EARLY_BREAKS_VECT_PEELED is set.  Any statements needed
        are inserted at the start of the destination block of
        LOOP_VINFO_IV_EXIT (LOOP_VINFO).  */
2807
2808 static void
2809 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2810                                      tree niters_vector,
2811                                      tree *niters_vector_mult_vf_ptr)
2812 {
2813   /* We should be using a step_vector of VF if VF is variable.  */
2814   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2815   tree type = TREE_TYPE (niters_vector);
2816   tree log_vf = build_int_cst (type, exact_log2 (vf));
2817   tree tree_vf = build_int_cst (type, vf);
2818   basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2819
2820   gcc_assert (niters_vector_mult_vf_ptr != NULL);
       /* niters_vector << log2 (vf), i.e. niters_vector * vf.  */
2821   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2822                                             niters_vector, log_vf);
2823
2824   /* If we've peeled a vector iteration then subtract one full vector
2825      iteration.  */
2826   if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2827     niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
2828                                          niters_vector_mult_vf, tree_vf);
2829
       /* Gimplify the result if it did not fold to a gimple value.  */
2830   if (!is_gimple_val (niters_vector_mult_vf))
2831     {
2832       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2833       gimple_seq stmts = NULL;
2834       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2835                                                     &stmts, true, var);
2836       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2837       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2838     }
2839   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2840 }
2841
2842 /* A guard (see slpeel_add_loop_guard) skips from the beginning
2843    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2844    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2845    appear like below:
2846
2847        guard_bb:
2848          if (cond)
2849            goto merge_bb;
2850          else
2851            goto skip_loop;
2852
2853      skip_loop:
2854        header_a:
2855          i_1 = PHI<i_0, i_2>;
2856          ...
2857          i_2 = i_1 + 1;
2858          if (cond_a)
2859            goto latch_a;
2860          else
2861            goto exit_a;
2862        latch_a:
2863          goto header_a;
2864
2865        exit_a:
2866          i_5 = PHI<i_2>;
2867
2868        merge_bb:
2869          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2870
2871      update_loop:
2872        header_b:
2873          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2874          ...
2875          i_4 = i_3 + 1;
2876          if (cond_b)
2877            goto latch_b;
2878          else
2879            goto exit_bb;
2880        latch_b:
2881          goto header_b;
2882
2883        exit_bb:
2884
2885    This function creates PHI nodes at merge_bb and replaces the use of i_5
2886    in the update_loop's PHI node with the result of new PHI result.

        The header phis of SKIP_LOOP and UPDATE_LOOP are paired up by
        position, so both headers are expected to list their phis in the
        same order.  */
2887
2888 static void
2889 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2890                                     class loop *update_loop,
2891                                     edge guard_edge, edge merge_edge)
2892 {
2893   location_t merge_loc, guard_loc;
2894   edge orig_e = loop_preheader_edge (skip_loop);
2895   edge update_e = loop_preheader_edge (update_loop);
2896   gphi_iterator gsi_orig, gsi_update;
2897
       /* Walk both phi lists in lockstep, stopping when either one ends.  */
2898   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2899         gsi_update = gsi_start_phis (update_loop->header));
2900        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2901        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2902     {
2903       gphi *orig_phi = gsi_orig.phi ();
2904       gphi *update_phi = gsi_update.phi ();
2905
2906       /* Generate new phi node at merge bb of the guard.  */
2907       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2908       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2909
2910       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2911          args in NEW_PHI for these edges.  MERGE_EDGE carries the value
              UPDATE_LOOP currently receives on its preheader edge (i_5 above);
              GUARD_EDGE carries the value SKIP_LOOP receives on its preheader
              edge (i_0 above).  */
2912       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2913       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2914       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2915       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2916       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2917       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2918
2919       /* Make UPDATE_PHI use the new phi result on its preheader edge.  */
2920       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2921     }
2922 }
2923
2924 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2925    Return a value that equals:
2926
2927    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2928    - SKIP_VALUE when the main loop is skipped.

        The value is the result of a new phi node created in the destination
        block of LOOP_VINFO->main_loop_edge, which must be set.  The result
        has the type of MAIN_LOOP_VALUE.  */
2929
2930 tree
2931 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2932                            tree skip_value)
2933 {
2934   gcc_assert (loop_vinfo->main_loop_edge);
2935
2936   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2937   basic_block bb = loop_vinfo->main_loop_edge->dest;
2938   gphi *new_phi = create_phi_node (phi_result, bb);
       /* One argument per incoming path: the main-loop edge and the edge
          that skips the main loop.  */
2939   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2940                UNKNOWN_LOCATION);
2941   add_phi_arg (new_phi, skip_value,
2942                loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2943   return phi_result;
2944 }
2945
2946 /* Function vect_do_peeling.
2947
2948    Input:
2949    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2950
2951        preheader:
2952      LOOP:
2953        header_bb:
2954          loop_body
2955          if (exit_loop_cond) goto exit_bb
2956          else                goto header_bb
2957        exit_bb:
2958
2959    - NITERS: The number of iterations of the loop.
2960    - NITERSM1: The number of iterations of the loop's latch.
2961    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2962    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2963                               CHECK_PROFITABILITY is true.
2964    Output:
2965    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2966      iterate after vectorization; see vect_set_loop_condition for details.
2967    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2968      should be set to the number of scalar iterations handled by the
2969      vector loop.  The SSA name is only used on exit from the loop.
2970
2971    This function peels prolog and epilog from the loop, adds guards skipping
2972    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2973    would look like:
2974
2975        guard_bb_1:
2976          if (prefer_scalar_loop) goto merge_bb_1
2977          else                    goto guard_bb_2
2978
2979        guard_bb_2:
2980          if (skip_prolog) goto merge_bb_2
2981          else             goto prolog_preheader
2982
2983        prolog_preheader:
2984      PROLOG:
2985        prolog_header_bb:
2986          prolog_body
2987          if (exit_prolog_cond) goto prolog_exit_bb
2988          else                  goto prolog_header_bb
2989        prolog_exit_bb:
2990
2991        merge_bb_2:
2992
2993        vector_preheader:
2994      VECTOR LOOP:
2995        vector_header_bb:
2996          vector_body
2997          if (exit_vector_cond) goto vector_exit_bb
2998          else                  goto vector_header_bb
2999        vector_exit_bb:
3000
3001        guard_bb_3:
3002          if (skip_epilog) goto merge_bb_3
3003          else             goto epilog_preheader
3004
3005        merge_bb_1:
3006
3007        epilog_preheader:
3008      EPILOG:
3009        epilog_header_bb:
3010          epilog_body
3011          if (exit_epilog_cond) goto merge_bb_3
3012          else                  goto epilog_header_bb
3013
3014        merge_bb_3:
3015
3016    Note this function peels the prolog and epilog, and adds the guards,
3017    only when each of them is necessary.
3018    This function returns the epilogue loop if a decision was made to vectorize
3019    it, otherwise NULL.
3020
3021    The analysis resulting in this epilogue loop's loop_vec_info was performed
3022    in the same vect_analyze_loop call as the main loop's.  At that time
3023    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3024    vectorization factors than the main loop.  This list is stored in the main
3025    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
3026    vectorize the epilogue loop for a lower vectorization factor, the
3027    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
3028    updated and linked to the epilogue loop.  This is later used to vectorize
3029    the epilogue.  The reason the loop_vec_info needs updating is that it was
3030    constructed based on the original main loop, and the epilogue loop is a
3031    copy of this loop, so all links pointing to statements in the original loop
3032    need updating.  Furthermore, these loop_vec_infos share the
3033    data_reference's records, which will also need to be updated.
3034
3035    TODO: Guard for prefer_scalar_loop should be emitted along with
3036    versioning conditions if loop versioning is needed.  */
3037
3038
3039 class loop *
3040 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3041                  tree *niters_vector, tree *step_vector,
3042                  tree *niters_vector_mult_vf_var, int th,
3043                  bool check_profitability, bool niters_no_overflow,
3044                  tree *advance)
3045 {
3046   edge e, guard_e;
3047   tree type = TREE_TYPE (niters), guard_cond;
3048   basic_block guard_bb, guard_to;
3049   profile_probability prob_prolog, prob_vector, prob_epilog;
3050   int estimated_vf;
3051   int prolog_peeling = 0;
3052   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
3053   /* We currently do not support prolog peeling if the target alignment is not
3054      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
3055      target alignment being constant.  */
3056   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3057   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
3058     return NULL;
3059
3060   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3061     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3062
3063   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3064   poly_uint64 bound_epilog = 0;
3065   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3066       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3067     bound_epilog += vf - 1;
3068   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3069     bound_epilog += 1;
3070
3071   /* For early breaks the scalar loop needs to execute at most VF times
3072      to find the element that caused the break.  */
3073   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3074     bound_epilog = vf;
3075
3076   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3077   poly_uint64 bound_scalar = bound_epilog;
3078
3079   if (!prolog_peeling && !epilog_peeling)
3080     return NULL;
3081
3082   /* Before doing any peeling make sure to reset debug binds outside of
3083      the loop referring to defs not in LC SSA.  */
3084   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3085   for (unsigned i = 0; i < loop->num_nodes; ++i)
3086     {
3087       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3088       imm_use_iterator ui;
3089       gimple *use_stmt;
3090       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3091            gsi_next (&gsi))
3092         {
3093           FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3094             if (gimple_debug_bind_p (use_stmt)
3095                 && loop != gimple_bb (use_stmt)->loop_father
3096                 && !flow_loop_nested_p (loop,
3097                                         gimple_bb (use_stmt)->loop_father))
3098               {
3099                 gimple_debug_bind_reset_value (use_stmt);
3100                 update_stmt (use_stmt);
3101               }
3102         }
3103       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3104            gsi_next (&gsi))
3105         {
3106           ssa_op_iter op_iter;
3107           def_operand_p def_p;
3108           FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3109             FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3110               if (gimple_debug_bind_p (use_stmt)
3111                   && loop != gimple_bb (use_stmt)->loop_father
3112                   && !flow_loop_nested_p (loop,
3113                                           gimple_bb (use_stmt)->loop_father))
3114                 {
3115                   gimple_debug_bind_reset_value (use_stmt);
3116                   update_stmt (use_stmt);
3117                 }
3118         }
3119     }
3120
3121   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3122   estimated_vf = vect_vf_for_cost (loop_vinfo);
3123   if (estimated_vf == 2)
3124     estimated_vf = 3;
3125   prob_prolog = prob_epilog = profile_probability::guessed_always ()
3126                         .apply_scale (estimated_vf - 1, estimated_vf);
3127
3128   class loop *prolog, *epilog = NULL;
3129   class loop *first_loop = loop;
3130   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3131
3132   /* SSA form needs to be up-to-date since we are going to manually
3133      update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3134      update SSA state after that, so we have to make sure to not lose any
3135      pending update needs.  */
3136   gcc_assert (!need_ssa_update_p (cfun));
3137
3138   /* If we're vectorizing an epilogue loop, we have ensured that the
3139      virtual operand is in SSA form throughout the vectorized main loop.
3140      Normally it is possible to trace the updated
3141      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3142      back to scalar-stmt vuses, meaning that the effect of the SSA update
3143      remains local to the main loop.  However, there are rare cases in
3144      which the vectorized loop should have vdefs even when the original scalar
3145      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
3146      introduces clobbers of the temporary vector array, which in turn
3147      needs new vdefs.  If the scalar loop doesn't write to memory, these
3148      new vdefs will be the only ones in the vector loop.
3149      We currently defer updating virtual SSA form and creating
3150      a virtual PHI for this case so we do not have to make sure the
3151      newly introduced virtual def is in LCSSA form.  */
3152
3153   if (MAY_HAVE_DEBUG_BIND_STMTS)
3154     {
3155       gcc_assert (!adjust_vec.exists ());
3156       adjust_vec.create (32);
3157     }
3158   initialize_original_copy_tables ();
3159
3160   /* Record the anchor bb at which the guard should be placed if the scalar
3161      loop might be preferred.  */
3162   basic_block anchor = loop_preheader_edge (loop)->src;
3163
3164   /* Generate the number of iterations for the prolog loop.  We do this here
3165      so that we can also get the upper bound on the number of iterations.  */
3166   tree niters_prolog;
3167   int bound_prolog = 0;
3168   if (prolog_peeling)
3169     {
3170       niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3171                                                     &bound_prolog);
3172       /* If alignment peeling is known, we will always execute prolog.  */
3173       if (TREE_CODE (niters_prolog) == INTEGER_CST)
3174         prob_prolog = profile_probability::always ();
3175     }
3176   else
3177     niters_prolog = build_int_cst (type, 0);
3178
3179   loop_vec_info epilogue_vinfo = NULL;
3180   if (vect_epilogues)
3181     {
3182       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
3183       loop_vinfo->epilogue_vinfos.ordered_remove (0);
3184     }
3185
3186   tree niters_vector_mult_vf = NULL_TREE;
3187   /* Saving NITERs before the loop, as this may be changed by prologue.  */
3188   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3189   edge update_e = NULL, skip_e = NULL;
3190   unsigned int lowest_vf = constant_lower_bound (vf);
3191   /* Prolog loop may be skipped.  */
3192   bool skip_prolog = (prolog_peeling != 0);
3193   /* Skip this loop to epilog when there are not enough iterations to enter this
3194      vectorized loop.  If true we should perform runtime checks on the NITERS
3195      to check whether we should skip the current vectorized loop.  If we know
3196      the number of scalar iterations we only add the runtime check when that
3197      number may be smaller than the number of iterations required to enter
3198      this loop; for this we use the upper bounds on the prolog and epilog
3199      peeling.  When we don't know the number of iterations we add the check
3200      unless the loop requires versioning.  NOTE(review): presumably the
3201      versioning condition already asserts that there are enough scalar
3202      iterations to enter the main loop, making the skip unnecessary -- confirm.
3203      When we are versioning then we only add such a skip if we have chosen
3204      to vectorize the epilogue.  */
3205   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3206                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3207                                   bound_prolog + bound_epilog)
3208                       : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3209                          || vect_epilogues));
3210
3211   /* Epilog loop must be executed if the number of iterations for epilog
3212      loop is known at compile time, otherwise we need to add a check at
3213      the end of vector loop and skip to the end of epilog loop.  */
3214   bool skip_epilog = (prolog_peeling < 0
3215                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3216                       || !vf.is_constant ());
3217   /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3218      loop must be executed.  */
3219   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3220       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3221     skip_epilog = false;
3222
3223   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3224   auto_vec<profile_count> original_counts;
3225   basic_block *original_bbs = NULL;  /* Heap array; freed below.  */
3226
3227   if (skip_vector)
3228     {
3229       split_edge (loop_preheader_edge (loop));
3230
3231       if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3232         {
3233           original_bbs = get_loop_body (loop);
3234           for (unsigned int i = 0; i < loop->num_nodes; i++)
3235             original_counts.safe_push(original_bbs[i]->count);
3236         }
3237
3238       /* Due to the order in which we peel prolog and epilog, we first
3239          propagate probability to the whole loop.  The purpose is to
3240          avoid adjusting probabilities of both prolog and vector loops
3241          separately.  Note in this case, the probability of epilog loop
3242          needs to be scaled back later.  */
3243       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3244       if (prob_vector.initialized_p ())
3245         {
3246           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3247           scale_loop_profile (loop, prob_vector, -1);
3248         }
3249     }
3250
3251   if (vect_epilogues)
3252     {
3253       /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3254          use the original scalar loop as remaining epilogue if necessary.  */
3255       LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3256         = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3257       LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3258         = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3259     }
3260
3261   if (prolog_peeling)
3262     {
3263       e = loop_preheader_edge (loop);
3264       edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3265       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e));
3266
3267       /* Peel prolog and put it on preheader edge of loop.  */
3268       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3269       edge prolog_e = NULL;
3270       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3271                                                        scalar_loop, scalar_e,
3272                                                        e, &prolog_e);
3273       gcc_assert (prolog);
3274       prolog->force_vectorize = false;
3275
3276       first_loop = prolog;
3277       reset_original_copy_tables ();
3278
3279       /* Update the number of iterations for prolog loop.  */
3280       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3281       vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3282                                step_prolog, NULL_TREE, false);
3283
3284       /* Skip the prolog loop.  */
3285       if (skip_prolog)
3286         {
3287           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3288                                     niters_prolog, build_int_cst (type, 0));
3289           guard_bb = loop_preheader_edge (prolog)->src;
3290           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3291           guard_to = split_edge (loop_preheader_edge (loop));
3292           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3293                                            guard_to, guard_bb,
3294                                            prob_prolog.invert (),
3295                                            irred_flag);
3296           e = EDGE_PRED (guard_to, 0);  /* Select GUARD_TO's non-guard pred.  */
3297           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3298           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3299
3300           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3301           scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3302         }
3303
3304       /* Update init address of DRs.  */
3305       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3306       /* Update niters for vector loop.  */
3307       LOOP_VINFO_NITERS (loop_vinfo)
3308         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3309       LOOP_VINFO_NITERSM1 (loop_vinfo)
3310         = fold_build2 (MINUS_EXPR, type,
3311                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3312       bool new_var_p = false;
3313       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3314       /* It's guaranteed that vector loop bound before vectorization is at
3315          least VF, so set range information for newly generated var.  */
3316       if (new_var_p)
3317         {
3318           value_range vr (type,
3319                           wi::to_wide (build_int_cst (type, lowest_vf)),
3320                           wi::to_wide (TYPE_MAX_VALUE (type)));
3321           set_range_info (niters, vr);
3322         }
3323
3324       /* Prolog iterates at most bound_prolog times, latch iterates at
3325          most bound_prolog - 1 times.  */
3326       record_niter_bound (prolog, bound_prolog - 1, false, true);
3327       delete_update_ssa ();
3328       adjust_vec_debug_stmts ();
3329       scev_reset ();
3330     }
3331   basic_block bb_before_epilog = NULL;
3332
3333   if (epilog_peeling)
3334     {
3335       e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3336       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3337
3338       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
3339          said epilog then we should use a copy of the main loop as a starting
3340          point.  This loop may have already had some preliminary transformations
3341          to allow for more optimal vectorization, for example if-conversion.
3342          If we are not vectorizing the epilog then we should use the scalar loop
3343          as the transformations mentioned above make less or no sense when not
3344          vectorizing.  */
3345       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3346       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3347       edge epilog_e = vect_epilogues ? e : scalar_e;
3348       edge new_epilog_e = NULL;
3349       auto_vec<basic_block> doms;
3350       epilog
3351         = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3352                                                   &new_epilog_e, true, &doms);
3353
3354       LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3355       gcc_assert (epilog);
3356       gcc_assert (new_epilog_e);
3357       epilog->force_vectorize = false;
3358       bb_before_epilog = loop_preheader_edge (epilog)->src;
3359
3360       /* Scalar version loop may be preferred.  In this case, add guard
3361          and skip to epilog.  Note this only happens when the number of
3362          iterations of loop is unknown at compile time, otherwise this
3363          won't be vectorized.  */
3364       if (skip_vector)
3365         {
3366           /* Additional epilogue iteration is peeled if gap exists.  */
3367           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3368                                                 bound_prolog, bound_epilog,
3369                                                 th, &bound_scalar,
3370                                                 check_profitability);
3371           /* Build guard against NITERSM1 since NITERS may overflow.  */
3372           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3373           guard_bb = anchor;
3374           guard_to = split_edge (loop_preheader_edge (epilog));
3375           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3376                                            guard_to, guard_bb,
3377                                            prob_vector.invert (),
3378                                            irred_flag);
3379           skip_e = guard_e;
3380           e = EDGE_PRED (guard_to, 0);  /* Select GUARD_TO's non-guard pred.  */
3381           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3382           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3383
3384           /* Simply propagate profile info from guard_bb to guard_to which is
3385              a merge point of control flow.  */
3386           profile_count old_count = guard_to->count;
3387           guard_to->count = guard_bb->count;
3388
3389           /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3390           if (vect_epilogues || scalar_loop == NULL)
3391             {
3392               gcc_assert(epilog->num_nodes == loop->num_nodes);
3393               basic_block *bbs = get_loop_body (epilog);
3394               for (unsigned int i = 0; i < epilog->num_nodes; i++)
3395                 {
3396                   gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3397                   bbs[i]->count = original_counts[i];
3398                 }
3399               free (bbs);
3400               free (original_bbs);
3401             }
3402           else if (old_count.nonzero_p ())
3403             scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3404
3405           /* Only need to handle basic block before epilog loop if it's not
3406              the guard_bb, which is the case when skip_vector is true.  */
3407           if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3408             bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3409           bb_before_epilog = loop_preheader_edge (epilog)->src;
3410         }
3411
3412       /* If loop is peeled for non-zero constant times, now niters refers to
3413          orig_niters - prolog_peeling, it won't overflow even if orig_niters
3414          overflows.  */
3415       niters_no_overflow |= (prolog_peeling > 0);
3416       vect_gen_vector_loop_niters (loop_vinfo, niters,
3417                                    niters_vector, step_vector,
3418                                    niters_no_overflow);
3419       if (!integer_onep (*step_vector))
3420         {
3421           /* On exit from the loop we will have an easy way of calculating
3422              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
3423              until then.  */
3424           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3425           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3426           *niters_vector_mult_vf_var = niters_vector_mult_vf;
3427         }
3428       else
3429         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3430                                              &niters_vector_mult_vf);
3431       /* Update IVs of original loop as if they were advanced by
3432          niters_vector_mult_vf steps.  */
3433       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3434       update_e = skip_vector ? e : loop_preheader_edge (epilog);
3435       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3436         update_e = single_succ_edge (LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
3437
3438       /* If we have a peeled vector iteration, all exits are the same,
3439          and so the main exit needs to be treated the same as the alternative
3440          exits in that we leave their updates to vectorizable_live_operations.
3441          */
3442       if (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3443         vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3444                                           update_e);
3445
3446       /* If we have a peeled vector iteration we will never skip the epilog loop
3447          and we can simplify the cfg a lot by not doing the edge split.  */
3448       if (skip_epilog || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3449         {
3450           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3451                                     niters, niters_vector_mult_vf);
3452
3453           guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3454           edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3455           guard_to = epilog_e->dest;
3456           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3457                                            skip_vector ? anchor : guard_bb,
3458                                            prob_epilog.invert (),
3459                                            irred_flag);
3460           doms.safe_push (guard_to);
3461           if (vect_epilogues)
3462             epilogue_vinfo->skip_this_loop_edge = guard_e;
3463           edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3464           gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3465           for (gphi_iterator gsi = gsi_start_phis (guard_to);
3466                !gsi_end_p (gsi); gsi_next (&gsi))
3467             {
3468               /* We are expecting all of the PHIs we have on epilog_e
3469                  to be also on the main loop exit.  But sometimes
3470                  a stray virtual definition can appear at epilog_e
3471                  which we can then take as the same on all exits,
3472                  we've removed the LC SSA PHI on the main exit before
3473                  so we wouldn't need to create a loop PHI for it.  */
3474               if (virtual_operand_p (gimple_phi_result (*gsi))
3475                   && (gsi_end_p (gsi2)
3476                       || !virtual_operand_p (gimple_phi_result (*gsi2))))
3477                 add_phi_arg (*gsi,
3478                              gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3479                              guard_e, UNKNOWN_LOCATION);
3480               else
3481                 {
3482                   add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3483                                UNKNOWN_LOCATION);
3484                   gsi_next (&gsi2);
3485                 }
3486             }
3487
3488           /* Only need to handle basic block before epilog loop if it's not
3489              the guard_bb, which is the case when skip_vector is true.  */
3490           if (guard_bb != bb_before_epilog)
3491             {
3492               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3493
3494               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3495             }
3496           scale_loop_profile (epilog, prob_epilog, -1);
3497         }
3498
3499       /* Recalculate the dominators after adding the guard edge.  */
3500       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3501         iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3502
3503       /* When we do not have a loop-around edge to the epilog we know
3504          the vector loop covered at least VF scalar iterations unless
3505          we have early breaks.
3506          Update any known upper bound with this knowledge.  */
3507       if (! skip_vector
3508           && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3509         {
3510           if (epilog->any_upper_bound)
3511             epilog->nb_iterations_upper_bound -= lowest_vf;
3512           if (epilog->any_likely_upper_bound)
3513             epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3514           if (epilog->any_estimate)
3515             epilog->nb_iterations_estimate -= lowest_vf;
3516         }
3517
3518       unsigned HOST_WIDE_INT bound;
3519       if (bound_scalar.is_constant (&bound))
3520         {
3521           gcc_assert (bound != 0);
3522           /* Adjust the upper bound by the extra peeled vector iteration if we
3523              are an epilogue of a peeled vect loop and not VLA.  For VLA the
3524              loop bounds are unknown.  */
3525           if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3526               && vf.is_constant ())
3527             bound += vf.to_constant ();
3528           /* -1 to convert loop iterations to latch iterations.  */
3529           record_niter_bound (epilog, bound - 1, false, true);
3530           scale_loop_profile (epilog, profile_probability::always (),
3531                               bound - 1);
3532         }
3533
3534       delete_update_ssa ();
3535       adjust_vec_debug_stmts ();
3536       scev_reset ();
3537     }
3538
3539   if (vect_epilogues)
3540     {
3541       epilog->aux = epilogue_vinfo;
3542       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3543       LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3544         = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3545
3546       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3547
3548       /* We now must calculate the number of NITERS performed by the previous
3549          loop and EPILOGUE_NITERS to be performed by the epilogue.  */
3550       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3551                                  niters_prolog, niters_vector_mult_vf);
3552
3553       /* If skip_vector we may skip the previous loop, we insert a phi-node to
3554          determine whether we are coming from the previous vectorized loop
3555          using the update_e edge or the skip_vector basic block using the
3556          skip_e edge.  */
3557       if (skip_vector)
3558         {
3559           gcc_assert (update_e != NULL && skip_e != NULL);
3560           gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3561                                            update_e->dest);
3562           tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3563           gimple *stmt = gimple_build_assign (new_ssa, niters);
3564           gimple_stmt_iterator gsi;
3565           if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3566               && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3567             {
3568               gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3569               gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3570             }
3571           else
3572             {
3573               gsi = gsi_last_bb (update_e->src);
3574               gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3575             }
3576
3577           niters = new_ssa;
3578           add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3579           add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3580                        UNKNOWN_LOCATION);
3581           niters = PHI_RESULT (new_phi);
3582           epilogue_vinfo->main_loop_edge = update_e;
3583           epilogue_vinfo->skip_main_loop_edge = skip_e;
3584         }
3585
3586       /* Set ADVANCE to the number of iterations performed by the previous
3587          loop and its prologue.  */
3588       *advance = niters;
3589
3590       /* Subtract the number of iterations performed by the vectorized loop
3591          from the number of total iterations.  */
3592       tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3593                                           before_loop_niters,
3594                                           niters);
3595
3596       LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3597       LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3598         = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3599                        epilogue_niters,
3600                        build_one_cst (TREE_TYPE (epilogue_niters)));
3601
3602       /* Decide what to do if the number of epilogue iterations is not
3603          a multiple of the epilogue loop's vectorization factor.
3604          We should have rejected the loop during the analysis phase
3605          if this fails.  */
3606       bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3607       gcc_assert (res);
3608     }
3609
3610   adjust_vec.release ();
3611   free_original_copy_tables ();
3612
3613   return vect_epilogues ? epilog : NULL;
3614 }
3615
3616 /* Function vect_create_cond_for_niters_checks.
3617
3618    Create a conditional expression that represents the run-time checks for
3619    loop's niter.  The loop is guaranteed to terminate if the run-time
3620    checks hold.
3621
3622    Input:
3623    COND_EXPR  - input conditional expression.  New conditions will be chained
3624                 with logical AND operation.  If it is NULL, then the function
3625                 is used to return the number of alias checks.
3626    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3627                 to be checked.
3628
3629    Output:
3630    COND_EXPR - conditional expression.
3631
3632    The returned COND_EXPR is the conditional expression to be used in the
3633    if statement that controls which version of the loop gets executed at
3634    runtime.  */
3635
3636 static void
3637 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3638 {
3639   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3640
3641   if (*cond_expr)
3642     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3643                               *cond_expr, part_cond_expr);
3644   else
3645     *cond_expr = part_cond_expr;
3646 }
3647
3648 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3649    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3650
3651 static void
3652 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3653 {
3654   if (*cond_expr)
3655     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3656                               *cond_expr, part_cond_expr);
3657   else
3658     *cond_expr = part_cond_expr;
3659 }
3660
3661 /* Function vect_create_cond_for_align_checks.
3662
3663    Create a conditional expression that represents the alignment checks for
3664    all of data references (array element references) whose alignment must be
3665    checked at runtime.
3666
3667    Input:
3668    COND_EXPR  - input conditional expression.  New conditions will be chained
3669                 with logical AND operation.
3670    LOOP_VINFO - two fields of the loop information are used.
3671                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3672                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3673
3674    Output:
3675    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3676                          expression.
3677    The returned value is the conditional expression to be used in the if
3678    statement that controls which version of the loop gets executed at runtime.
3679
3680    The algorithm makes two assumptions:
3681      1) The number of bytes "n" in a vector is a power of 2.
3682      2) An address "a" is aligned if a%n is zero and that this
3683         test can be done as a&(n-1) == 0.  For example, for 16
3684         byte vectors the test is a&0xf == 0.  */
3685
3686 static void
3687 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3688                                    tree *cond_expr,
3689                                    gimple_seq *cond_expr_stmt_list)
3690 {
3691   const vec<stmt_vec_info> &may_misalign_stmts
3692     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3693   stmt_vec_info stmt_info;
3694   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3695   tree mask_cst;
3696   unsigned int i;
3697   tree int_ptrsize_type;
3698   char tmp_name[20];
3699   tree or_tmp_name = NULL_TREE;
3700   tree and_tmp_name;
3701   gimple *and_stmt;
3702   tree ptrsize_zero;
3703   tree part_cond_expr;
3704
3705   /* Check that mask is one less than a power of 2, i.e., mask is
3706      all zeros followed by all ones.  */
3707   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3708
3709   int_ptrsize_type = signed_type_for (ptr_type_node);
3710
3711   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3712      of the first vector of the i'th data reference. */
3713
3714   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3715     {
3716       gimple_seq new_stmt_list = NULL;
3717       tree addr_base;
3718       tree addr_tmp_name;
3719       tree new_or_tmp_name;
3720       gimple *addr_stmt, *or_stmt;
3721       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3722       bool negative = tree_int_cst_compare
3723         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3724       tree offset = negative
3725         ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3726                     * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3727         : size_zero_node;
3728
3729       /* create: addr_tmp = (int)(address_of_first_vector) */
3730       addr_base =
3731         vect_create_addr_base_for_vector_ref (loop_vinfo,
3732                                               stmt_info, &new_stmt_list,
3733                                               offset);
3734       if (new_stmt_list != NULL)
3735         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3736
3737       sprintf (tmp_name, "addr2int%d", i);
3738       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3739       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3740       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3741
3742       /* The addresses are OR together.  */
3743
3744       if (or_tmp_name != NULL_TREE)
3745         {
3746           /* create: or_tmp = or_tmp | addr_tmp */
3747           sprintf (tmp_name, "orptrs%d", i);
3748           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3749           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3750                                          or_tmp_name, addr_tmp_name);
3751           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3752           or_tmp_name = new_or_tmp_name;
3753         }
3754       else
3755         or_tmp_name = addr_tmp_name;
3756
3757     } /* end for i */
3758
3759   mask_cst = build_int_cst (int_ptrsize_type, mask);
3760
3761   /* create: and_tmp = or_tmp & mask  */
3762   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3763
3764   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3765                                   or_tmp_name, mask_cst);
3766   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3767
3768   /* Make and_tmp the left operand of the conditional test against zero.
3769      if and_tmp has a nonzero bit then some address is unaligned.  */
3770   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3771   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3772                                 and_tmp_name, ptrsize_zero);
3773   chain_cond_expr (cond_expr, part_cond_expr);
3774 }
3775
3776 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3777    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3778    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3779    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3780
3781 static void
3782 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3783 {
3784   const vec<vec_object_pair> &pairs
3785     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3786   unsigned int i;
3787   vec_object_pair *pair;
3788   FOR_EACH_VEC_ELT (pairs, i, pair)
3789     {
3790       tree addr1 = build_fold_addr_expr (pair->first);
3791       tree addr2 = build_fold_addr_expr (pair->second);
3792       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3793                                          addr1, addr2);
3794       chain_cond_expr (cond_expr, part_cond_expr);
3795     }
3796 }
3797
3798 /* Create an expression that is true when all lower-bound conditions for
3799    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3800
3801 static void
3802 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3803 {
3804   const vec<vec_lower_bound> &lower_bounds
3805     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3806   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3807     {
3808       tree expr = lower_bounds[i].expr;
3809       tree type = unsigned_type_for (TREE_TYPE (expr));
3810       expr = fold_convert (type, expr);
3811       poly_uint64 bound = lower_bounds[i].min_value;
3812       if (!lower_bounds[i].unsigned_p)
3813         {
3814           expr = fold_build2 (PLUS_EXPR, type, expr,
3815                               build_int_cstu (type, bound - 1));
3816           bound += bound - 1;
3817         }
3818       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3819                                          build_int_cstu (type, bound));
3820       chain_cond_expr (cond_expr, part_cond_expr);
3821     }
3822 }
3823
3824 /* Function vect_create_cond_for_alias_checks.
3825
3826    Create a conditional expression that represents the run-time checks for
3827    overlapping of address ranges represented by a list of data references
3828    relations passed as input.
3829
3830    Input:
3831    COND_EXPR  - input conditional expression.  New conditions will be chained
3832                 with logical AND operation.  If it is NULL, then the function
3833                 is used to return the number of alias checks.
3834    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3835                 to be checked.
3836
3837    Output:
3838    COND_EXPR - conditional expression.
3839
3840    The returned COND_EXPR is the conditional expression to be used in the if
3841    statement that controls which version of the loop gets executed at runtime.
3842 */
3843
3844 void
3845 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3846 {
3847   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3848     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3849
3850   if (comp_alias_ddrs.is_empty ())
3851     return;
3852
3853   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3854                                &comp_alias_ddrs, cond_expr);
3855   if (dump_enabled_p ())
3856     dump_printf_loc (MSG_NOTE, vect_location,
3857                      "created %u versioning for alias checks.\n",
3858                      comp_alias_ddrs.length ());
3859 }
3860
3861
3862 /* Function vect_loop_versioning.
3863
3864    If the loop has data references that may or may not be aligned or/and
3865    has data reference relations whose independence was not proven then
3866    two versions of the loop need to be generated, one which is vectorized
3867    and one which isn't.  A test is then generated to control which of the
3868    loops is executed.  The test checks for the alignment of all of the
3869    data references that may or may not be aligned.  An additional
3870    sequence of runtime tests is generated for each pairs of DDRs whose
3871    independence was not proven.  The vectorized version of loop is
3872    executed only if both alias and alignment tests are passed.
3873
3874    The test generated to check which version of loop is executed
3875    is modified to also check for profitability as indicated by the
3876    cost model threshold TH.
3877
3878    The versioning precondition(s) are placed in *COND_EXPR and
3879    *COND_EXPR_STMT_LIST.  */
3880
3881 class loop *
3882 vect_loop_versioning (loop_vec_info loop_vinfo,
3883                       gimple *loop_vectorized_call)
3884 {
3885   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3886   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3887   basic_block condition_bb;
3888   gphi_iterator gsi;
3889   gimple_stmt_iterator cond_exp_gsi;
3890   basic_block merge_bb;
3891   basic_block new_exit_bb;
3892   edge new_exit_e, e;
3893   gphi *orig_phi, *new_phi;
3894   tree cond_expr = NULL_TREE;
3895   gimple_seq cond_expr_stmt_list = NULL;
3896   tree arg;
3897   profile_probability prob = profile_probability::likely ();
3898   gimple_seq gimplify_stmt_list = NULL;
3899   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3900   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3901   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3902   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3903   poly_uint64 versioning_threshold
3904     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3905   tree version_simd_if_cond
3906     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3907   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3908
3909   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3910       && !ordered_p (th, versioning_threshold))
3911     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3912                              build_int_cst (TREE_TYPE (scalar_loop_iters),
3913                                             th - 1));
3914   if (maybe_ne (versioning_threshold, 0U))
3915     {
3916       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3917                                build_int_cst (TREE_TYPE (scalar_loop_iters),
3918                                               versioning_threshold - 1));
3919       if (cond_expr)
3920         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3921                                  expr, cond_expr);
3922       else
3923         cond_expr = expr;
3924     }
3925
3926   tree cost_name = NULL_TREE;
3927   profile_probability prob2 = profile_probability::always ();
3928   if (cond_expr
3929       && EXPR_P (cond_expr)
3930       && (version_niter
3931           || version_align
3932           || version_alias
3933           || version_simd_if_cond))
3934     {
3935       cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3936                                                       &cond_expr_stmt_list,
3937                                                       is_gimple_val, NULL_TREE);
3938       /* Split prob () into two so that the overall probability of passing
3939          both the cost-model and versioning checks is the orig prob.  */
3940       prob2 = prob = prob.sqrt ();
3941     }
3942
3943   if (version_niter)
3944     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3945
3946   if (cond_expr)
3947     {
3948       gimple_seq tem = NULL;
3949       cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3950                                           &tem, is_gimple_condexpr_for_cond,
3951                                           NULL_TREE);
3952       gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3953     }
3954
3955   if (version_align)
3956     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3957                                        &cond_expr_stmt_list);
3958
3959   if (version_alias)
3960     {
3961       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3962       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3963       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3964     }
3965
3966   if (version_simd_if_cond)
3967     {
3968       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3969       if (flag_checking)
3970         if (basic_block bb
3971             = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3972           gcc_assert (bb != loop->header
3973                       && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3974                       && (scalar_loop == NULL
3975                           || (bb != scalar_loop->header
3976                               && dominated_by_p (CDI_DOMINATORS,
3977                                                  scalar_loop->header, bb))));
3978       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3979       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3980                             version_simd_if_cond, zero);
3981       if (cond_expr)
3982         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3983                                  c, cond_expr);
3984       else
3985         cond_expr = c;
3986       if (dump_enabled_p ())
3987         dump_printf_loc (MSG_NOTE, vect_location,
3988                          "created versioning for simd if condition check.\n");
3989     }
3990
3991   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3992                                       &gimplify_stmt_list,
3993                                       is_gimple_condexpr_for_cond, NULL_TREE);
3994   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3995
3996   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3997      invariant in.  */
3998   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3999   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4000        !gsi_end_p (gsi); gsi_next (&gsi))
4001     {
4002       gimple *stmt = gsi_stmt (gsi);
4003       update_stmt (stmt);
4004       ssa_op_iter iter;
4005       use_operand_p use_p;
4006       basic_block def_bb;
4007       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4008         if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4009             && flow_bb_inside_loop_p (outermost, def_bb))
4010           outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4011     }
4012
4013   /* Search for the outermost loop we can version.  Avoid versioning of
4014      non-perfect nests but allow if-conversion versioned loops inside.  */
4015   class loop *loop_to_version = loop;
4016   if (flow_loop_nested_p (outermost, loop))
4017     {
4018       if (dump_enabled_p ())
4019         dump_printf_loc (MSG_NOTE, vect_location,
4020                          "trying to apply versioning to outer loop %d\n",
4021                          outermost->num);
4022       if (outermost->num == 0)
4023         outermost = superloop_at_depth (loop, 1);
4024       /* And avoid applying versioning on non-perfect nests.  */
4025       while (loop_to_version != outermost
4026              && (e = single_exit (loop_outer (loop_to_version)))
4027              && !(e->flags & EDGE_COMPLEX)
4028              && (!loop_outer (loop_to_version)->inner->next
4029                  || vect_loop_vectorized_call (loop_to_version))
4030              && (!loop_outer (loop_to_version)->inner->next
4031                  || !loop_outer (loop_to_version)->inner->next->next))
4032         loop_to_version = loop_outer (loop_to_version);
4033     }
4034
4035   /* Apply versioning.  If there is already a scalar version created by
4036      if-conversion re-use that.  Note we cannot re-use the copy of
4037      an if-converted outer-loop when vectorizing the inner loop only.  */
4038   gcond *cond;
4039   if ((!loop_to_version->inner || loop == loop_to_version)
4040       && loop_vectorized_call)
4041     {
4042       gcc_assert (scalar_loop);
4043       condition_bb = gimple_bb (loop_vectorized_call);
4044       cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4045       gimple_cond_set_condition_from_tree (cond, cond_expr);
4046       update_stmt (cond);
4047
4048       if (cond_expr_stmt_list)
4049         {
4050           cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4051           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4052                                  GSI_SAME_STMT);
4053         }
4054
4055       /* if-conversion uses profile_probability::always () for both paths,
4056          reset the paths probabilities appropriately.  */
4057       edge te, fe;
4058       extract_true_false_edges_from_block (condition_bb, &te, &fe);
4059       te->probability = prob;
4060       fe->probability = prob.invert ();
4061       /* We can scale loops counts immediately but have to postpone
4062          scaling the scalar loop because we re-use it during peeling.
4063
4064          Ifcvt duplicates loop preheader, loop body and produces an basic
4065          block after loop exit.  We need to scale all that.  */
4066       basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4067       preheader->count = preheader->count.apply_probability (prob * prob2);
4068       scale_loop_frequencies (loop_to_version, prob * prob2);
4069       /* When the loop has multiple exits then we can only version itself.
4070         This is denoted by loop_to_version == loop.  In this case we can
4071         do the versioning by selecting the exit edge the vectorizer is
4072         currently using.  */
4073       edge exit_edge;
4074       if (loop_to_version == loop)
4075        exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4076       else
4077        exit_edge = single_exit (loop_to_version);
4078       exit_edge->dest->count = preheader->count;
4079       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4080
4081       nloop = scalar_loop;
4082       if (dump_enabled_p ())
4083         dump_printf_loc (MSG_NOTE, vect_location,
4084                          "reusing %sloop version created by if conversion\n",
4085                          loop_to_version != loop ? "outer " : "");
4086     }
4087   else
4088     {
4089       if (loop_to_version != loop
4090           && dump_enabled_p ())
4091         dump_printf_loc (MSG_NOTE, vect_location,
4092                          "applying loop versioning to outer loop %d\n",
4093                          loop_to_version->num);
4094
4095       unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4096
4097       initialize_original_copy_tables ();
4098       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4099                             prob * prob2, (prob * prob2).invert (),
4100                             prob * prob2, (prob * prob2).invert (),
4101                             true);
4102       /* We will later insert second conditional so overall outcome of
4103          both is prob * prob2.  */
4104       edge true_e, false_e;
4105       extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4106       true_e->probability = prob;
4107       false_e->probability = prob.invert ();
4108       gcc_assert (nloop);
4109       nloop = get_loop_copy (loop);
4110
4111       /* For cycle vectorization with SLP we rely on the PHI arguments
4112          appearing in the same order as the SLP node operands which for the
4113          loop PHI nodes means the preheader edge dest index needs to remain
4114          the same for the analyzed loop which also becomes the vectorized one.
4115          Make it so in case the state after versioning differs by redirecting
4116          the first edge into the header to the same destination which moves
4117          it last.  */
4118       if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4119         {
4120           edge e = EDGE_PRED (loop->header, 0);
4121           ssa_redirect_edge (e, e->dest);
4122           flush_pending_stmts (e);
4123         }
4124       gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4125
4126       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4127          reap those otherwise;  they also refer to the original
4128          loops.  */
4129       class loop *l = loop;
4130       while (gimple *call = vect_loop_vectorized_call (l))
4131         {
4132           call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4133           fold_loop_internal_call (call, boolean_false_node);
4134           l = loop_outer (l);
4135         }
4136       free_original_copy_tables ();
4137
4138       if (cond_expr_stmt_list)
4139         {
4140           cond_exp_gsi = gsi_last_bb (condition_bb);
4141           gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4142                                  GSI_SAME_STMT);
4143         }
4144
4145       /* Loop versioning violates an assumption we try to maintain during
4146          vectorization - that the loop exit block has a single predecessor.
4147          After versioning, the exit block of both loop versions is the same
4148          basic block (i.e. it has two predecessors). Just in order to simplify
4149          following transformations in the vectorizer, we fix this situation
4150          here by adding a new (empty) block on the exit-edge of the loop,
4151          with the proper loop-exit phis to maintain loop-closed-form.
4152          If loop versioning wasn't done from loop, but scalar_loop instead,
4153          merge_bb will have already just a single successor.  */
4154
4155       /* When the loop has multiple exits then we can only version itself.
4156          This is denoted by loop_to_version == loop.  In this case we can
4157          do the versioning by selecting the exit edge the vectorizer is
4158          currently using.  */
4159       edge exit_edge;
4160       if (loop_to_version == loop)
4161         exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4162       else
4163         exit_edge = single_exit (loop_to_version);
4164
4165       gcc_assert (exit_edge);
4166       merge_bb = exit_edge->dest;
4167       if (EDGE_COUNT (merge_bb->preds) >= 2)
4168         {
4169           gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4170           new_exit_bb = split_edge (exit_edge);
4171           new_exit_e = exit_edge;
4172           e = EDGE_SUCC (new_exit_bb, 0);
4173
4174           for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4175                gsi_next (&gsi))
4176             {
4177               tree new_res;
4178               orig_phi = gsi.phi ();
4179               new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4180               new_phi = create_phi_node (new_res, new_exit_bb);
4181               arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4182               add_phi_arg (new_phi, arg, new_exit_e,
4183                            gimple_phi_arg_location_from_edge (orig_phi, e));
4184               adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4185             }
4186         }
4187
4188       update_ssa (TODO_update_ssa_no_phi);
4189     }
4190
4191   /* Split the cost model check off to a separate BB.  Costing assumes
4192      this is the only thing we perform when we enter the scalar loop
4193      from a failed cost decision.  */
4194   if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4195     {
4196       gimple *def = SSA_NAME_DEF_STMT (cost_name);
4197       gcc_assert (gimple_bb (def) == condition_bb);
4198       /* All uses of the cost check are 'true' after the check we
4199          are going to insert.  */
4200       replace_uses_by (cost_name, boolean_true_node);
4201       /* And we're going to build the new single use of it.  */
4202       gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4203                                        NULL_TREE, NULL_TREE);
4204       edge e = split_block (gimple_bb (def), def);
4205       gimple_stmt_iterator gsi = gsi_for_stmt (def);
4206       gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4207       edge true_e, false_e;
4208       extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4209       e->flags &= ~EDGE_FALLTHRU;
4210       e->flags |= EDGE_TRUE_VALUE;
4211       edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4212       e->probability = prob2;
4213       e2->probability = prob2.invert ();
4214       e->dest->count = e->count ();
4215       set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4216       auto_vec<basic_block, 3> adj;
4217       for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4218            son;
4219            son = next_dom_son (CDI_DOMINATORS, son))
4220         if (EDGE_COUNT (son->preds) > 1)
4221           adj.safe_push (son);
4222       for (auto son : adj)
4223         set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4224       //debug_bb (condition_bb);
4225       //debug_bb (e->src);
4226     }
4227
4228   if (version_niter)
4229     {
4230       /* The versioned loop could be infinite, we need to clear existing
4231          niter information which is copied from the original loop.  */
4232       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4233       vect_free_loop_info_assumptions (nloop);
4234     }
4235
4236   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4237       && dump_enabled_p ())
4238     {
4239       if (version_alias)
4240         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4241                          vect_location,
4242                          "loop versioned for vectorization because of "
4243                          "possible aliasing\n");
4244       if (version_align)
4245         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4246                          vect_location,
4247                          "loop versioned for vectorization to enhance "
4248                          "alignment\n");
4249
4250     }
4251
4252   return nloop;
4253 }