1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
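/* A minimal usage sketch (illustrative only; the vector type and lane
   count are assumptions):

     tree vectype = ...;    e.g. a V4SI vector type
     if (vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab, vectype, 3))
       ...                  the target can load three interleaved vectors
                            of VECTYPE with one load-lanes instruction

   This is roughly how the load-/store-lanes support checks elsewhere in
   the vectorizer use this helper.  */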
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
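/* For example (illustrative sketch), given

     #pragma omp declare simd
     int foo (int);
     ...
     y = foo (x);                     before if-conversion
     y = .MASK_CALL (foo, x, mask);   after if-conversion

   both call forms are expected to resolve to foo's cgraph node here,
   provided simd clones were created for foo.  */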
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
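/* For instance (illustrative sketch), in

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] + 1;

   the smallest scalar type (short) appears only on the rhs of the
   promotion, so looking at lhs types alone would miss it; returning the
   rhs type here is what lets the VF account for the short elements.  */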
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* If there is a loop invariant read involved we might vectorize it in
286 the prologue, breaking scalar order with respect to the in-loop store. */
287 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
288 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
289 return false;
291 /* STMT_A and STMT_B belong to overlapping groups. All loads are
292 emitted at the position of the first scalar load.
293 Stores in a group are emitted at the position of the last scalar store.
294 Compute that position and check whether the resulting order matches
295 the current one. */
296 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
297 if (il_a)
299 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
300 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
301 s = DR_GROUP_NEXT_ELEMENT (s))
302 il_a = get_later_stmt (il_a, s);
303 else /* DR_IS_READ */
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 if (get_later_stmt (il_a, s) == il_a)
307 il_a = s;
309 else
310 il_a = stmtinfo_a;
311 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
312 if (il_b)
314 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
315 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
316 s = DR_GROUP_NEXT_ELEMENT (s))
317 il_b = get_later_stmt (il_b, s);
318 else /* DR_IS_READ */
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 if (get_later_stmt (il_b, s) == il_b)
322 il_b = s;
324 else
325 il_b = stmtinfo_b;
326 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
327 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
330 /* A subroutine of vect_analyze_data_ref_dependence. Handle
331 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
332 distances. These distances are conservatively correct but they don't
333 reflect a guaranteed dependence.
335 Return true if this function does all the work necessary to avoid
336 an alias or false if the caller should use the dependence distances
337 to limit the vectorization factor in the usual way. LOOP_DEPTH is
338 the depth of the loop described by LOOP_VINFO and the other arguments
339 are as for vect_analyze_data_ref_dependence. */
341 static bool
342 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
343 loop_vec_info loop_vinfo,
344 int loop_depth, unsigned int *max_vf)
346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
347 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
349 int dist = dist_v[loop_depth];
350 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
352 /* If the user asserted safelen >= DIST consecutive iterations
353 can be executed concurrently, assume independence.
355 ??? An alternative would be to add the alias check even
356 in this case, and vectorize the fallback loop with the
357 maximum VF set to safelen. However, if the user has
358 explicitly given a length, it's less likely that that
359 would be a win. */
360 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
362 if ((unsigned int) loop->safelen < *max_vf)
363 *max_vf = loop->safelen;
364 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
365 continue;
368 /* For dependence distances of 2 or more, we have the option
369 of limiting VF or checking for an alias at runtime.
370 Prefer to check at runtime if we can, to avoid limiting
371 the VF unnecessarily when the bases are in fact independent.
373 Note that the alias checks will be removed if the VF ends up
374 being small enough. */
375 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
376 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
377 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
378 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
379 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
382 return true;
386 /* Function vect_analyze_data_ref_dependence.
388 FIXME: I needed to change the sense of the returned flag.
390 Return FALSE if there (might) exist a dependence between a memory-reference
391 DRA and a memory-reference DRB. When versioning for alias may check a
392 dependence at run-time, return TRUE. Adjust *MAX_VF according to
393 the data dependence. */
395 static opt_result
396 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
397 loop_vec_info loop_vinfo,
398 unsigned int *max_vf)
400 unsigned int i;
401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
402 struct data_reference *dra = DDR_A (ddr);
403 struct data_reference *drb = DDR_B (ddr);
404 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
405 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
406 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
407 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
408 lambda_vector dist_v;
409 unsigned int loop_depth;
411 /* If user asserted safelen consecutive iterations can be
412 executed concurrently, assume independence. */
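/* E.g. (illustrative sketch) for

     void f (int *p, int *q, int n)
     {
     #pragma omp simd safelen(8)
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   the dependence between p[] and q[] is unknown, but the safelen(8)
   guarantee lets us assume independence and merely clamp *max_vf to 8
   instead of versioning the loop for alias.  */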
413 auto apply_safelen = [&]()
415 if (loop->safelen >= 2)
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return true;
422 return false;
425 /* In loop analysis all data references should be vectorizable. */
426 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
427 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
428 gcc_unreachable ();
430 /* Independent data accesses. */
431 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
432 return opt_result::success ();
434 if (dra == drb
435 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
436 return opt_result::success ();
438 /* We do not have to consider dependences between accesses that belong
439 to the same group, unless the stride could be smaller than the
440 group size. */
441 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
442 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
443 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
444 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
445 return opt_result::success ();
447 /* Even if we have an anti-dependence then, as the vectorized loop covers at
448 least two scalar iterations, there is always also a true dependence.
449 As the vectorizer does not re-order loads and stores we can ignore
450 the anti-dependence if TBAA can disambiguate both DRs similar to the
451 case with known negative distance anti-dependences (positive
452 distance anti-dependences would violate TBAA constraints). */
453 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
454 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
455 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
456 get_alias_set (DR_REF (drb))))
457 return opt_result::success ();
459 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
460 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
462 if (apply_safelen ())
463 return opt_result::success ();
465 return opt_result::failure_at
466 (stmtinfo_a->stmt,
467 "possible alias involving gather/scatter between %T and %T\n",
468 DR_REF (dra), DR_REF (drb));
471 /* Unknown data dependence. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
474 if (apply_safelen ())
475 return opt_result::success ();
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
479 "versioning for alias required: "
480 "can't determine dependence between %T and %T\n",
481 DR_REF (dra), DR_REF (drb));
483 /* Add to list of ddrs that need to be tested at run-time. */
484 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
487 /* Known data dependence. */
488 if (DDR_NUM_DIST_VECTS (ddr) == 0)
490 if (apply_safelen ())
491 return opt_result::success ();
493 if (dump_enabled_p ())
494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 "versioning for alias required: "
496 "bad dist vector for %T and %T\n",
497 DR_REF (dra), DR_REF (drb));
498 /* Add to list of ddrs that need to be tested at run-time. */
499 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
504 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
505 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
506 loop_depth, max_vf))
507 return opt_result::success ();
509 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
511 int dist = dist_v[loop_depth];
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance = %d.\n", dist);
517 if (dist == 0)
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance == 0 between %T and %T\n",
522 DR_REF (dra), DR_REF (drb));
524 /* When we perform grouped accesses and perform implicit CSE
525 by detecting equal accesses and doing disambiguation with
526 runtime alias tests like for
527 .. = a[i];
528 .. = a[i+1];
529 a[i] = ..;
530 a[i+1] = ..;
531 *p = ..;
532 .. = a[i];
533 .. = a[i+1];
534 where we will end up loading { a[i], a[i+1] } once, make
535 sure that inserting group loads before the first load and
536 stores after the last store will do the right thing.
537 Similar for groups like
538 a[i] = ...;
539 ... = a[i];
540 a[i+1] = ...;
541 where loads from the group interleave with the store. */
542 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
543 return opt_result::failure_at (stmtinfo_a->stmt,
544 "READ_WRITE dependence"
545 " in interleaving.\n");
547 if (loop->safelen < 2)
549 tree indicator = dr_zero_step_indicator (dra);
550 if (!indicator || integer_zerop (indicator))
551 return opt_result::failure_at (stmtinfo_a->stmt,
552 "access also has a zero step\n");
553 else if (TREE_CODE (indicator) != INTEGER_CST)
554 vect_check_nonzero_value (loop_vinfo, indicator);
556 continue;
559 if (dist > 0 && DDR_REVERSED_P (ddr))
561 /* If DDR_REVERSED_P the order of the data-refs in DDR was
562 reversed (to make distance vector positive), and the actual
563 distance is negative. */
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "dependence distance negative.\n");
567 /* When doing outer loop vectorization, we need to check if there is
568 a backward dependence at the inner loop level if the dependence
569 at the outer loop is reversed. See PR81740. */
570 if (nested_in_vect_loop_p (loop, stmtinfo_a)
571 || nested_in_vect_loop_p (loop, stmtinfo_b))
573 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
574 DDR_LOOP_NEST (ddr));
575 if (dist_v[inner_depth] < 0)
576 return opt_result::failure_at (stmtinfo_a->stmt,
577 "not vectorized, dependence "
578 "between data-refs %T and %T\n",
579 DR_REF (dra), DR_REF (drb));
581 /* Record a negative dependence distance to later limit the
582 amount of stmt copying / unrolling we can perform.
583 Only need to handle read-after-write dependence. */
584 if (DR_IS_READ (drb)
585 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
586 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
587 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
588 continue;
591 unsigned int abs_dist = abs (dist);
592 if (abs_dist >= 2 && abs_dist < *max_vf)
594 /* The dependence distance requires reduction of the maximal
595 vectorization factor. */
596 *max_vf = abs_dist;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "adjusting maximal vectorization factor to %i\n",
600 *max_vf);
603 if (abs_dist >= *max_vf)
605 /* Dependence distance does not create dependence, as far as
606 vectorization is concerned, in this case. */
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location,
609 "dependence distance >= VF.\n");
610 continue;
613 return opt_result::failure_at (stmtinfo_a->stmt,
614 "not vectorized, possible dependence "
615 "between data-refs %T and %T\n",
616 DR_REF (dra), DR_REF (drb));
619 return opt_result::success ();
622 /* Function vect_analyze_early_break_dependences.
624 Examine all the data references in the loop and make sure that, if we have
625 multiple exits, we are able to safely move stores such that they become
626 safe for vectorization. The function also calculates the place to move
627 the instructions to and computes what the new vUSE chain should be.
629 This works in tandem with the CFG that will be produced by
630 slpeel_tree_duplicate_loop_to_edge_cfg later on.
632 This function tries to validate whether an early break vectorization
633 is possible for the current instruction sequence. Returns True if
634 possible, otherwise False.
636 Requirements:
637 - Any memory access must be to a fixed size buffer.
638 - There must not be any loads and stores to the same object.
639 - Multiple loads are allowed as long as they don't alias.
641 NOTE:
642 This implementation is very conservative. Any overlapping loads/stores
643 that take place before the early break statement get rejected, aside from
644 WAR dependencies.
646 i.e.:
648 a[i] = 8
649 c = a[i]
650 if (b[i])
653 is not allowed, but
655 c = a[i]
656 a[i] = 8
657 if (b[i])
660 is, which is the common case. */
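/* As an illustrative sketch, a loop of roughly the form

     for (i = 0; i < 1024; i++)
       {
         if (key == a[i])       early exit, loads only
           break;
         out[i] = a[i] * 2;     store that will be sunk past the break
       }

   with fixed-size a[] and out[] that do not overlap is expected to
   satisfy the requirements above.  */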
662 static opt_result
663 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
665 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
667 /* List of all load data references found during traversal. */
668 auto_vec<data_reference *> bases;
669 basic_block dest_bb = NULL;
671 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
672 class loop *loop_nest = loop_outer (loop);
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_NOTE, vect_location,
676 "loop contains multiple exits, analyzing"
677 " statement dependencies.\n");
679 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "alternate exit has been chosen as main exit.\n");
684 /* Since we don't support general control flow, the location we'll move the
685 side-effects to is always the latch connected exit. When we support
686 general control flow we can do better but for now this is fine. Move
687 side-effects to the in-loop destination of the last early exit. For the
688 PEELED case we move the side-effects to the latch block as this is
689 guaranteed to be the last block to be executed when a vector iteration
690 has finished. */
691 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
692 dest_bb = loop->latch;
693 else
694 dest_bb = single_pred (loop->latch);
696 /* We start looking from dest_bb, for the non-PEELED case we don't want to
697 move any stores already present, but we do want to read and validate the
698 loads. */
699 basic_block bb = dest_bb;
701 /* We move stores across all loads to the beginning of dest_bb, so
702 the first block processed below doesn't need dependence checking. */
703 bool check_deps = false;
707 gimple_stmt_iterator gsi = gsi_last_bb (bb);
709 /* Now analyze all the remaining statements and try to determine which
710 instructions are allowed/needed to be moved. */
711 while (!gsi_end_p (gsi))
713 gimple *stmt = gsi_stmt (gsi);
714 gsi_prev (&gsi);
715 if (is_gimple_debug (stmt))
716 continue;
718 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
719 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
720 if (!dr_ref)
721 continue;
723 /* We know everything below dest_bb is safe since we know we
724 had a full vector iteration when reaching it. Either by
725 the loop entry / IV exit test being last or because this
726 is the loop latch itself. */
727 if (!check_deps)
728 continue;
730 /* Check if vector accesses to the object will be within bounds: the
731 object size must be a constant, or we assume the loop will be versioned
732 or niters bounded by VF so accesses are within range. We only need to check
733 the reads since writes are moved to a safe place where if we get
734 there we know they are safe to perform. */
735 if (DR_IS_READ (dr_ref)
736 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
738 if (dump_enabled_p ())
739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
740 "early breaks not supported: vectorization "
741 "would %s beyond size of obj.\n",
742 DR_IS_READ (dr_ref) ? "read" : "write");
743 return opt_result::failure_at (stmt,
744 "can't safely apply code motion to "
745 "dependencies of %G to vectorize "
746 "the early exit.\n", stmt);
749 if (DR_IS_READ (dr_ref))
750 bases.safe_push (dr_ref);
751 else if (DR_IS_WRITE (dr_ref))
753 /* We are moving writes down in the CFG. To be sure that this
754 is valid after vectorization we have to check all the loads
755 we are sinking the stores past to see if any of them may
756 alias or are the same object.
758 Same objects will not be an issue because unless the store
759 is marked volatile the value can be forwarded. If the
760 store is marked volatile we don't vectorize the loop
761 anyway.
763 That leaves the check for aliasing. We don't really need
764 to care about the stores aliasing with each other since the
765 stores are moved in order so the effects are still observed
766 correctly. This leaves the check for WAR dependencies
767 which we would be introducing here if the DR can alias.
768 The check is quadratic in loads/stores but I have not found
769 a better API to do this. I believe all loads and stores
770 must be checked. We also must check them when we
771 encounter the store, since we don't care about loads past
772 the store. */
774 for (auto dr_read : bases)
775 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
779 vect_location,
780 "early breaks not supported: "
781 "overlapping loads and stores "
782 "found before the break "
783 "statement.\n");
785 return opt_result::failure_at (stmt,
786 "can't safely apply code motion to dependencies"
787 " to vectorize the early exit. %G may alias with"
788 " %G\n", stmt, dr_read->stmt);
792 if (gimple_vdef (stmt))
794 if (dump_enabled_p ())
795 dump_printf_loc (MSG_NOTE, vect_location,
796 "==> recording stmt %G", stmt);
798 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
800 else if (gimple_vuse (stmt))
802 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_NOTE, vect_location,
805 "marked statement for vUSE update: %G", stmt);
809 if (!single_pred_p (bb))
811 gcc_assert (bb == loop->header);
812 break;
815 /* If we possibly sink through a virtual PHI make sure to elide that. */
816 if (gphi *vphi = get_virtual_phi (bb))
817 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
819 /* All earlier blocks need dependence checking. */
820 check_deps = true;
821 bb = single_pred (bb);
823 while (1);
825 /* We don't allow outer -> inner loop transitions which should have been
826 trapped already during loop form analysis. */
827 gcc_assert (dest_bb->loop_father == loop);
829 /* Check that the destination block we picked has only one pred. To relax this we
830 have to take special care when moving the statements. We don't currently support
831 such control flow; however, this check is there to simplify how we handle
832 labels that may be present anywhere in the IL, and to ensure that the
833 labels aren't significant for the CFG. */
834 if (!single_pred (dest_bb))
835 return opt_result::failure_at (vect_location,
836 "chosen loop exit block (BB %d) does not have a "
837 "single predecessor which is currently not "
838 "supported for early break vectorization.\n",
839 dest_bb->index);
841 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
843 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
845 /* All uses shall be updated to that of the first load. Entries are
846 stored in reverse order. */
847 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
848 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
850 if (dump_enabled_p ())
851 dump_printf_loc (MSG_NOTE, vect_location,
852 "will update use: %T, mem_ref: %G", vuse, g);
856 if (dump_enabled_p ())
857 dump_printf_loc (MSG_NOTE, vect_location,
858 "recorded statements to be moved to BB %d\n",
859 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
861 return opt_result::success ();
864 /* Function vect_analyze_data_ref_dependences.
866 Examine all the data references in the loop, and make sure there do not
867 exist any data dependences between them. Set *MAX_VF according to
868 the maximum vectorization factor the data dependences allow. */
870 opt_result
871 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
872 unsigned int *max_vf)
874 unsigned int i;
875 struct data_dependence_relation *ddr;
877 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
879 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
881 LOOP_VINFO_DDRS (loop_vinfo)
882 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
883 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
884 /* We do not need read-read dependences. */
885 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
886 &LOOP_VINFO_DDRS (loop_vinfo),
887 LOOP_VINFO_LOOP_NEST (loop_vinfo),
888 false);
889 gcc_assert (res);
892 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
894 /* For epilogues we either have no aliases or alias versioning
895 was applied to the original loop. Therefore we may just get max_vf
896 using the VF of the original loop. */
897 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
898 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
899 else
900 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
902 opt_result res
903 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
904 if (!res)
905 return res;
908 /* If we have early break statements in the loop, check to see if they
909 are of a form we can vectorize. */
910 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
911 return vect_analyze_early_break_dependences (loop_vinfo);
913 return opt_result::success ();
917 /* Function vect_slp_analyze_data_ref_dependence.
919 Return TRUE if there (might) exist a dependence between a memory-reference
920 DRA and a memory-reference DRB for VINFO. When versioning for alias
921 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
922 according to the data dependence. */
924 static bool
925 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
926 struct data_dependence_relation *ddr)
928 struct data_reference *dra = DDR_A (ddr);
929 struct data_reference *drb = DDR_B (ddr);
930 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
931 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
933 /* We need to check dependences of statements marked as unvectorizable
934 as well; they can still prohibit vectorization. */
936 /* Independent data accesses. */
937 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
938 return false;
940 if (dra == drb)
941 return false;
943 /* Read-read is OK. */
944 if (DR_IS_READ (dra) && DR_IS_READ (drb))
945 return false;
947 /* If dra and drb are part of the same interleaving chain consider
948 them independent. */
949 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
950 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
951 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
952 return false;
954 /* Unknown data dependence. */
955 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
957 if (dump_enabled_p ())
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "can't determine dependence between %T and %T\n",
960 DR_REF (dra), DR_REF (drb));
962 else if (dump_enabled_p ())
963 dump_printf_loc (MSG_NOTE, vect_location,
964 "determined dependence between %T and %T\n",
965 DR_REF (dra), DR_REF (drb));
967 return true;
971 /* Analyze dependences involved in the transform of a store SLP NODE. */
973 static bool
974 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
976 /* This walks over all stmts involved in the SLP store done
977 in NODE verifying we can sink them up to the last stmt in the
978 group. */
979 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
980 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
982 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
984 stmt_vec_info access_info
985 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
986 if (access_info == last_access_info)
987 continue;
988 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
989 ao_ref ref;
990 bool ref_initialized_p = false;
991 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
992 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
994 gimple *stmt = gsi_stmt (gsi);
995 if (! gimple_vuse (stmt))
996 continue;
998 /* If we couldn't record a (single) data reference for this
999 stmt we have to resort to the alias oracle. */
1000 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1001 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1002 if (!dr_b)
1004 /* We are moving a store - this means
1005 we cannot use TBAA for disambiguation. */
1006 if (!ref_initialized_p)
1007 ao_ref_init (&ref, DR_REF (dr_a));
1008 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1009 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1010 return false;
1011 continue;
1014 gcc_assert (!gimple_visited_p (stmt));
1016 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1017 dr_b, vNULL);
1018 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1019 free_dependence_relation (ddr);
1020 if (dependent)
1021 return false;
1024 return true;
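/* For example (sketch), for an SLP store group

     a[0] = x;
     t    = b[i];     intervening load, checked via the alias oracle
     a[1] = y;        last scalar store; the vector store is emitted here

   we must be able to sink a[0] = x past t = b[i], which is only safe if
   b[i] cannot alias a[0].  */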
1027 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1028 contain the vector of scalar stores of this instance if we are
1029 disambiguating the loads. */
1031 static bool
1032 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1033 vec<stmt_vec_info> stores,
1034 stmt_vec_info last_store_info)
1036 /* This walks over all stmts involved in the SLP load done
1037 in NODE verifying we can hoist them up to the first stmt in the
1038 group. */
1039 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1040 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1042 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1044 stmt_vec_info access_info
1045 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1046 if (access_info == first_access_info)
1047 continue;
1048 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1049 ao_ref ref;
1050 bool ref_initialized_p = false;
1051 hash_set<stmt_vec_info> grp_visited;
1052 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1053 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1055 gimple *stmt = gsi_stmt (gsi);
1056 if (! gimple_vdef (stmt))
1057 continue;
1059 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1061 /* If we run into a store of this same instance (we've just
1062 marked those) then delay dependence checking until we run
1063 into the last store because this is where it will have
1064 been sunk to (and we verified that we can do that already). */
1065 if (gimple_visited_p (stmt))
1067 if (stmt_info != last_store_info)
1068 continue;
1070 for (stmt_vec_info &store_info : stores)
1072 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1073 ddr_p ddr = initialize_data_dependence_relation
1074 (dr_a, store_dr, vNULL);
1075 bool dependent
1076 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1077 free_dependence_relation (ddr);
1078 if (dependent)
1079 return false;
1081 continue;
1084 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1086 /* We are hoisting a load - this means we can use TBAA for
1087 disambiguation. */
1088 if (!ref_initialized_p)
1089 ao_ref_init (&ref, DR_REF (dr_a));
1090 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1092 /* If we couldn't record a (single) data reference for this
1093 stmt we have to give up now. */
1094 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1095 if (!dr_b)
1096 return false;
1097 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1098 dr_b, vNULL);
1099 bool dependent
1100 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1101 free_dependence_relation (ddr);
1102 if (dependent)
1103 return false;
1105 /* No dependence. */
1106 return true;
1108 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1110 /* When we run into a store group we have to honor
1111 that earlier stores might be moved here. We don't
1112 know exactly which and where to since we lack a
1113 back-mapping from DR to SLP node, so assume all
1114 earlier stores are sunk here. It's enough to
1115 consider the last stmt of a group for this.
1116 ??? Both this and the fact that we disregard that
1117 the conflicting instance might be removed later
1118 is overly conservative. */
1119 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1120 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1121 store_info != NULL;
1122 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1123 if ((store_info == stmt_info
1124 || get_later_stmt (store_info, stmt_info) == stmt_info)
1125 && !check_hoist (store_info))
1126 return false;
1128 else
1130 if (!check_hoist (stmt_info))
1131 return false;
1135 return true;
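/* Conversely (sketch), for an SLP load group

     x = a[0];        first scalar load; the vector load is emitted here
     c[i] = z;        intervening store
     y = a[1];        effectively hoisted up to the first load

   we must verify that c[i] = z cannot clobber a[1] before hoisting the
   load of a[1] above it.  */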
1139 /* Function vect_slp_analyze_instance_dependence.
1141 Examine all the data references in the SLP instance, and make sure there
1142 do not exist any data dependences between them that would prevent the
1143 vectorized statements from being emitted at their insert locations. */
1145 bool
1146 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1148 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1150 /* The stores of this instance are at the root of the SLP tree. */
1151 slp_tree store = NULL;
1152 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1153 store = SLP_INSTANCE_TREE (instance);
1155 /* Verify we can sink stores to the vectorized stmt insert location. */
1156 stmt_vec_info last_store_info = NULL;
1157 if (store)
1159 if (! vect_slp_analyze_store_dependences (vinfo, store))
1160 return false;
1162 /* Mark stores in this instance and remember the last one. */
1163 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1164 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1165 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1168 bool res = true;
1170 /* Verify we can sink loads to the vectorized stmt insert location,
1171 special-casing stores of this instance. */
1172 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1173 if (! vect_slp_analyze_load_dependences (vinfo, load,
1174 store
1175 ? SLP_TREE_SCALAR_STMTS (store)
1176 : vNULL, last_store_info))
1178 res = false;
1179 break;
1182 /* Unset the visited flag. */
1183 if (store)
1184 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1185 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1187 return res;
1190 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1191 applied. */
1194 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1196 HOST_WIDE_INT diff = 0;
1197 /* Alignment is only analyzed for the first element of a DR group;
1198 use that, but adjust the misalignment by the offset of the access. */
1199 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1201 dr_vec_info *first_dr
1202 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1203 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1204 INTEGER_CSTs and the first element in the group has the lowest
1205 address. */
1206 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1207 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1208 gcc_assert (diff >= 0);
1209 dr_info = first_dr;
1212 int misalign = dr_info->misalignment;
1213 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1214 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1215 return misalign;
1217 /* If the access is only aligned for a vector type with smaller alignment
1218 requirement the access has unknown misalignment. */
1219 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1220 targetm.vectorize.preferred_vector_alignment (vectype)))
1221 return DR_MISALIGNMENT_UNKNOWN;
1223 /* Apply the offset from the DR group start and the externally supplied
1224 offset which can for example result from a negative stride access. */
1225 poly_int64 misalignment = misalign + diff + offset;
1227 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1228 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1229 unsigned HOST_WIDE_INT target_alignment_c
1230 = dr_info->target_alignment.to_constant ();
1231 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1232 return DR_MISALIGNMENT_UNKNOWN;
1233 return misalign;
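/* Worked example with assumed numbers: with a target alignment of 32 bytes
   and a recorded misalignment of 8, an OFFSET of -24 (e.g. resulting from a
   negative-step access) yields (8 - 24) mod 32 = 16, which is what this
   function returns; if the combination is not a compile-time constant the
   function falls back to DR_MISALIGNMENT_UNKNOWN.  */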
1236 /* Record the base alignment guarantee given by DRB, which occurs
1237 in STMT_INFO. */
1239 static void
1240 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1241 innermost_loop_behavior *drb)
1243 bool existed;
1244 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1245 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1246 if (!existed || entry.second->base_alignment < drb->base_alignment)
1248 entry = std::make_pair (stmt_info, drb);
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "recording new base alignment for %T\n"
1252 " alignment: %d\n"
1253 " misalignment: %d\n"
1254 " based on: %G",
1255 drb->base_address,
1256 drb->base_alignment,
1257 drb->base_misalignment,
1258 stmt_info->stmt);
1262 /* If the region we're going to vectorize is reached, all unconditional
1263 data references occur at least once. We can therefore pool the base
1264 alignment guarantees from each unconditional reference. Do this by
1265 going through all the data references in VINFO and checking whether
1266 the containing statement makes the reference unconditionally. If so,
1267 record the alignment of the base address in VINFO so that it can be
1268 used for all other references with the same base. */
1270 void
1271 vect_record_base_alignments (vec_info *vinfo)
1273 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1274 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1275 for (data_reference *dr : vinfo->shared->datarefs)
1277 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1278 stmt_vec_info stmt_info = dr_info->stmt;
1279 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1280 && STMT_VINFO_VECTORIZABLE (stmt_info)
1281 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1283 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1285 /* If DR is nested in the loop that is being vectorized, we can also
1286 record the alignment of the base wrt the outer loop. */
1287 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1288 vect_record_base_alignment
1289 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1294 /* Function vect_compute_data_ref_alignment
1296 Compute the misalignment of the data reference DR_INFO when vectorizing
1297 with VECTYPE.
1299 Output:
1300 1. initialized misalignment info for DR_INFO
1302 FOR NOW: No analysis is actually performed. Misalignment is calculated
1303 only for trivial cases. TODO. */
1305 static void
1306 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1307 tree vectype)
1309 stmt_vec_info stmt_info = dr_info->stmt;
1310 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1311 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1312 class loop *loop = NULL;
1313 tree ref = DR_REF (dr_info->dr);
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_NOTE, vect_location,
1317 "vect_compute_data_ref_alignment:\n");
1319 if (loop_vinfo)
1320 loop = LOOP_VINFO_LOOP (loop_vinfo);
1322 /* Initialize misalignment to unknown. */
1323 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1325 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1326 return;
1328 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1329 bool step_preserves_misalignment_p;
1331 poly_uint64 vector_alignment
1332 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1333 BITS_PER_UNIT);
1334 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1336 /* If the main loop has peeled for alignment we have no way of knowing
1337 whether the data accesses in the epilogues are aligned. We can't at
1338 compile time answer the question whether we have entered the main loop or
1339 not. Fixes PR 92351. */
1340 if (loop_vinfo)
1342 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1343 if (orig_loop_vinfo
1344 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1345 return;
1348 unsigned HOST_WIDE_INT vect_align_c;
1349 if (!vector_alignment.is_constant (&vect_align_c))
1350 return;
1352 /* No step for BB vectorization. */
1353 if (!loop)
1355 gcc_assert (integer_zerop (drb->step));
1356 step_preserves_misalignment_p = true;
1359 /* In case the dataref is in an inner-loop of the loop that is being
1360 vectorized (LOOP), we use the base and misalignment information
1361 relative to the outer-loop (LOOP). This is ok only if the misalignment
1362 stays the same throughout the execution of the inner-loop, which is why
1363 we have to check that the stride of the dataref in the inner-loop evenly
1364 divides by the vector alignment. */
1365 else if (nested_in_vect_loop_p (loop, stmt_info))
1367 step_preserves_misalignment_p
1368 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1370 if (dump_enabled_p ())
1372 if (step_preserves_misalignment_p)
1373 dump_printf_loc (MSG_NOTE, vect_location,
1374 "inner step divides the vector alignment.\n");
1375 else
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "inner step doesn't divide the vector"
1378 " alignment.\n");
1382 /* Similarly we can only use base and misalignment information relative to
1383 an innermost loop if the misalignment stays the same throughout the
1384 execution of the loop. As above, this is the case if the stride of
1385 the dataref evenly divides by the alignment. */
1386 else
1388 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1389 step_preserves_misalignment_p
1390 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1392 if (!step_preserves_misalignment_p && dump_enabled_p ())
1393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1394 "step doesn't divide the vector alignment.\n");
1397 unsigned int base_alignment = drb->base_alignment;
1398 unsigned int base_misalignment = drb->base_misalignment;
1400 /* Calculate the maximum of the pooled base address alignment and the
1401 alignment that we can compute for DR itself. */
1402 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1403 = base_alignments->get (drb->base_address);
1404 if (entry
1405 && base_alignment < (*entry).second->base_alignment
1406 && (loop_vinfo
1407 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1408 gimple_bb (entry->first->stmt))
1409 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1410 || (entry->first->dr_aux.group <= dr_info->group)))))
1412 base_alignment = entry->second->base_alignment;
1413 base_misalignment = entry->second->base_misalignment;
1416 if (drb->offset_alignment < vect_align_c
1417 || !step_preserves_misalignment_p
1418 /* We need to know whether the step wrt the vectorized loop is
1419 negative when computing the starting misalignment below. */
1420 || TREE_CODE (drb->step) != INTEGER_CST)
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1424 "Unknown alignment for access: %T\n", ref);
1425 return;
1428 if (base_alignment < vect_align_c)
1430 unsigned int max_alignment;
1431 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1432 if (max_alignment < vect_align_c
1433 || !vect_can_force_dr_alignment_p (base,
1434 vect_align_c * BITS_PER_UNIT))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "can't force alignment of ref: %T\n", ref);
1439 return;
1442 /* Force the alignment of the decl.
1443 NOTE: This is the only change to the code we make during
1444 the analysis phase, before deciding to vectorize the loop. */
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "force alignment of %T\n", ref);
1449 dr_info->base_decl = base;
1450 dr_info->base_misaligned = true;
1451 base_misalignment = 0;
1453 poly_int64 misalignment
1454 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1456 unsigned int const_misalignment;
1457 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 "Non-constant misalignment for access: %T\n", ref);
1462 return;
1465 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469 "misalign = %d bytes of ref %T\n",
1470 const_misalignment, ref);
1472 return;
1475 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1476 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1477 is made aligned via peeling. */
1479 static bool
1480 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1481 dr_vec_info *dr_peel_info)
1483 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1484 DR_TARGET_ALIGNMENT (dr_info)))
1486 poly_offset_int diff
1487 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1488 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1489 if (known_eq (diff, 0)
1490 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1491 return true;
1493 return false;
1496 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1497 aligned via peeling. */
1499 static bool
1500 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1501 dr_vec_info *dr_peel_info)
1503 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1504 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1505 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1506 DR_OFFSET (dr_peel_info->dr), 0)
1507 || !operand_equal_p (DR_STEP (dr_info->dr),
1508 DR_STEP (dr_peel_info->dr), 0))
1509 return false;
1511 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1514 /* Compute the value for dr_info->misalign so that the access appears
1515 aligned. This is used by peeling to compensate for dr_misalignment
1516 applying the offset for negative step. */
1519 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1521 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1522 return 0;
1524 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1525 poly_int64 misalignment
1526 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1527 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1529 unsigned HOST_WIDE_INT target_alignment_c;
1530 int misalign;
1531 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1532 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1533 return DR_MISALIGNMENT_UNKNOWN;
1534 return misalign;
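/* E.g. (assumed types and numbers) for a negative-step access with a V4SI
   vector type and a target alignment of 16 bytes this returns
   (4 - 1) * 4 = 12: a DR whose misalignment is recorded as 12 then appears
   aligned once dr_misalignment applies the corresponding negative-step
   offset of -12.  */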
1537 /* Function vect_update_misalignment_for_peel.
1538 Sets DR_INFO's misalignment
1539 - to 0 if it has the same alignment as DR_PEEL_INFO,
1540 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1541 - to -1 (unknown) otherwise.
1543 DR_INFO - the data reference whose misalignment is to be adjusted.
1544 DR_PEEL_INFO - the data reference whose misalignment is being made
1545 zero in the vector loop by the peel.
1546 NPEEL - the number of iterations in the peel loop if the misalignment
1547 of DR_PEEL_INFO is known at compile time. */
1549 static void
1550 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1551 dr_vec_info *dr_peel_info, int npeel)
1553 /* If dr_info is aligned when dr_peel_info is, then mark it so. */
1554 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1556 SET_DR_MISALIGNMENT (dr_info,
1557 vect_dr_misalign_for_aligned_access (dr_peel_info));
1558 return;
1561 unsigned HOST_WIDE_INT alignment;
1562 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1563 && known_alignment_for_access_p (dr_info,
1564 STMT_VINFO_VECTYPE (dr_info->stmt))
1565 && known_alignment_for_access_p (dr_peel_info,
1566 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1568 int misal = dr_info->misalignment;
1569 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1570 misal &= alignment - 1;
1571 set_dr_misalignment (dr_info, misal);
1572 return;
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1577 "to unknown (-1).\n");
1578 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
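/* Worked example for the compile-time branch above, with assumed numbers:
   a DR with misalignment 8, step 4 and target alignment 16 that is peeled
   by NPEEL = 2 iterations ends up with misalignment (8 + 2 * 4) & 15 = 0,
   i.e. the peel happens to make it aligned as well.  */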
1581 /* Return true if alignment is relevant for DR_INFO. */
1583 static bool
1584 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1586 stmt_vec_info stmt_info = dr_info->stmt;
1588 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1589 return false;
1591 /* For interleaving, only the alignment of the first access matters. */
1592 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1593 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1594 return false;
1596 /* Scatter-gather and invariant accesses continue to address individual
1597 scalars, so vector-level alignment is irrelevant. */
1598 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1599 || integer_zerop (DR_STEP (dr_info->dr)))
1600 return false;
1602 /* Strided accesses perform only component accesses, alignment is
1603 irrelevant for them. */
1604 if (STMT_VINFO_STRIDED_P (stmt_info)
1605 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1606 return false;
1608 return true;
1611 /* Given a memory reference EXP, return whether its alignment is less
1612 than its size. */
1614 static bool
1615 not_size_aligned (tree exp)
1617 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1618 return true;
1620 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1621 > get_object_alignment (exp));
1624 /* Function vector_alignment_reachable_p
1626 Return true if vector alignment for DR_INFO is reachable by peeling
1627 a few loop iterations. Return false otherwise. */
1629 static bool
1630 vector_alignment_reachable_p (dr_vec_info *dr_info)
1632 stmt_vec_info stmt_info = dr_info->stmt;
1633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1635 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1637 /* For interleaved accesses we peel only if the number of iterations in
1638 the prolog loop (VF - misalignment) is a multiple of the
1639 number of interleaved accesses. */
1640 int elem_size, mis_in_elements;
1642 /* FORNOW: handle only known alignment. */
1643 if (!known_alignment_for_access_p (dr_info, vectype))
1644 return false;
1646 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1648 elem_size = vector_element_size (vector_size, nelements);
1649 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1651 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1652 return false;
1655 /* If the misalignment is known at compile time then allow peeling
1656 only if natural alignment is reachable through peeling. */
1657 if (known_alignment_for_access_p (dr_info, vectype)
1658 && !aligned_access_p (dr_info, vectype))
1660 HOST_WIDE_INT elmsize =
1661 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1662 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE, vect_location,
1665 "data size = %wd. misalignment = %d.\n", elmsize,
1666 dr_misalignment (dr_info, vectype));
1668 if (dr_misalignment (dr_info, vectype) % elmsize)
1670 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1672 "data size does not divide the misalignment.\n");
1673 return false;
1677 if (!known_alignment_for_access_p (dr_info, vectype))
1679 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1680 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "Unknown misalignment, %snaturally aligned\n",
1684 is_packed ? "not " : "");
1685 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1688 return true;
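/* E.g. (assumed numbers) for a two-element interleaved group accessed with
   a V4SI vector type and a known misalignment of one element, the prolog
   loop would need 4 - 1 = 3 peeled iterations, which is not a multiple of
   the group size 2, so alignment is not considered reachable by peeling.  */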
1692 /* Calculate the cost of the memory access represented by DR_INFO. */
1694 static void
1695 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1696 dr_alignment_support alignment_support_scheme,
1697 int misalignment,
1698 unsigned int *inside_cost,
1699 unsigned int *outside_cost,
1700 stmt_vector_for_cost *body_cost_vec,
1701 stmt_vector_for_cost *prologue_cost_vec)
1703 stmt_vec_info stmt_info = dr_info->stmt;
1704 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1705 int ncopies;
1707 if (PURE_SLP_STMT (stmt_info))
1708 ncopies = 1;
1709 else
1710 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1712 if (DR_IS_READ (dr_info->dr))
1713 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1714 misalignment, true, inside_cost,
1715 outside_cost, prologue_cost_vec, body_cost_vec, false);
1716 else
1717 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1718 misalignment, inside_cost, body_cost_vec);
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "vect_get_data_access_cost: inside_cost = %d, "
1723 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1727 typedef struct _vect_peel_info
1729 dr_vec_info *dr_info;
1730 int npeel;
1731 unsigned int count;
1732 } *vect_peel_info;
1734 typedef struct _vect_peel_extended_info
1736 vec_info *vinfo;
1737 struct _vect_peel_info peel_info;
1738 unsigned int inside_cost;
1739 unsigned int outside_cost;
1740 } *vect_peel_extended_info;
1743 /* Peeling hashtable helpers. */
1745 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1747 static inline hashval_t hash (const _vect_peel_info *);
1748 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1751 inline hashval_t
1752 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1754 return (hashval_t) peel_info->npeel;
1757 inline bool
1758 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1760 return (a->npeel == b->npeel);
1764 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1766 static void
1767 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1768 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1769 int npeel, bool supportable_if_not_aligned)
1771 struct _vect_peel_info elem, *slot;
1772 _vect_peel_info **new_slot;
1774 elem.npeel = npeel;
1775 slot = peeling_htab->find (&elem);
1776 if (slot)
1777 slot->count++;
1778 else
1780 slot = XNEW (struct _vect_peel_info);
1781 slot->npeel = npeel;
1782 slot->dr_info = dr_info;
1783 slot->count = 1;
1784 new_slot = peeling_htab->find_slot (slot, INSERT);
1785 *new_slot = slot;
1788 /* If this DR is not supported with unknown misalignment then bias
1789 this slot when the cost model is disabled. */
1790 if (!supportable_if_not_aligned
1791 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1792 slot->count += VECT_MAX_COST;
1796 /* Traverse peeling hash table to find the peeling option that aligns the
1797 maximum number of data accesses. */
1800 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1801 _vect_peel_extended_info *max)
1803 vect_peel_info elem = *slot;
1805 if (elem->count > max->peel_info.count
1806 || (elem->count == max->peel_info.count
1807 && max->peel_info.npeel > elem->npeel))
1809 max->peel_info.npeel = elem->npeel;
1810 max->peel_info.count = elem->count;
1811 max->peel_info.dr_info = elem->dr_info;
1814 return 1;
1817 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1818 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1819 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1820 after peeling. */
1822 static void
1823 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1824 dr_vec_info *dr0_info,
1825 unsigned int *inside_cost,
1826 unsigned int *outside_cost,
1827 stmt_vector_for_cost *body_cost_vec,
1828 stmt_vector_for_cost *prologue_cost_vec,
1829 unsigned int npeel)
1831 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1833 bool dr0_alignment_known_p
1834 = (dr0_info
1835 && known_alignment_for_access_p (dr0_info,
1836 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1838 for (data_reference *dr : datarefs)
1840 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1841 if (!vect_relevant_for_alignment_p (dr_info))
1842 continue;
1844 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1845 dr_alignment_support alignment_support_scheme;
1846 int misalignment;
1847 unsigned HOST_WIDE_INT alignment;
1849 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1850 size_zero_node) < 0;
1851 poly_int64 off = 0;
1852 if (negative)
1853 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1854 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
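/* For instance (illustrative numbers, not tied to a target): a reversed
   V4SI access with 4-byte elements gives OFF = (4 - 1) * -4 = -12,
   i.e. the vector access starts 12 bytes below the scalar reference
   and the misalignment is taken at that adjusted address.  */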
1856 if (npeel == 0)
1857 misalignment = dr_misalignment (dr_info, vectype, off);
1858 else if (dr_info == dr0_info
1859 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1860 misalignment = 0;
1861 else if (!dr0_alignment_known_p
1862 || !known_alignment_for_access_p (dr_info, vectype)
1863 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1864 misalignment = DR_MISALIGNMENT_UNKNOWN;
1865 else
1867 misalignment = dr_misalignment (dr_info, vectype, off);
1868 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1869 misalignment &= alignment - 1;
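/* For example (illustrative numbers only): a byte misalignment of 4,
   NPEEL = 3 and a 4-byte step move the access by 3 * 4 = 12 bytes, so
   with a 16-byte target alignment the new misalignment is
   (4 + 12) & (16 - 1) = 0.  */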
1871 alignment_support_scheme
1872 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1873 misalignment);
1875 vect_get_data_access_cost (loop_vinfo, dr_info,
1876 alignment_support_scheme, misalignment,
1877 inside_cost, outside_cost,
1878 body_cost_vec, prologue_cost_vec);
1882 /* Traverse peeling hash table and calculate cost for each peeling option.
1883 Find the one with the lowest cost. */
1886 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1887 _vect_peel_extended_info *min)
1889 vect_peel_info elem = *slot;
1890 int dummy;
1891 unsigned int inside_cost = 0, outside_cost = 0;
1892 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1893 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1894 epilogue_cost_vec;
1896 prologue_cost_vec.create (2);
1897 body_cost_vec.create (2);
1898 epilogue_cost_vec.create (2);
1900 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1901 &outside_cost, &body_cost_vec,
1902 &prologue_cost_vec, elem->npeel);
1904 body_cost_vec.release ();
1906 outside_cost += vect_get_known_peeling_cost
1907 (loop_vinfo, elem->npeel, &dummy,
1908 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1909 &prologue_cost_vec, &epilogue_cost_vec);
1911 /* Prologue and epilogue costs are added to the target model later.
1912 These costs depend only on the scalar iteration cost, the
1913 number of peeling iterations finally chosen, and the number of
1914 misaligned statements. So discard the information found here. */
1915 prologue_cost_vec.release ();
1916 epilogue_cost_vec.release ();
1918 if (inside_cost < min->inside_cost
1919 || (inside_cost == min->inside_cost
1920 && outside_cost < min->outside_cost))
1922 min->inside_cost = inside_cost;
1923 min->outside_cost = outside_cost;
1924 min->peel_info.dr_info = elem->dr_info;
1925 min->peel_info.npeel = elem->npeel;
1926 min->peel_info.count = elem->count;
1929 return 1;
1933 /* Choose best peeling option by traversing peeling hash table and either
1934 choosing an option with the lowest cost (if cost model is enabled) or the
1935 option that aligns as many accesses as possible. */
1937 static struct _vect_peel_extended_info
1938 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1939 loop_vec_info loop_vinfo)
1941 struct _vect_peel_extended_info res;
1943 res.peel_info.dr_info = NULL;
1944 res.vinfo = loop_vinfo;
1946 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1948 res.inside_cost = INT_MAX;
1949 res.outside_cost = INT_MAX;
1950 peeling_htab->traverse <_vect_peel_extended_info *,
1951 vect_peeling_hash_get_lowest_cost> (&res);
1953 else
1955 res.peel_info.count = 0;
1956 peeling_htab->traverse <_vect_peel_extended_info *,
1957 vect_peeling_hash_get_most_frequent> (&res);
1958 res.inside_cost = 0;
1959 res.outside_cost = 0;
1962 return res;
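/* A compact sketch of the "most frequent" selection above, written with
   std::unordered_map instead of GCC's hash_table purely for
   illustration (the name and types are made up):

     static int
     example_most_frequent_npeel
       (const std::unordered_map<int, unsigned> &counts)
     {
       int best_npeel = 0;
       unsigned best_count = 0;
       for (const auto &entry : counts)
         if (entry.second > best_count
             || (entry.second == best_count && entry.first < best_npeel))
           {
             best_npeel = entry.first;
             best_count = entry.second;
           }
       return best_npeel;
     }

   Each candidate peeling amount maps to the number of data references
   it would align; higher counts win and the smaller NPEEL breaks ties,
   mirroring vect_peeling_hash_get_most_frequent.  */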
1965 /* Return true if the new peeling NPEEL is supported. */
1967 static bool
1968 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1969 unsigned npeel)
1971 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1972 enum dr_alignment_support supportable_dr_alignment;
1974 bool dr0_alignment_known_p
1975 = known_alignment_for_access_p (dr0_info,
1976 STMT_VINFO_VECTYPE (dr0_info->stmt));
1978 /* Ensure that all data refs can be vectorized after the peel. */
1979 for (data_reference *dr : datarefs)
1981 if (dr == dr0_info->dr)
1982 continue;
1984 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1985 if (!vect_relevant_for_alignment_p (dr_info)
1986 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1987 continue;
1989 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1990 int misalignment;
1991 unsigned HOST_WIDE_INT alignment;
1992 if (!dr0_alignment_known_p
1993 || !known_alignment_for_access_p (dr_info, vectype)
1994 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1995 misalignment = DR_MISALIGNMENT_UNKNOWN;
1996 else
1998 misalignment = dr_misalignment (dr_info, vectype);
1999 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2000 misalignment &= alignment - 1;
2002 supportable_dr_alignment
2003 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2004 misalignment);
2005 if (supportable_dr_alignment == dr_unaligned_unsupported)
2006 return false;
2009 return true;
2012 /* Compare two data-references DRA and DRB to group them into chunks
2013 with related alignment. */
2015 static int
2016 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2018 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2019 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2020 int cmp;
2022 /* Stabilize sort. */
2023 if (dra == drb)
2024 return 0;
2026 /* Ordering of DRs according to base. */
2027 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2028 DR_BASE_ADDRESS (drb));
2029 if (cmp != 0)
2030 return cmp;
2032 /* And according to DR_OFFSET. */
2033 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2034 if (cmp != 0)
2035 return cmp;
2037 /* And after step. */
2038 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2039 if (cmp != 0)
2040 return cmp;
2042 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2043 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2044 if (cmp == 0)
2045 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2046 return cmp;
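/* An illustrative stand-alone version of this comparison scheme
   (hypothetical struct with plain integer keys, assumed-unique UIDs):
   the keys are compared lexicographically and the statement UID breaks
   exact ties, so qsort, which is not stable, still yields a
   deterministic order.

     struct example_ref_key { long base, offset, step, init; unsigned uid; };

     static int
     example_ref_key_cmp (const example_ref_key &a, const example_ref_key &b)
     {
       if (a.base != b.base)
         return a.base < b.base ? -1 : 1;
       if (a.offset != b.offset)
         return a.offset < b.offset ? -1 : 1;
       if (a.step != b.step)
         return a.step < b.step ? -1 : 1;
       if (a.init != b.init)
         return a.init < b.init ? -1 : 1;
       return a.uid < b.uid ? -1 : 1;
     }  */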
2049 /* Function vect_enhance_data_refs_alignment
2051 This pass will use loop versioning and loop peeling in order to enhance
2052 the alignment of data references in the loop.
2054 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2055 original loop is to be vectorized. Any other loops that are created by
2056 the transformations performed in this pass are not supposed to be
2057 vectorized. This restriction will be relaxed.
2059 This pass will require a cost model to guide it whether to apply peeling
2060 or versioning or a combination of the two. For example, the scheme that
2061 Intel uses when given a loop with several memory accesses is as follows:
2062 choose one memory access ('p') whose alignment you want to force by doing
2063 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2064 other accesses are not necessarily aligned, or (2) use loop versioning to
2065 generate one loop in which all accesses are aligned, and another loop in
2066 which only 'p' is necessarily aligned.
2068 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2069 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2070 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2072 Devising a cost model is the most critical aspect of this work. It will
2073 guide us on which access to peel for, whether to use loop versioning, how
2074 many versions to create, etc. The cost model will probably consist of
2075 generic considerations as well as target specific considerations (on
2076 powerpc for example, misaligned stores are more painful than misaligned
2077 loads).
2079 Here are the general steps involved in alignment enhancements:
2081 -- original loop, before alignment analysis:
2082 for (i=0; i<N; i++){
2083 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2084 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2087 -- After vect_compute_data_refs_alignment:
2088 for (i=0; i<N; i++){
2089 x = q[i]; # DR_MISALIGNMENT(q) = 3
2090 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2093 -- Possibility 1: we do loop versioning:
2094 if (p is aligned) {
2095 for (i=0; i<N; i++){ # loop 1A
2096 x = q[i]; # DR_MISALIGNMENT(q) = 3
2097 p[i] = y; # DR_MISALIGNMENT(p) = 0
2100 else {
2101 for (i=0; i<N; i++){ # loop 1B
2102 x = q[i]; # DR_MISALIGNMENT(q) = 3
2103 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2107 -- Possibility 2: we do loop peeling:
2108 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2109 x = q[i];
2110 p[i] = y;
2112 for (i = 3; i < N; i++){ # loop 2A
2113 x = q[i]; # DR_MISALIGNMENT(q) = 0
2114 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2117 -- Possibility 3: combination of loop peeling and versioning:
2118 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2119 x = q[i];
2120 p[i] = y;
2122 if (p is aligned) {
2123 for (i = 3; i<N; i++){ # loop 3A
2124 x = q[i]; # DR_MISALIGNMENT(q) = 0
2125 p[i] = y; # DR_MISALIGNMENT(p) = 0
2128 else {
2129 for (i = 3; i<N; i++){ # loop 3B
2130 x = q[i]; # DR_MISALIGNMENT(q) = 0
2131 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2135 These loops are later passed to loop_transform to be vectorized. The
2136 vectorizer will use the alignment information to guide the transformation
2137 (whether to generate regular loads/stores, or with special handling for
2138 misalignment). */
2140 opt_result
2141 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2143 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2144 dr_vec_info *first_store = NULL;
2145 dr_vec_info *dr0_info = NULL;
2146 struct data_reference *dr;
2147 unsigned int i;
2148 bool do_peeling = false;
2149 bool do_versioning = false;
2150 unsigned int npeel = 0;
2151 bool one_misalignment_known = false;
2152 bool one_misalignment_unknown = false;
2153 bool one_dr_unsupportable = false;
2154 dr_vec_info *unsupportable_dr_info = NULL;
2155 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2156 hash_table<peel_info_hasher> peeling_htab (1);
2158 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2160 /* Reset data so we can safely be called multiple times. */
2161 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2162 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2164 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2165 return opt_result::success ();
2167 /* Sort the vector of datarefs so DRs that have the same or dependent
2168 alignment are next to each other. */
2169 auto_vec<data_reference_p> datarefs
2170 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2171 datarefs.qsort (dr_align_group_sort_cmp);
2173 /* Compute the number of DRs that become aligned when we peel
2174 a dataref so it becomes aligned. */
2175 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2176 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2177 unsigned i0;
2178 for (i0 = 0; i0 < datarefs.length (); ++i0)
2179 if (DR_BASE_ADDRESS (datarefs[i0]))
2180 break;
2181 for (i = i0 + 1; i <= datarefs.length (); ++i)
2183 if (i == datarefs.length ()
2184 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2185 DR_BASE_ADDRESS (datarefs[i]), 0)
2186 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2187 DR_OFFSET (datarefs[i]), 0)
2188 || !operand_equal_p (DR_STEP (datarefs[i0]),
2189 DR_STEP (datarefs[i]), 0))
2191 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2192 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2193 will get known misalignment if we align one of the refs
2194 with the largest DR_TARGET_ALIGNMENT. */
2195 for (unsigned j = i0; j < i; ++j)
2197 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2198 for (unsigned k = i0; k < i; ++k)
2200 if (k == j)
2201 continue;
2202 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2203 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2204 dr_infoj))
2205 n_same_align_refs[j]++;
2208 i0 = i;
2212 /* While cost model enhancements are expected in the future, the high level
2213 view of the code at this time is as follows:
2215 A) If there is a misaligned access then see if peeling to align
2216 this access can make all data references satisfy
2217 vect_supportable_dr_alignment. If so, update data structures
2218 as needed and return true.
2220 B) If peeling wasn't possible and there is a data reference with an
2221 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2222 then see if loop versioning checks can be used to make all data
2223 references satisfy vect_supportable_dr_alignment. If so, update
2224 data structures as needed and return true.
2226 C) If neither peeling nor versioning were successful then return false if
2227 any data reference does not satisfy vect_supportable_dr_alignment.
2229 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2231 Note, Possibility 3 above (which is peeling and versioning together) is not
2232 being done at this time. */
2234 /* (1) Peeling to force alignment. */
2236 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2237 Considerations:
2238 + How many accesses will become aligned due to the peeling
2239 - How many accesses will become unaligned due to the peeling,
2240 and the cost of misaligned accesses.
2241 - The cost of peeling (the extra runtime checks, the increase
2242 in code size). */
2244 FOR_EACH_VEC_ELT (datarefs, i, dr)
2246 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2247 if (!vect_relevant_for_alignment_p (dr_info))
2248 continue;
2250 stmt_vec_info stmt_info = dr_info->stmt;
2251 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2252 do_peeling = vector_alignment_reachable_p (dr_info);
2253 if (do_peeling)
2255 if (known_alignment_for_access_p (dr_info, vectype))
2257 unsigned int npeel_tmp = 0;
2258 bool negative = tree_int_cst_compare (DR_STEP (dr),
2259 size_zero_node) < 0;
2261 /* If known_alignment_for_access_p then we have set
2262 DR_MISALIGNMENT which is only done if we know it at compile
2263 time, so it is safe to assume target alignment is constant.
2265 unsigned int target_align =
2266 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2267 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2268 poly_int64 off = 0;
2269 if (negative)
2270 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2271 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2272 mis = negative ? mis : -mis;
2273 if (mis != 0)
2274 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2276 /* For multiple types, it is possible that the bigger type access
2277 will have more than one peeling option. E.g., a loop with two
2278 types: one of size (vector size / 4), and the other one of
2279 size (vector size / 8). Vectorization factor will be 8. If both
2280 accesses are misaligned by 3, the first one needs one scalar
2281 iteration to be aligned, and the second one needs 5. But the
2282 first one will be aligned also by peeling 5 scalar
2283 iterations, and in that case both accesses will be aligned.
2284 Hence, except for the immediate peeling amount, we also want
2285 to try adding a full vector size, as long as we don't exceed the
2286 vectorization factor.
2287 We do this automatically for cost model, since we calculate
2288 cost for every peeling option. */
2289 poly_uint64 nscalars = npeel_tmp;
2290 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2292 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2293 nscalars = (STMT_SLP_TYPE (stmt_info)
2294 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2297 /* Save info about DR in the hash table. Also include peeling
2298 amounts according to the explanation above. Indicate
2299 the alignment status when the ref is not aligned.
2300 ??? Rather than using unknown alignment here we should
2301 prune all entries from the peeling hashtable which cause
2302 DRs to be not supported. */
2303 bool supportable_if_not_aligned
2304 = vect_supportable_dr_alignment
2305 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2306 while (known_le (npeel_tmp, nscalars))
2308 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2309 dr_info, npeel_tmp,
2310 supportable_if_not_aligned);
2311 npeel_tmp += MAX (1, target_align / dr_size);
2314 one_misalignment_known = true;
2316 else
2318 /* If we don't know any misalignment values, we prefer
2319 peeling for the data-ref that has the maximum number of data-refs
2320 with the same alignment, unless the target prefers to align
2321 stores over loads. */
2322 unsigned same_align_drs = n_same_align_refs[i];
2323 if (!dr0_info
2324 || dr0_same_align_drs < same_align_drs)
2326 dr0_same_align_drs = same_align_drs;
2327 dr0_info = dr_info;
2329 /* For data-refs with the same number of related
2330 accesses prefer the one where the misalign
2331 computation will be invariant in the outermost loop. */
2332 else if (dr0_same_align_drs == same_align_drs)
2334 class loop *ivloop0, *ivloop;
2335 ivloop0 = outermost_invariant_loop_for_expr
2336 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2337 ivloop = outermost_invariant_loop_for_expr
2338 (loop, DR_BASE_ADDRESS (dr));
2339 if ((ivloop && !ivloop0)
2340 || (ivloop && ivloop0
2341 && flow_loop_nested_p (ivloop, ivloop0)))
2342 dr0_info = dr_info;
2345 one_misalignment_unknown = true;
2347 /* Check for data refs with unsupportable alignment that
2348 can be peeled. */
2349 enum dr_alignment_support supportable_dr_alignment
2350 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2351 DR_MISALIGNMENT_UNKNOWN);
2352 if (supportable_dr_alignment == dr_unaligned_unsupported)
2354 one_dr_unsupportable = true;
2355 unsupportable_dr_info = dr_info;
2358 if (!first_store && DR_IS_WRITE (dr))
2360 first_store = dr_info;
2361 first_store_same_align_drs = same_align_drs;
2365 else
2367 if (!aligned_access_p (dr_info, vectype))
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 "vector alignment may not be reachable\n");
2372 break;
2377 /* Check if we can possibly peel the loop. */
2378 if (!vect_can_advance_ivs_p (loop_vinfo)
2379 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2380 loop_preheader_edge (loop))
2381 || loop->inner
2382 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2383 do_peeling = false;
2385 struct _vect_peel_extended_info peel_for_known_alignment;
2386 struct _vect_peel_extended_info peel_for_unknown_alignment;
2387 struct _vect_peel_extended_info best_peel;
2389 peel_for_unknown_alignment.inside_cost = INT_MAX;
2390 peel_for_unknown_alignment.outside_cost = INT_MAX;
2391 peel_for_unknown_alignment.peel_info.count = 0;
2393 if (do_peeling
2394 && one_misalignment_unknown)
2396 /* Check whether the target prefers to align stores over loads, i.e., if
2397 misaligned stores are more expensive than misaligned loads (taking
2398 drs with same alignment into account). */
2399 unsigned int load_inside_cost = 0;
2400 unsigned int load_outside_cost = 0;
2401 unsigned int store_inside_cost = 0;
2402 unsigned int store_outside_cost = 0;
2403 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2405 stmt_vector_for_cost dummy;
2406 dummy.create (2);
2407 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2408 &load_inside_cost,
2409 &load_outside_cost,
2410 &dummy, &dummy, estimated_npeels);
2411 dummy.release ();
2413 if (first_store)
2415 dummy.create (2);
2416 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2417 &store_inside_cost,
2418 &store_outside_cost,
2419 &dummy, &dummy,
2420 estimated_npeels);
2421 dummy.release ();
2423 else
2425 store_inside_cost = INT_MAX;
2426 store_outside_cost = INT_MAX;
2429 if (load_inside_cost > store_inside_cost
2430 || (load_inside_cost == store_inside_cost
2431 && load_outside_cost > store_outside_cost))
2433 dr0_info = first_store;
2434 dr0_same_align_drs = first_store_same_align_drs;
2435 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2436 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2438 else
2440 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2441 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2444 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2445 prologue_cost_vec.create (2);
2446 epilogue_cost_vec.create (2);
2448 int dummy2;
2449 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2450 (loop_vinfo, estimated_npeels, &dummy2,
2451 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2452 &prologue_cost_vec, &epilogue_cost_vec);
2454 prologue_cost_vec.release ();
2455 epilogue_cost_vec.release ();
2457 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2460 peel_for_unknown_alignment.peel_info.npeel = 0;
2461 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2463 best_peel = peel_for_unknown_alignment;
2465 peel_for_known_alignment.inside_cost = INT_MAX;
2466 peel_for_known_alignment.outside_cost = INT_MAX;
2467 peel_for_known_alignment.peel_info.count = 0;
2468 peel_for_known_alignment.peel_info.dr_info = NULL;
2470 if (do_peeling && one_misalignment_known)
2472 /* Peeling is possible, but there is no data access that is not supported
2473 unless aligned. So we try to choose the best possible peeling from
2474 the hash table. */
2475 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2476 (&peeling_htab, loop_vinfo);
2479 /* Compare costs of peeling for known and unknown alignment. */
2480 if (peel_for_known_alignment.peel_info.dr_info != NULL
2481 && peel_for_unknown_alignment.inside_cost
2482 >= peel_for_known_alignment.inside_cost)
2484 best_peel = peel_for_known_alignment;
2486 /* If the best peeling for known alignment has NPEEL == 0, perform no
2487 peeling at all except if there is an unsupportable dr that we can
2488 align. */
2489 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2490 do_peeling = false;
2493 /* If there is an unsupportable data ref, prefer this over all choices so far
2494 since we'd have to discard a chosen peeling except when it accidentally
2495 aligned the unsupportable data ref. */
2496 if (one_dr_unsupportable)
2497 dr0_info = unsupportable_dr_info;
2498 else if (do_peeling)
2500 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2501 TODO: Use nopeel_outside_cost or get rid of it? */
2502 unsigned nopeel_inside_cost = 0;
2503 unsigned nopeel_outside_cost = 0;
2505 stmt_vector_for_cost dummy;
2506 dummy.create (2);
2507 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2508 &nopeel_outside_cost, &dummy, &dummy, 0);
2509 dummy.release ();
2511 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2512 costs will be recorded. */
2513 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2514 prologue_cost_vec.create (2);
2515 epilogue_cost_vec.create (2);
2517 int dummy2;
2518 nopeel_outside_cost += vect_get_known_peeling_cost
2519 (loop_vinfo, 0, &dummy2,
2520 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2521 &prologue_cost_vec, &epilogue_cost_vec);
2523 prologue_cost_vec.release ();
2524 epilogue_cost_vec.release ();
2526 npeel = best_peel.peel_info.npeel;
2527 dr0_info = best_peel.peel_info.dr_info;
2529 /* If not peeling is no more expensive than the best peeling found
2530 so far, don't perform any peeling. */
2531 if (nopeel_inside_cost <= best_peel.inside_cost)
2532 do_peeling = false;
2535 if (do_peeling)
2537 stmt_vec_info stmt_info = dr0_info->stmt;
2538 if (known_alignment_for_access_p (dr0_info,
2539 STMT_VINFO_VECTYPE (stmt_info)))
2541 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2542 size_zero_node) < 0;
2543 if (!npeel)
2545 /* Since it's known at compile time, compute the number of
2546 iterations in the peeled loop (the peeling factor) for use in
2547 updating DR_MISALIGNMENT values. The peeling factor is the
2548 vectorization factor minus the misalignment as an element
2549 count. */
2550 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2551 poly_int64 off = 0;
2552 if (negative)
2553 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2554 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2555 unsigned int mis
2556 = dr_misalignment (dr0_info, vectype, off);
2557 mis = negative ? mis : -mis;
2558 /* If known_alignment_for_access_p then we have set
2559 DR_MISALIGNMENT which is only done if we know it at compile
2560 time, so it is safe to assume target alignment is constant.
2562 unsigned int target_align =
2563 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2564 npeel = ((mis & (target_align - 1))
2565 / vect_get_scalar_dr_size (dr0_info));
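/* As a worked example (illustrative numbers only): with a 16-byte
   target alignment, 4-byte scalars and a positive-step access
   misaligned by 4 bytes, MIS becomes -4, (-4 & 15) = 12 and
   12 / 4 = 3 scalar iterations are peeled to reach alignment.  */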
2568 /* For interleaved data access every iteration accesses all the
2569 members of the group, therefore we divide the number of iterations
2570 by the group size. */
2571 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2572 npeel /= DR_GROUP_SIZE (stmt_info);
2574 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Try peeling by %d\n", npeel);
2579 /* Ensure that all datarefs can be vectorized after the peel. */
2580 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2581 do_peeling = false;
2583 /* Check if all datarefs are supportable and log. */
2584 if (do_peeling
2585 && npeel == 0
2586 && known_alignment_for_access_p (dr0_info,
2587 STMT_VINFO_VECTYPE (stmt_info)))
2588 return opt_result::success ();
2590 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2591 if (do_peeling)
2593 unsigned max_allowed_peel
2594 = param_vect_max_peeling_for_alignment;
2595 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2596 max_allowed_peel = 0;
2597 if (max_allowed_peel != (unsigned)-1)
2599 unsigned max_peel = npeel;
2600 if (max_peel == 0)
2602 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2603 unsigned HOST_WIDE_INT target_align_c;
2604 if (target_align.is_constant (&target_align_c))
2605 max_peel =
2606 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2607 else
2609 do_peeling = false;
2610 if (dump_enabled_p ())
2611 dump_printf_loc (MSG_NOTE, vect_location,
2612 "Disable peeling, max peels set and vector"
2613 " alignment unknown\n");
2616 if (max_peel > max_allowed_peel)
2618 do_peeling = false;
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_NOTE, vect_location,
2621 "Disable peeling, max peels reached: %d\n", max_peel);
2626 /* Cost model #2 - if peeling may result in a remaining loop not
2627 iterating enough to be vectorized then do not peel. Since this
2628 is a cost heuristic rather than a correctness decision, use the
2629 most likely runtime value for variable vectorization factors. */
2630 if (do_peeling
2631 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2633 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2634 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2635 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2636 < assumed_vf + max_peel)
2637 do_peeling = false;
2640 if (do_peeling)
2642 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2643 If the misalignment of DR_i is identical to that of dr0 then set
2644 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2645 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2646 by the peeling factor times the element size of DR_i (MOD the
2647 vectorization factor times the size). Otherwise, the
2648 misalignment of DR_i must be set to unknown. */
2649 FOR_EACH_VEC_ELT (datarefs, i, dr)
2650 if (dr != dr0_info->dr)
2652 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2653 if (!vect_relevant_for_alignment_p (dr_info))
2654 continue;
2656 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2659 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2660 if (npeel)
2661 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2662 else
2663 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2664 SET_DR_MISALIGNMENT (dr0_info,
2665 vect_dr_misalign_for_aligned_access (dr0_info));
2666 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "Alignment of access forced using peeling.\n");
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "Peeling for alignment will be applied.\n");
2674 /* The inside-loop cost will be accounted for in vectorizable_load
2675 and vectorizable_store correctly with adjusted alignments.
2676 Drop the body_cost_vec on the floor here. */
2677 return opt_result::success ();
2681 /* (2) Versioning to force alignment. */
2683 /* Try versioning if:
2684 1) optimize loop for speed and the cost-model is not cheap
2685 2) there is at least one unsupported misaligned data ref with an unknown
2686 misalignment, and
2687 3) all misaligned data refs with a known misalignment are supported, and
2688 4) the number of runtime alignment checks is within reason. */
2690 do_versioning
2691 = (optimize_loop_nest_for_speed_p (loop)
2692 && !loop->inner /* FORNOW */
2693 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2695 if (do_versioning)
2697 FOR_EACH_VEC_ELT (datarefs, i, dr)
2699 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2700 if (!vect_relevant_for_alignment_p (dr_info))
2701 continue;
2703 stmt_vec_info stmt_info = dr_info->stmt;
2704 if (STMT_VINFO_STRIDED_P (stmt_info))
2706 do_versioning = false;
2707 break;
2710 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2711 bool negative = tree_int_cst_compare (DR_STEP (dr),
2712 size_zero_node) < 0;
2713 poly_int64 off = 0;
2714 if (negative)
2715 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2716 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2717 int misalignment;
2718 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2719 continue;
2721 enum dr_alignment_support supportable_dr_alignment
2722 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2723 misalignment);
2724 if (supportable_dr_alignment == dr_unaligned_unsupported)
2726 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2727 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2728 >= (unsigned) param_vect_max_version_for_alignment_checks))
2730 do_versioning = false;
2731 break;
2734 /* At present we don't support versioning for alignment
2735 with variable VF, since there's no guarantee that the
2736 VF is a power of two. We could relax this if we added
2737 a way of enforcing a power-of-two size. */
2738 unsigned HOST_WIDE_INT size;
2739 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2741 do_versioning = false;
2742 break;
2745 /* Forcing alignment in the first iteration is no good if
2746 we don't keep it across iterations. For now, just disable
2747 versioning in this case.
2748 ?? We could actually unroll the loop to achieve the required
2749 overall step alignment, and forcing the alignment could be
2750 done by doing some iterations of the non-vectorized loop. */
2751 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2752 * DR_STEP_ALIGNMENT (dr),
2753 DR_TARGET_ALIGNMENT (dr_info)))
2755 do_versioning = false;
2756 break;
2759 /* The rightmost bits of an aligned address must be zeros.
2760 Construct the mask needed for this test. For example,
2761 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2762 mask must be 15 = 0xf. */
2763 int mask = size - 1;
2765 /* FORNOW: use the same mask to test all potentially unaligned
2766 references in the loop. */
2767 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2768 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2770 do_versioning = false;
2771 break;
2774 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2775 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2779 /* Versioning requires at least one misaligned data reference. */
2780 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2781 do_versioning = false;
2782 else if (!do_versioning)
2783 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2786 if (do_versioning)
2788 const vec<stmt_vec_info> &may_misalign_stmts
2789 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2790 stmt_vec_info stmt_info;
2792 /* It can now be assumed that the data references in the statements
2793 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2794 of the loop being vectorized. */
2795 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2797 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2798 SET_DR_MISALIGNMENT (dr_info,
2799 vect_dr_misalign_for_aligned_access (dr_info));
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_NOTE, vect_location,
2802 "Alignment of access forced using versioning.\n");
2805 if (dump_enabled_p ())
2806 dump_printf_loc (MSG_NOTE, vect_location,
2807 "Versioning for alignment will be applied.\n");
2809 /* Peeling and versioning can't be done together at this time. */
2810 gcc_assert (! (do_peeling && do_versioning));
2812 return opt_result::success ();
2815 /* This point is reached if neither peeling nor versioning is being done. */
2816 gcc_assert (! (do_peeling || do_versioning));
2818 return opt_result::success ();
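/* A minimal sketch of the run-time test that versioning for alignment
   relies on (hypothetical helper, plain pointers); it only illustrates
   the mask derived from the vector size above, e.g. 16-byte vectors
   give a mask of 0xf:

     static bool
     example_pointer_is_aligned_p (const void *p, uintptr_t mask)
     {
       return ((uintptr_t) p & mask) == 0;
     }

   The version of the loop that assumes alignment is entered only when
   every such test succeeds; otherwise the scalar copy of the loop
   runs.  */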
2822 /* Function vect_analyze_data_refs_alignment
2824 Analyze the alignment of the data-references in the loop.
2825 Return FALSE if a data reference is found that cannot be vectorized. */
2827 opt_result
2828 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2830 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2832 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2833 struct data_reference *dr;
2834 unsigned int i;
2836 vect_record_base_alignments (loop_vinfo);
2837 FOR_EACH_VEC_ELT (datarefs, i, dr)
2839 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2840 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2842 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2843 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2844 continue;
2845 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2846 STMT_VINFO_VECTYPE (dr_info->stmt));
2850 return opt_result::success ();
2854 /* Analyze alignment of DRs of stmts in NODE. */
2856 static bool
2857 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2859 /* Alignment is maintained in the first element of the group. */
2860 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2861 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2862 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2863 tree vectype = SLP_TREE_VECTYPE (node);
2864 poly_uint64 vector_alignment
2865 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2866 BITS_PER_UNIT);
2867 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2868 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2869 /* Re-analyze alignment when we're facing a vectorization with a bigger
2870 alignment requirement. */
2871 else if (known_lt (dr_info->target_alignment, vector_alignment))
2873 poly_uint64 old_target_alignment = dr_info->target_alignment;
2874 int old_misalignment = dr_info->misalignment;
2875 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2876 /* But keep knowledge about a smaller alignment. */
2877 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2878 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2880 dr_info->target_alignment = old_target_alignment;
2881 dr_info->misalignment = old_misalignment;
2884 /* If we ever face unordered target alignments the first one wins in terms
2885 of analyzing and the others will become unknown in dr_misalignment. */
2886 return true;
2889 /* Function vect_slp_analyze_instance_alignment
2891 Analyze the alignment of the data-references in the SLP instance.
2892 Return FALSE if a data reference is found that cannot be vectorized. */
2894 bool
2895 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2896 slp_instance instance)
2898 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2900 slp_tree node;
2901 unsigned i;
2902 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2903 if (! vect_slp_analyze_node_alignment (vinfo, node))
2904 return false;
2906 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2907 && ! vect_slp_analyze_node_alignment
2908 (vinfo, SLP_INSTANCE_TREE (instance)))
2909 return false;
2911 return true;
2915 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2916 accesses of legal size, step, etc. Detect gaps, single element
2917 interleaving, and other special cases. Set grouped access info.
2918 Collect groups of strided stores for further use in SLP analysis.
2919 Worker for vect_analyze_group_access. */
2921 static bool
2922 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2924 data_reference *dr = dr_info->dr;
2925 tree step = DR_STEP (dr);
2926 tree scalar_type = TREE_TYPE (DR_REF (dr));
2927 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2928 stmt_vec_info stmt_info = dr_info->stmt;
2929 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2930 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2931 HOST_WIDE_INT dr_step = -1;
2932 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2933 bool slp_impossible = false;
2935 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2936 size of the interleaving group (including gaps). */
2937 if (tree_fits_shwi_p (step))
2939 dr_step = tree_to_shwi (step);
2940 /* Check that STEP is a multiple of type size. Otherwise there is
2941 a non-element-sized gap at the end of the group which we
2942 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2943 ??? As we can handle non-constant step fine here we should
2944 simply remove uses of DR_GROUP_GAP between the last and first
2945 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2946 simply not include that gap. */
2947 if ((dr_step % type_size) != 0)
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "Step %T is not a multiple of the element size"
2952 " for %T\n",
2953 step, DR_REF (dr));
2954 return false;
2956 groupsize = absu_hwi (dr_step) / type_size;
2958 else
2959 groupsize = 0;
2961 /* A non-consecutive access is possible only if it is part of interleaving. */
2962 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2964 /* Check if this DR is a part of interleaving, and is a single
2965 element of the group that is accessed in the loop. */
2967 /* Gaps are supported only for loads. STEP must be a multiple of the type
2968 size. */
2969 if (DR_IS_READ (dr)
2970 && (dr_step % type_size) == 0
2971 && groupsize > 0
2972 /* This could be UINT_MAX but as we are generating code in a very
2973 inefficient way we have to cap earlier.
2974 See PR91403 for example. */
2975 && groupsize <= 4096)
2977 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2978 DR_GROUP_SIZE (stmt_info) = groupsize;
2979 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "Detected single element interleaving %T"
2983 " step %T\n",
2984 DR_REF (dr), step);
2986 return true;
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "not consecutive access %G", stmt_info->stmt);
2993 if (bb_vinfo)
2995 /* Mark the statement as unvectorizable. */
2996 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2997 return true;
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3002 STMT_VINFO_STRIDED_P (stmt_info) = true;
3003 return true;
3006 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3008 /* First stmt in the interleaving chain. Check the chain. */
3009 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3010 struct data_reference *data_ref = dr;
3011 unsigned int count = 1;
3012 tree prev_init = DR_INIT (data_ref);
3013 HOST_WIDE_INT diff, gaps = 0;
3015 /* By construction, all group members have INTEGER_CST DR_INITs. */
3016 while (next)
3018 /* We never have the same DR multiple times. */
3019 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3020 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3022 data_ref = STMT_VINFO_DATA_REF (next);
3024 /* All group members have the same STEP by construction. */
3025 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3027 /* Check that the distance between two accesses is equal to the type
3028 size. Otherwise, we have gaps. */
3029 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3030 - TREE_INT_CST_LOW (prev_init)) / type_size;
3031 if (diff < 1 || diff > UINT_MAX)
3033 /* For artificial testcases with array accesses with large
3034 constant indices we can run into overflow issues which
3035 can end up fooling the groupsize constraint below, so
3036 check the individual gaps (which are represented as
3037 unsigned int) as well. */
3038 if (dump_enabled_p ())
3039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040 "interleaved access with gap larger "
3041 "than representable\n");
3042 return false;
3044 if (diff != 1)
3046 /* FORNOW: SLP of accesses with gaps is not supported. */
3047 slp_impossible = true;
3048 if (DR_IS_WRITE (data_ref))
3050 if (dump_enabled_p ())
3051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3052 "interleaved store with gaps\n");
3053 return false;
3056 gaps += diff - 1;
3059 last_accessed_element += diff;
3061 /* Store the gap from the previous member of the group. If there is no
3062 gap in the access, DR_GROUP_GAP is always 1. */
3063 DR_GROUP_GAP (next) = diff;
3065 prev_init = DR_INIT (data_ref);
3066 next = DR_GROUP_NEXT_ELEMENT (next);
3067 /* Count the number of data-refs in the chain. */
3068 count++;
3071 if (groupsize == 0)
3072 groupsize = count + gaps;
3074 /* This could be UINT_MAX but as we are generating code in a very
3075 inefficient way we have to cap earlier. See PR78699 for example. */
3076 if (groupsize > 4096)
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 "group is too large\n");
3081 return false;
3084 /* Check that the size of the interleaving is equal to count for stores,
3085 i.e., that there are no gaps. */
3086 if (groupsize != count
3087 && !DR_IS_READ (dr))
3089 groupsize = count;
3090 STMT_VINFO_STRIDED_P (stmt_info) = true;
3093 /* If there is a gap after the last load in the group it is the
3094 difference between the groupsize and the last accessed
3095 element.
3096 When there is no gap, this difference should be 0. */
3097 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3099 DR_GROUP_SIZE (stmt_info) = groupsize;
3100 if (dump_enabled_p ())
3102 dump_printf_loc (MSG_NOTE, vect_location,
3103 "Detected interleaving ");
3104 if (DR_IS_READ (dr))
3105 dump_printf (MSG_NOTE, "load ");
3106 else if (STMT_VINFO_STRIDED_P (stmt_info))
3107 dump_printf (MSG_NOTE, "strided store ");
3108 else
3109 dump_printf (MSG_NOTE, "store ");
3110 dump_printf (MSG_NOTE, "of size %u\n",
3111 (unsigned)groupsize);
3112 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3113 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3114 while (next)
3116 if (DR_GROUP_GAP (next) != 1)
3117 dump_printf_loc (MSG_NOTE, vect_location,
3118 "\t<gap of %d elements>\n",
3119 DR_GROUP_GAP (next) - 1);
3120 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3121 next = DR_GROUP_NEXT_ELEMENT (next);
3123 if (DR_GROUP_GAP (stmt_info) != 0)
3124 dump_printf_loc (MSG_NOTE, vect_location,
3125 "\t<gap of %d elements>\n",
3126 DR_GROUP_GAP (stmt_info));
3129 /* SLP: create an SLP data structure for every interleaving group of
3130 stores for further analysis in vect_analyze_slp. */
3131 if (DR_IS_WRITE (dr) && !slp_impossible)
3133 if (loop_vinfo)
3134 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3135 if (bb_vinfo)
3136 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3140 return true;
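/* As an illustration of the grouping above (made-up accesses): loads
   a[4*i], a[4*i + 1] and a[4*i + 3] of 4-byte elements have DR_STEP 16,
   so GROUPSIZE = 16 / 4 = 4.  The DR_INIT differences are 1 and 2, so
   the second and third members get DR_GROUP_GAP 1 and 2, one element
   is missing, LAST_ACCESSED_ELEMENT becomes 4 and the gap recorded on
   the first statement is 4 - 4 = 0; SLP of the group is marked
   impossible because of the gap.  */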
3143 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3144 accesses of legal size, step, etc. Detect gaps, single element
3145 interleaving, and other special cases. Set grouped access info.
3146 Collect groups of strided stores for further use in SLP analysis. */
3148 static bool
3149 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3151 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3153 /* Dissolve the group if present. */
3154 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3155 while (stmt_info)
3157 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3158 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3159 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3160 stmt_info = next;
3162 return false;
3164 return true;
3167 /* Analyze the access pattern of the data-reference DR_INFO.
3168 In case of non-consecutive accesses call vect_analyze_group_access() to
3169 analyze groups of accesses. */
3171 static bool
3172 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3174 data_reference *dr = dr_info->dr;
3175 tree step = DR_STEP (dr);
3176 tree scalar_type = TREE_TYPE (DR_REF (dr));
3177 stmt_vec_info stmt_info = dr_info->stmt;
3178 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3179 class loop *loop = NULL;
3181 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3182 return true;
3184 if (loop_vinfo)
3185 loop = LOOP_VINFO_LOOP (loop_vinfo);
3187 if (loop_vinfo && !step)
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3191 "bad data-ref access in loop\n");
3192 return false;
3195 /* Allow loads with zero step in inner-loop vectorization. */
3196 if (loop_vinfo && integer_zerop (step))
3198 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3199 if (!nested_in_vect_loop_p (loop, stmt_info))
3200 return DR_IS_READ (dr);
3201 /* Allow references with zero step for outer loops marked
3202 with pragma omp simd only - it guarantees absence of
3203 loop-carried dependencies between inner loop iterations. */
3204 if (loop->safelen < 2)
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_NOTE, vect_location,
3208 "zero step in inner loop of nest\n");
3209 return false;
3213 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3215 /* Interleaved accesses are not yet supported within outer-loop
3216 vectorization for references in the inner-loop. */
3217 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3219 /* For the rest of the analysis we use the outer-loop step. */
3220 step = STMT_VINFO_DR_STEP (stmt_info);
3221 if (integer_zerop (step))
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "zero step in outer loop.\n");
3226 return DR_IS_READ (dr);
3230 /* Consecutive? */
3231 if (TREE_CODE (step) == INTEGER_CST)
3233 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3234 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3235 || (dr_step < 0
3236 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3238 /* Mark that it is not interleaving. */
3239 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3240 return true;
3244 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3246 if (dump_enabled_p ())
3247 dump_printf_loc (MSG_NOTE, vect_location,
3248 "grouped access in outer loop.\n");
3249 return false;
3253 /* Assume this is a DR handled by non-constant strided load case. */
3254 if (TREE_CODE (step) != INTEGER_CST)
3255 return (STMT_VINFO_STRIDED_P (stmt_info)
3256 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3257 || vect_analyze_group_access (vinfo, dr_info)));
3259 /* Non-consecutive access - check if it's part of an interleaving group. */
3260 return vect_analyze_group_access (vinfo, dr_info);
3263 /* Compare two data-references DRA and DRB to order them into chunks
3264 suitable for grouping. */
3266 static int
3267 dr_group_sort_cmp (const void *dra_, const void *drb_)
3269 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3270 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3271 data_reference_p dra = dra_info->dr;
3272 data_reference_p drb = drb_info->dr;
3273 int cmp;
3275 /* Stabilize sort. */
3276 if (dra == drb)
3277 return 0;
3279 /* DRs with different group IDs never belong to the same group. */
3280 if (dra_info->group != drb_info->group)
3281 return dra_info->group < drb_info->group ? -1 : 1;
3283 /* Ordering of DRs according to base. */
3284 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3285 DR_BASE_ADDRESS (drb));
3286 if (cmp != 0)
3287 return cmp;
3289 /* And according to DR_OFFSET. */
3290 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3291 if (cmp != 0)
3292 return cmp;
3294 /* Put reads before writes. */
3295 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3296 return DR_IS_READ (dra) ? -1 : 1;
3298 /* Then sort after access size. */
3299 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3300 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3301 if (cmp != 0)
3302 return cmp;
3304 /* And after step. */
3305 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3306 if (cmp != 0)
3307 return cmp;
3309 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3310 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3311 if (cmp == 0)
3312 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3313 return cmp;
3316 /* If OP is the result of a conversion, return the unconverted value,
3317 otherwise return null. */
3319 static tree
3320 strip_conversion (tree op)
3322 if (TREE_CODE (op) != SSA_NAME)
3323 return NULL_TREE;
3324 gimple *stmt = SSA_NAME_DEF_STMT (op);
3325 if (!is_gimple_assign (stmt)
3326 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3327 return NULL_TREE;
3328 return gimple_assign_rhs1 (stmt);
3331 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3332 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3333 be grouped in SLP mode. */
3335 static bool
3336 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3337 bool allow_slp_p)
3339 if (gimple_assign_single_p (stmt1_info->stmt))
3340 return gimple_assign_single_p (stmt2_info->stmt);
3342 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3343 if (call1 && gimple_call_internal_p (call1))
3345 /* Check for two masked loads or two masked stores. */
3346 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3347 if (!call2 || !gimple_call_internal_p (call2))
3348 return false;
3349 internal_fn ifn = gimple_call_internal_fn (call1);
3350 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3351 return false;
3352 if (ifn != gimple_call_internal_fn (call2))
3353 return false;
3355 /* Check that the masks are the same. Cope with casts of masks,
3356 like those created by build_mask_conversion. */
3357 tree mask1 = gimple_call_arg (call1, 2);
3358 tree mask2 = gimple_call_arg (call2, 2);
3359 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3361 mask1 = strip_conversion (mask1);
3362 if (!mask1)
3363 return false;
3364 mask2 = strip_conversion (mask2);
3365 if (!mask2)
3366 return false;
3367 if (!operand_equal_p (mask1, mask2, 0))
3368 return false;
3370 return true;
3373 return false;
3376 /* Function vect_analyze_data_ref_accesses.
3378 Analyze the access pattern of all the data references in the loop.
3380 FORNOW: the only access pattern that is considered vectorizable is a
3381 simple step 1 (consecutive) access.
3383 FORNOW: handle only arrays and pointer accesses. */
3385 opt_result
3386 vect_analyze_data_ref_accesses (vec_info *vinfo,
3387 vec<int> *dataref_groups)
3389 unsigned int i;
3390 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3392 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3394 if (datarefs.is_empty ())
3395 return opt_result::success ();
3397 /* Sort the array of datarefs to make building the interleaving chains
3398 linear. Don't modify the original vector's order; it is needed for
3399 determining what dependencies are reversed. */
3400 vec<dr_vec_info *> datarefs_copy;
3401 datarefs_copy.create (datarefs.length ());
3402 for (unsigned i = 0; i < datarefs.length (); i++)
3404 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3405 /* If the caller computed DR grouping use that, otherwise group by
3406 basic blocks. */
3407 if (dataref_groups)
3408 dr_info->group = (*dataref_groups)[i];
3409 else
3410 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3411 datarefs_copy.quick_push (dr_info);
3413 datarefs_copy.qsort (dr_group_sort_cmp);
3414 hash_set<stmt_vec_info> to_fixup;
3416 /* Build the interleaving chains. */
3417 for (i = 0; i < datarefs_copy.length () - 1;)
3419 dr_vec_info *dr_info_a = datarefs_copy[i];
3420 data_reference_p dra = dr_info_a->dr;
3421 int dra_group_id = dr_info_a->group;
3422 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3423 stmt_vec_info lastinfo = NULL;
3424 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3425 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3427 ++i;
3428 continue;
3430 for (i = i + 1; i < datarefs_copy.length (); ++i)
3432 dr_vec_info *dr_info_b = datarefs_copy[i];
3433 data_reference_p drb = dr_info_b->dr;
3434 int drb_group_id = dr_info_b->group;
3435 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3436 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3437 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3438 break;
3440	  /* ???  Imperfect sorting (non-compatible types, non-modulo
3441	     accesses, same accesses) can lead to a group being artificially
3442	     split here as we don't just skip over those.  If it really
3443	     matters we can push those to a worklist and re-iterate
3444	     over them.  Then we can just skip ahead to the next DR here.  */
3446 /* DRs in a different DR group should not be put into the same
3447 interleaving group. */
3448 if (dra_group_id != drb_group_id)
3449 break;
3451	  /* Check that the data-refs have the same first location (except init)
3452 and they are both either store or load (not load and store,
3453 not masked loads or stores). */
3454 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3455 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3456 DR_BASE_ADDRESS (drb)) != 0
3457 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3458 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3459 break;
3461 /* Check that the data-refs have the same constant size. */
3462 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3463 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3464 if (!tree_fits_uhwi_p (sza)
3465 || !tree_fits_uhwi_p (szb)
3466 || !tree_int_cst_equal (sza, szb))
3467 break;
3469 /* Check that the data-refs have the same step. */
3470 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3471 break;
3473 /* Check the types are compatible.
3474 ??? We don't distinguish this during sorting. */
3475 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3476 TREE_TYPE (DR_REF (drb))))
3477 break;
3479 /* Check that the DR_INITs are compile-time constants. */
3480 if (!tree_fits_shwi_p (DR_INIT (dra))
3481 || !tree_fits_shwi_p (DR_INIT (drb)))
3482 break;
3484 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3485 just hold extra information. */
3486 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3487 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3488 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3489 break;
3491 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3492 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3493 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3494 HOST_WIDE_INT init_prev
3495 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3496 gcc_assert (init_a <= init_b
3497 && init_a <= init_prev
3498 && init_prev <= init_b);
3500 /* Do not place the same access in the interleaving chain twice. */
3501 if (init_b == init_prev)
3503 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3504 < gimple_uid (DR_STMT (drb)));
3505 /* Simply link in duplicates and fix up the chain below. */
3507 else
3509 /* If init_b == init_a + the size of the type * k, we have an
3510 interleaving, and DRA is accessed before DRB. */
3511 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3512 if (type_size_a == 0
3513 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3514 % type_size_a != 0))
3515 break;
3517 /* If we have a store, the accesses are adjacent. This splits
3518 groups into chunks we support (we don't support vectorization
3519 of stores with gaps). */
3520 if (!DR_IS_READ (dra)
3521 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3522 != type_size_a))
3523 break;
3525	      /* If the step (when constant and nonzero) is smaller than the
3526		 difference between the data-refs' inits, this splits groups into
3527 suitable sizes. */
3528 if (tree_fits_shwi_p (DR_STEP (dra)))
3530 unsigned HOST_WIDE_INT step
3531 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3532 if (step != 0
3533 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3534 break;
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_NOTE, vect_location,
3540 DR_IS_READ (dra)
3541 ? "Detected interleaving load %T and %T\n"
3542 : "Detected interleaving store %T and %T\n",
3543 DR_REF (dra), DR_REF (drb));
3545 /* Link the found element into the group list. */
3546 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3548 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3549 lastinfo = stmtinfo_a;
3551 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3552 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3553 lastinfo = stmtinfo_b;
3555 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3556 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3558 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "Load suitable for SLP vectorization only.\n");
3562 if (init_b == init_prev
3563 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3564 && dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "Queuing group with duplicate access for fixup\n");
3570   /* Fixup groups with duplicate entries by splitting them.  */
3571 while (1)
3573 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3574 if (!(it != to_fixup.end ()))
3575 break;
3576 stmt_vec_info grp = *it;
3577 to_fixup.remove (grp);
3579 /* Find the earliest duplicate group member. */
3580 unsigned first_duplicate = -1u;
3581 stmt_vec_info next, g = grp;
3582 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3584 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3585 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3586 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3587 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3588 g = next;
3590 if (first_duplicate == -1U)
3591 continue;
3593 /* Then move all stmts after the first duplicate to a new group.
3594 Note this is a heuristic but one with the property that *it
3595 is fixed up completely. */
3596 g = grp;
3597 stmt_vec_info newgroup = NULL, ng = grp;
3598 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3600 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3602 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3603 if (!newgroup)
3604 newgroup = next;
3605 else
3606 DR_GROUP_NEXT_ELEMENT (ng) = next;
3607 ng = next;
3608 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3610 else
3611 g = DR_GROUP_NEXT_ELEMENT (g);
3613 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3615 /* Fixup the new group which still may contain duplicates. */
3616 to_fixup.add (newgroup);
3619 dr_vec_info *dr_info;
3620 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3622 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3623 && !vect_analyze_data_ref_access (vinfo, dr_info))
3625 if (dump_enabled_p ())
3626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3627 "not vectorized: complicated access pattern.\n");
3629 if (is_a <bb_vec_info> (vinfo))
3631 /* Mark the statement as not vectorizable. */
3632 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3633 continue;
3635 else
3637 datarefs_copy.release ();
3638 return opt_result::failure_at (dr_info->stmt->stmt,
3639 "not vectorized:"
3640 " complicated access pattern.\n");
3645 datarefs_copy.release ();
3646 return opt_result::success ();
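/* Worked example for the grouping above (loop and names invented):

     for (int i = 0; i < n; i++)
       {
	 q[2*i]     = a[i];
	 q[2*i + 1] = b[i];
       }

   The two stores to q share base, offset, element size and step and have
   DR_INITs 0 and 4 (4-byte elements), so they become one interleaving
   store group of size 2.  Stores that leave a gap, e.g. writing only
   q[3*i] and q[3*i + 2], are not chained together because stores with
   gaps are not vectorized.  */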
3649 /* Function vect_vfa_segment_size.
3651 Input:
3652 DR_INFO: The data reference.
3653 LENGTH_FACTOR: segment length to consider.
3655 Return a value suitable for the dr_with_seg_len::seg_len field.
3656 This is the "distance travelled" by the pointer from the first
3657 iteration in the segment to the last. Note that it does not include
3658 the size of the access; in effect it only describes the first byte. */
3660 static tree
3661 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3663 length_factor = size_binop (MINUS_EXPR,
3664 fold_convert (sizetype, length_factor),
3665 size_one_node);
3666 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3667 length_factor);
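/* Small worked example (numbers invented): with DR_STEP 4 and a
   LENGTH_FACTOR of 8 the value computed above is 4 * (8 - 1) = 28 bytes,
   i.e. the distance from the first byte accessed in the first iteration
   of the segment to the first byte accessed in the last one; the access
   size itself is accounted for separately by vect_vfa_access_size.  */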
3670 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3671 gives the worst-case number of bytes covered by the segment. */
3673 static unsigned HOST_WIDE_INT
3674 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3676 stmt_vec_info stmt_vinfo = dr_info->stmt;
3677 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3678 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3679 unsigned HOST_WIDE_INT access_size = ref_size;
3680 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3682 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3683 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3685 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3686 int misalignment;
3687 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3688 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3689 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3690 == dr_explicit_realign_optimized))
3692 /* We might access a full vector's worth. */
3693 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3695 return access_size;
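/* For example (numbers invented): a 4-byte scalar access that is the
   first element of an interleaving group of size 4 with no gap gives an
   access size of 4 * 4 = 16 bytes; the segment length from
   vect_vfa_segment_size plus this value bounds the byte range a runtime
   alias check has to cover.  */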
3698 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3699 describes. */
3701 static unsigned int
3702 vect_vfa_align (dr_vec_info *dr_info)
3704 return dr_alignment (dr_info->dr);
3707 /* Function vect_compile_time_alias.
3709 Given data references A and B with equal base and offset, see whether
3710 the alias relation can be decided at compilation time. Return 1 if
3711 it can and the references alias, 0 if it can and the references do
3712 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3713 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3714 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3716 static int
3717 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3718 tree segment_length_a, tree segment_length_b,
3719 unsigned HOST_WIDE_INT access_size_a,
3720 unsigned HOST_WIDE_INT access_size_b)
3722 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3723 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3724 poly_uint64 const_length_a;
3725 poly_uint64 const_length_b;
3727 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3728 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3729 [a, a+12) */
3730 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3732 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3733 offset_a -= const_length_a;
3735 else
3736 const_length_a = tree_to_poly_uint64 (segment_length_a);
3737 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3739 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3740 offset_b -= const_length_b;
3742 else
3743 const_length_b = tree_to_poly_uint64 (segment_length_b);
3745 const_length_a += access_size_a;
3746 const_length_b += access_size_b;
3748 if (ranges_known_overlap_p (offset_a, const_length_a,
3749 offset_b, const_length_b))
3750 return 1;
3752 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3753 offset_b, const_length_b))
3754 return 0;
3756 return -1;
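/* Worked example (constants invented): with positive steps, DR_INITs 0
   and 8, segment lengths of 12 bytes and access sizes of 4 bytes, the
   ranges compared above are [0, 16) and [8, 24), which are known to
   overlap, so the function returns 1.  If the second range instead
   started at byte 16 the ranges could not overlap and the result would
   be 0; when the overlap cannot be decided either way the result
   is -1.  */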
3759 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3760 in DDR is >= VF. */
3762 static bool
3763 dependence_distance_ge_vf (data_dependence_relation *ddr,
3764 unsigned int loop_depth, poly_uint64 vf)
3766 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3767 || DDR_NUM_DIST_VECTS (ddr) == 0)
3768 return false;
3770 /* If the dependence is exact, we should have limited the VF instead. */
3771 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3773 unsigned int i;
3774 lambda_vector dist_v;
3775 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3777 HOST_WIDE_INT dist = dist_v[loop_depth];
3778 if (dist != 0
3779 && !(dist > 0 && DDR_REVERSED_P (ddr))
3780 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3781 return false;
3784 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "dependence distance between %T and %T is >= VF\n",
3787 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3789 return true;
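/* Numeric illustration (values invented): if every recorded distance for
   the pair is 0, reversed-positive, or at least 8 in absolute value, then
   with a vectorization factor of 4 the function returns true and the
   caller can drop the runtime alias check for this pair; with VF 16 it
   returns false and the check stays.  */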
3792 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3794 static void
3795 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3797 dump_printf (dump_kind, "%s (%T) >= ",
3798 lower_bound.unsigned_p ? "unsigned" : "abs",
3799 lower_bound.expr);
3800 dump_dec (dump_kind, lower_bound.min_value);
3803 /* Record that the vectorized loop requires the vec_lower_bound described
3804 by EXPR, UNSIGNED_P and MIN_VALUE. */
3806 static void
3807 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3808 poly_uint64 min_value)
3810 vec<vec_lower_bound> &lower_bounds
3811 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3812 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3813 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3815 unsigned_p &= lower_bounds[i].unsigned_p;
3816 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3817 if (lower_bounds[i].unsigned_p != unsigned_p
3818 || maybe_lt (lower_bounds[i].min_value, min_value))
3820 lower_bounds[i].unsigned_p = unsigned_p;
3821 lower_bounds[i].min_value = min_value;
3822 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE, vect_location,
3825 "updating run-time check to ");
3826 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3827 dump_printf (MSG_NOTE, "\n");
3830 return;
3833 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3834 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3837 dump_lower_bound (MSG_NOTE, lower_bound);
3838 dump_printf (MSG_NOTE, "\n");
3840 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3843 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3844 will span fewer than GAP bytes. */
3846 static bool
3847 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3848 poly_int64 gap)
3850 stmt_vec_info stmt_info = dr_info->stmt;
3851 HOST_WIDE_INT count
3852 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3853 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3854 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3855 return (estimated_poly_value (gap)
3856 <= count * vect_get_scalar_dr_size (dr_info));
3859 /* Return true if we know that there is no alias between DR_INFO_A and
3860 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3861 When returning true, set *LOWER_BOUND_OUT to this N. */
3863 static bool
3864 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3865 poly_uint64 *lower_bound_out)
3867 /* Check that there is a constant gap of known sign between DR_A
3868 and DR_B. */
3869 data_reference *dr_a = dr_info_a->dr;
3870 data_reference *dr_b = dr_info_b->dr;
3871 poly_int64 init_a, init_b;
3872 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3873 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3874 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3875 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3876 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3877 || !ordered_p (init_a, init_b))
3878 return false;
3880 /* Sort DR_A and DR_B by the address they access. */
3881 if (maybe_lt (init_b, init_a))
3883 std::swap (init_a, init_b);
3884 std::swap (dr_info_a, dr_info_b);
3885 std::swap (dr_a, dr_b);
3888 /* If the two accesses could be dependent within a scalar iteration,
3889 make sure that we'd retain their order. */
3890 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3891 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3892 return false;
3894 /* There is no alias if abs (DR_STEP) is greater than or equal to
3895 the bytes spanned by the combination of the two accesses. */
3896 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3897 return true;
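/* Example of the bound computed above (names invented): for two 4-byte
   accesses p[i * s] and p[i * s + 1] with the same variable step, the
   sorted DR_INITs are 0 and 4, so *LOWER_BOUND_OUT becomes 4 + 4 - 0 = 8;
   the accesses cannot alias once abs (DR_STEP) is at least 8 bytes.  */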
3900 /* Function vect_prune_runtime_alias_test_list.
3902 Prune a list of ddrs to be tested at run-time by versioning for alias.
3903 Merge several alias checks into one if possible.
3904    Return FALSE if the resulting list of ddrs is longer than allowed by
3905 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3907 opt_result
3908 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3910 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3911 hash_set <tree_pair_hash> compared_objects;
3913 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3914 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3915 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3916 const vec<vec_object_pair> &check_unequal_addrs
3917 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3918 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3919 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3921 ddr_p ddr;
3922 unsigned int i;
3923 tree length_factor;
3925 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3927 /* Step values are irrelevant for aliasing if the number of vector
3928 iterations is equal to the number of scalar iterations (which can
3929 happen for fully-SLP loops). */
3930 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3932 if (!vf_one_p)
3934 /* Convert the checks for nonzero steps into bound tests. */
3935 tree value;
3936 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3937 vect_check_lower_bound (loop_vinfo, value, true, 1);
3940 if (may_alias_ddrs.is_empty ())
3941 return opt_result::success ();
3943 comp_alias_ddrs.create (may_alias_ddrs.length ());
3945 unsigned int loop_depth
3946 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3947 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3949 /* First, we collect all data ref pairs for aliasing checks. */
3950 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3952 poly_uint64 lower_bound;
3953 tree segment_length_a, segment_length_b;
3954 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3955 unsigned int align_a, align_b;
3957 /* Ignore the alias if the VF we chose ended up being no greater
3958 than the dependence distance. */
3959 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3960 continue;
3962 if (DDR_OBJECT_A (ddr))
3964 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3965 if (!compared_objects.add (new_pair))
3967 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_NOTE, vect_location,
3969 "checking that %T and %T"
3970 " have different addresses\n",
3971 new_pair.first, new_pair.second);
3972 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3974 continue;
3977 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3978 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3980 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3981 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3983 bool preserves_scalar_order_p
3984 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3985 bool ignore_step_p
3986 = (vf_one_p
3987 && (preserves_scalar_order_p
3988 || operand_equal_p (DR_STEP (dr_info_a->dr),
3989 DR_STEP (dr_info_b->dr))));
3991 /* Skip the pair if inter-iteration dependencies are irrelevant
3992 and intra-iteration dependencies are guaranteed to be honored. */
3993 if (ignore_step_p
3994 && (preserves_scalar_order_p
3995 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3996 &lower_bound)))
3998 if (dump_enabled_p ())
3999 dump_printf_loc (MSG_NOTE, vect_location,
4000 "no need for alias check between "
4001 "%T and %T when VF is 1\n",
4002 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4003 continue;
4006 /* See whether we can handle the alias using a bounds check on
4007 the step, and whether that's likely to be the best approach.
4008 (It might not be, for example, if the minimum step is much larger
4009 than the number of bytes handled by one vector iteration.) */
4010 if (!ignore_step_p
4011 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4012 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4013 &lower_bound)
4014 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4015 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4017 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4018 if (dump_enabled_p ())
4020 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4021 "%T and %T when the step %T is outside ",
4022 DR_REF (dr_info_a->dr),
4023 DR_REF (dr_info_b->dr),
4024 DR_STEP (dr_info_a->dr));
4025 if (unsigned_p)
4026 dump_printf (MSG_NOTE, "[0");
4027 else
4029 dump_printf (MSG_NOTE, "(");
4030 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4032 dump_printf (MSG_NOTE, ", ");
4033 dump_dec (MSG_NOTE, lower_bound);
4034 dump_printf (MSG_NOTE, ")\n");
4036 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4037 unsigned_p, lower_bound);
4038 continue;
4041 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4042 if (dr_group_first_a)
4044 stmt_info_a = dr_group_first_a;
4045 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4048 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4049 if (dr_group_first_b)
4051 stmt_info_b = dr_group_first_b;
4052 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4055 if (ignore_step_p)
4057 segment_length_a = size_zero_node;
4058 segment_length_b = size_zero_node;
4060 else
4062 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4063 DR_STEP (dr_info_b->dr), 0))
4064 length_factor = scalar_loop_iters;
4065 else
4066 length_factor = size_int (vect_factor);
4067 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4068 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4070 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4071 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4072 align_a = vect_vfa_align (dr_info_a);
4073 align_b = vect_vfa_align (dr_info_b);
4075 /* See whether the alias is known at compilation time. */
4076 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4077 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4078 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4079 DR_OFFSET (dr_info_b->dr), 0)
4080 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4081 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4082 && poly_int_tree_p (segment_length_a)
4083 && poly_int_tree_p (segment_length_b))
4085 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4086 segment_length_a,
4087 segment_length_b,
4088 access_size_a,
4089 access_size_b);
4090 if (res >= 0 && dump_enabled_p ())
4092 dump_printf_loc (MSG_NOTE, vect_location,
4093 "can tell at compile time that %T and %T",
4094 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4095 if (res == 0)
4096 dump_printf (MSG_NOTE, " do not alias\n");
4097 else
4098 dump_printf (MSG_NOTE, " alias\n");
4101 if (res == 0)
4102 continue;
4104 if (res == 1)
4105 return opt_result::failure_at (stmt_info_b->stmt,
4106 "not vectorized:"
4107 " compilation time alias: %G%G",
4108 stmt_info_a->stmt,
4109 stmt_info_b->stmt);
4112 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4113 access_size_a, align_a);
4114 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4115 access_size_b, align_b);
4116 /* Canonicalize the order to be the one that's needed for accurate
4117 RAW, WAR and WAW flags, in cases where the data references are
4118 well-ordered. The order doesn't really matter otherwise,
4119 but we might as well be consistent. */
4120 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4121 std::swap (dr_a, dr_b);
4123 dr_with_seg_len_pair_t dr_with_seg_len_pair
4124 (dr_a, dr_b, (preserves_scalar_order_p
4125 ? dr_with_seg_len_pair_t::WELL_ORDERED
4126 : dr_with_seg_len_pair_t::REORDERED));
4128 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4131 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4133 unsigned int count = (comp_alias_ddrs.length ()
4134 + check_unequal_addrs.length ());
4136 if (count
4137 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4138 == VECT_COST_MODEL_VERY_CHEAP))
4139 return opt_result::failure_at
4140 (vect_location, "would need a runtime alias check\n");
4142 if (dump_enabled_p ())
4143 dump_printf_loc (MSG_NOTE, vect_location,
4144 "improved number of alias checks from %d to %d\n",
4145 may_alias_ddrs.length (), count);
4146 unsigned limit = param_vect_max_version_for_alias_checks;
4147 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4148 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4149 if (count > limit)
4150 return opt_result::failure_at
4151 (vect_location,
4152 "number of versioning for alias run-time tests exceeds %d "
4153 "(--param vect-max-version-for-alias-checks)\n", limit);
4155 return opt_result::success ();
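/* Typical loop that ends up being versioned by the code above (function
   and names invented):

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
	 p[i] = q[i] + 1;
     }

   Nothing is known about p and q at compile time, so their ddr stays in
   LOOP_VINFO_MAY_ALIAS_DDRS and, unless one of the pruning steps above
   removes it, a segment-overlap test between the two accesses is emitted
   as a runtime guard.  Under the very-cheap cost model any remaining
   check instead makes vectorization fail.  */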
4158 /* Check whether we can use an internal function for a gather load
4159 or scatter store. READ_P is true for loads and false for stores.
4160 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4161 the type of the memory elements being loaded or stored. OFFSET_TYPE
4162 is the type of the offset that is being applied to the invariant
4163 base address. SCALE is the amount by which the offset should
4164 be multiplied *after* it has been converted to address width.
4166 Return true if the function is supported, storing the function id in
4167 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
4169 bool
4170 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4171 tree vectype, tree memory_type, tree offset_type,
4172 int scale, internal_fn *ifn_out,
4173 tree *offset_vectype_out)
4175 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4176 unsigned int element_bits = vector_element_bits (vectype);
4177 if (element_bits != memory_bits)
4178 /* For now the vector elements must be the same width as the
4179 memory elements. */
4180 return false;
4182 /* Work out which function we need. */
4183 internal_fn ifn, alt_ifn, alt_ifn2;
4184 if (read_p)
4186 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4187 alt_ifn = IFN_MASK_GATHER_LOAD;
4188       /* When the target supports MASK_LEN_GATHER_LOAD, we always
4189	  use MASK_LEN_GATHER_LOAD regardless of whether len and
4190	  mask are valid.  */
4191 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4193 else
4195 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4196 alt_ifn = IFN_MASK_SCATTER_STORE;
4197       /* When the target supports MASK_LEN_SCATTER_STORE, we always
4198	  use MASK_LEN_SCATTER_STORE regardless of whether len and
4199	  mask are valid.  */
4200 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4203 for (;;)
4205 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4206 if (!offset_vectype)
4207 return false;
4209 /* Test whether the target supports this combination. */
4210 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4211 offset_vectype, scale))
4213 *ifn_out = ifn;
4214 *offset_vectype_out = offset_vectype;
4215 return true;
4217 else if (!masked_p
4218 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4219 memory_type,
4220 offset_vectype,
4221 scale))
4223 *ifn_out = alt_ifn;
4224 *offset_vectype_out = offset_vectype;
4225 return true;
4227 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4228 memory_type,
4229 offset_vectype, scale))
4231 *ifn_out = alt_ifn2;
4232 *offset_vectype_out = offset_vectype;
4233 return true;
4236 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4237 && TYPE_PRECISION (offset_type) >= element_bits)
4238 return false;
4240 offset_type = build_nonstandard_integer_type
4241 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
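/* Illustration of the retry loop above (target capabilities and numbers
   invented): if the scalar offset type has 16 bits but the target only
   advertises a gather with 32-bit offsets for this vector type, the first
   round of internal_gather_scatter_fn_supported_p queries fails, the
   offset type is widened to 32 bits and the second round succeeds.
   Widening gives up once the precision has reached both POINTER_SIZE and
   the element width.  */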
4245 /* STMT_INFO is a call to an internal gather load or scatter store function.
4246 Describe the operation in INFO. */
4248 static void
4249 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4250 gather_scatter_info *info)
4252 gcall *call = as_a <gcall *> (stmt_info->stmt);
4253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4254 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4256 info->ifn = gimple_call_internal_fn (call);
4257 info->decl = NULL_TREE;
4258 info->base = gimple_call_arg (call, 0);
4259 info->offset = gimple_call_arg (call, 1);
4260 info->offset_dt = vect_unknown_def_type;
4261 info->offset_vectype = NULL_TREE;
4262 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4263 info->element_type = TREE_TYPE (vectype);
4264 info->memory_type = TREE_TYPE (DR_REF (dr));
4267 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4268 gather load or scatter store. Describe the operation in *INFO if so. */
4270 bool
4271 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4272 gather_scatter_info *info)
4274 HOST_WIDE_INT scale = 1;
4275 poly_int64 pbitpos, pbitsize;
4276 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4277 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4278 tree offtype = NULL_TREE;
4279 tree decl = NULL_TREE, base, off;
4280 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4281 tree memory_type = TREE_TYPE (DR_REF (dr));
4282 machine_mode pmode;
4283 int punsignedp, reversep, pvolatilep = 0;
4284 internal_fn ifn;
4285 tree offset_vectype;
4286 bool masked_p = false;
4288 /* See whether this is already a call to a gather/scatter internal function.
4289 If not, see whether it's a masked load or store. */
4290 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4291 if (call && gimple_call_internal_p (call))
4293 ifn = gimple_call_internal_fn (call);
4294 if (internal_gather_scatter_fn_p (ifn))
4296 vect_describe_gather_scatter_call (stmt_info, info);
4297 return true;
4299 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4302 /* True if we should aim to use internal functions rather than
4303 built-in functions. */
4304 bool use_ifn_p = (DR_IS_READ (dr)
4305 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4306 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4308 base = DR_REF (dr);
4309 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4310 see if we can use the def stmt of the address. */
4311 if (masked_p
4312 && TREE_CODE (base) == MEM_REF
4313 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4314 && integer_zerop (TREE_OPERAND (base, 1))
4315 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4317 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4318 if (is_gimple_assign (def_stmt)
4319 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4320 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4323 /* The gather and scatter builtins need address of the form
4324 loop_invariant + vector * {1, 2, 4, 8}
4326      or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4327 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4328 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4329 multiplications and additions in it. To get a vector, we need
4330 a single SSA_NAME that will be defined in the loop and will
4331 contain everything that is not loop invariant and that can be
4332      vectorized.  The following code attempts to find such a preexisting
4333 SSA_NAME OFF and put the loop invariants into a tree BASE
4334 that can be gimplified before the loop. */
4335 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4336 &punsignedp, &reversep, &pvolatilep);
4337 if (reversep)
4338 return false;
4340 /* PR 107346. Packed structs can have fields at offsets that are not
4341 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4342 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4343 return false;
4345 /* We need to be able to form an address to the base which for example
4346 isn't possible for hard registers. */
4347 if (may_be_nonaddressable_p (base))
4348 return false;
4350 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4352 if (TREE_CODE (base) == MEM_REF)
4354 if (!integer_zerop (TREE_OPERAND (base, 1)))
4356 if (off == NULL_TREE)
4357 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4358 else
4359 off = size_binop (PLUS_EXPR, off,
4360 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4362 base = TREE_OPERAND (base, 0);
4364 else
4365 base = build_fold_addr_expr (base);
4367 if (off == NULL_TREE)
4368 off = size_zero_node;
4370   /* If base is not loop invariant, then if off is 0 we start with just
4371      the constant offset in the loop invariant BASE and continue with base
4372      as OFF; otherwise give up.
4373 We could handle that case by gimplifying the addition of base + off
4374 into some SSA_NAME and use that as off, but for now punt. */
4375 if (!expr_invariant_in_loop_p (loop, base))
4377 if (!integer_zerop (off))
4378 return false;
4379 off = base;
4380 base = size_int (pbytepos);
4382 /* Otherwise put base + constant offset into the loop invariant BASE
4383 and continue with OFF. */
4384 else
4386 base = fold_convert (sizetype, base);
4387 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4390 /* OFF at this point may be either a SSA_NAME or some tree expression
4391 from get_inner_reference. Try to peel off loop invariants from it
4392 into BASE as long as possible. */
4393 STRIP_NOPS (off);
4394 while (offtype == NULL_TREE)
4396 enum tree_code code;
4397 tree op0, op1, add = NULL_TREE;
4399 if (TREE_CODE (off) == SSA_NAME)
4401 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4403 if (expr_invariant_in_loop_p (loop, off))
4404 return false;
4406 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4407 break;
4409 op0 = gimple_assign_rhs1 (def_stmt);
4410 code = gimple_assign_rhs_code (def_stmt);
4411 op1 = gimple_assign_rhs2 (def_stmt);
4413 else
4415 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4416 return false;
4417 code = TREE_CODE (off);
4418 extract_ops_from_tree (off, &code, &op0, &op1);
4420 switch (code)
4422 case POINTER_PLUS_EXPR:
4423 case PLUS_EXPR:
4424 if (expr_invariant_in_loop_p (loop, op0))
4426 add = op0;
4427 off = op1;
4428 do_add:
4429 add = fold_convert (sizetype, add);
4430 if (scale != 1)
4431 add = size_binop (MULT_EXPR, add, size_int (scale));
4432 base = size_binop (PLUS_EXPR, base, add);
4433 continue;
4435 if (expr_invariant_in_loop_p (loop, op1))
4437 add = op1;
4438 off = op0;
4439 goto do_add;
4441 break;
4442 case MINUS_EXPR:
4443 if (expr_invariant_in_loop_p (loop, op1))
4445 add = fold_convert (sizetype, op1);
4446 add = size_binop (MINUS_EXPR, size_zero_node, add);
4447 off = op0;
4448 goto do_add;
4450 break;
4451 case MULT_EXPR:
4452 if (scale == 1 && tree_fits_shwi_p (op1))
4454 int new_scale = tree_to_shwi (op1);
4455 /* Only treat this as a scaling operation if the target
4456 supports it for at least some offset type. */
4457 if (use_ifn_p
4458 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4459 masked_p, vectype, memory_type,
4460 signed_char_type_node,
4461 new_scale, &ifn,
4462 &offset_vectype)
4463 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4464 masked_p, vectype, memory_type,
4465 unsigned_char_type_node,
4466 new_scale, &ifn,
4467 &offset_vectype))
4468 break;
4469 scale = new_scale;
4470 off = op0;
4471 continue;
4473 break;
4474 case SSA_NAME:
4475 off = op0;
4476 continue;
4477 CASE_CONVERT:
4478 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4479 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4480 break;
4482 /* Don't include the conversion if the target is happy with
4483 the current offset type. */
4484 if (use_ifn_p
4485 && TREE_CODE (off) == SSA_NAME
4486 && !POINTER_TYPE_P (TREE_TYPE (off))
4487 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4488 masked_p, vectype, memory_type,
4489 TREE_TYPE (off), scale, &ifn,
4490 &offset_vectype))
4491 break;
4493 if (TYPE_PRECISION (TREE_TYPE (op0))
4494 == TYPE_PRECISION (TREE_TYPE (off)))
4496 off = op0;
4497 continue;
4500 /* Include the conversion if it is widening and we're using
4501	     the IFN path, or the target can handle the converted-from
4502	     offset, or the current size is not already the same as the
4503 data vector element size. */
4504 if ((TYPE_PRECISION (TREE_TYPE (op0))
4505 < TYPE_PRECISION (TREE_TYPE (off)))
4506 && (use_ifn_p
4507 || (DR_IS_READ (dr)
4508 ? (targetm.vectorize.builtin_gather
4509 && targetm.vectorize.builtin_gather (vectype,
4510 TREE_TYPE (op0),
4511 scale))
4512 : (targetm.vectorize.builtin_scatter
4513 && targetm.vectorize.builtin_scatter (vectype,
4514 TREE_TYPE (op0),
4515 scale)))
4516 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4517 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4519 off = op0;
4520 offtype = TREE_TYPE (off);
4521 STRIP_NOPS (off);
4522 continue;
4524 break;
4525 default:
4526 break;
4528 break;
4531 /* If at the end OFF still isn't a SSA_NAME or isn't
4532 defined in the loop, punt. */
4533 if (TREE_CODE (off) != SSA_NAME
4534 || expr_invariant_in_loop_p (loop, off))
4535 return false;
4537 if (offtype == NULL_TREE)
4538 offtype = TREE_TYPE (off);
4540 if (use_ifn_p)
4542 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4543 vectype, memory_type, offtype, scale,
4544 &ifn, &offset_vectype))
4545 ifn = IFN_LAST;
4546 decl = NULL_TREE;
4548 else
4550 if (DR_IS_READ (dr))
4552 if (targetm.vectorize.builtin_gather)
4553 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4555 else
4557 if (targetm.vectorize.builtin_scatter)
4558 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4560 ifn = IFN_LAST;
4561 /* The offset vector type will be read from DECL when needed. */
4562 offset_vectype = NULL_TREE;
4565 info->ifn = ifn;
4566 info->decl = decl;
4567 info->base = base;
4568 info->offset = off;
4569 info->offset_dt = vect_unknown_def_type;
4570 info->offset_vectype = offset_vectype;
4571 info->scale = scale;
4572 info->element_type = TREE_TYPE (vectype);
4573 info->memory_type = memory_type;
4574 return true;
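/* Example of a reference the code above decomposes (loop and names
   invented):

     for (int i = 0; i < n; i++)
       out[i] = base[idx[i]];

   The load from base[idx[i]] has no affine DR_STEP, but it fits the
   gather form: INFO->base ends up as the loop-invariant part of the
   address (essentially the pointer base), INFO->offset as the SSA name
   holding idx[i] (possibly after peeling a widening conversion), and
   INFO->scale as the element size picked up from the MULT_EXPR in the
   address calculation, e.g. 4 for int.  */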
4577 /* Find the data references in STMT, analyze them with respect to LOOP and
4578 append them to DATAREFS. Return false if datarefs in this stmt cannot
4579 be handled. */
4581 opt_result
4582 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4583 vec<data_reference_p> *datarefs,
4584 vec<int> *dataref_groups, int group_id)
4586 /* We can ignore clobbers for dataref analysis - they are removed during
4587 loop vectorization and BB vectorization checks dependences with a
4588 stmt walk. */
4589 if (gimple_clobber_p (stmt))
4590 return opt_result::success ();
4592 if (gimple_has_volatile_ops (stmt))
4593 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4594 stmt);
4596 if (stmt_can_throw_internal (cfun, stmt))
4597 return opt_result::failure_at (stmt,
4598 "not vectorized:"
4599 " statement can throw an exception: %G",
4600 stmt);
4602 auto_vec<data_reference_p, 2> refs;
4603 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4604 if (!res)
4605 return res;
4607 if (refs.is_empty ())
4608 return opt_result::success ();
4610 if (refs.length () > 1)
4612 while (!refs.is_empty ())
4613 free_data_ref (refs.pop ());
4614 return opt_result::failure_at (stmt,
4615 "not vectorized: more than one "
4616 "data ref in stmt: %G", stmt);
4619 data_reference_p dr = refs.pop ();
4620 if (gcall *call = dyn_cast <gcall *> (stmt))
4621 if (!gimple_call_internal_p (call)
4622 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4623 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4625 free_data_ref (dr);
4626 return opt_result::failure_at (stmt,
4627 "not vectorized: dr in a call %G", stmt);
4630 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4631 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4633 free_data_ref (dr);
4634 return opt_result::failure_at (stmt,
4635 "not vectorized:"
4636 " statement is an unsupported"
4637 " bitfield access %G", stmt);
4640 if (DR_BASE_ADDRESS (dr)
4641 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4643 free_data_ref (dr);
4644 return opt_result::failure_at (stmt,
4645 "not vectorized:"
4646 " base addr of dr is a constant\n");
4649 /* Check whether this may be a SIMD lane access and adjust the
4650 DR to make it easier for us to handle it. */
4651 if (loop
4652 && loop->simduid
4653 && (!DR_BASE_ADDRESS (dr)
4654 || !DR_OFFSET (dr)
4655 || !DR_INIT (dr)
4656 || !DR_STEP (dr)))
4658 struct data_reference *newdr
4659 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4660 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4661 if (DR_BASE_ADDRESS (newdr)
4662 && DR_OFFSET (newdr)
4663 && DR_INIT (newdr)
4664 && DR_STEP (newdr)
4665 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4666 && integer_zerop (DR_STEP (newdr)))
4668 tree base_address = DR_BASE_ADDRESS (newdr);
4669 tree off = DR_OFFSET (newdr);
4670 tree step = ssize_int (1);
4671 if (integer_zerop (off)
4672 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4674 off = TREE_OPERAND (base_address, 1);
4675 base_address = TREE_OPERAND (base_address, 0);
4677 STRIP_NOPS (off);
4678 if (TREE_CODE (off) == MULT_EXPR
4679 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4681 step = TREE_OPERAND (off, 1);
4682 off = TREE_OPERAND (off, 0);
4683 STRIP_NOPS (off);
4685 if (CONVERT_EXPR_P (off)
4686 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4687 < TYPE_PRECISION (TREE_TYPE (off))))
4688 off = TREE_OPERAND (off, 0);
4689 if (TREE_CODE (off) == SSA_NAME)
4691 gimple *def = SSA_NAME_DEF_STMT (off);
4692 /* Look through widening conversion. */
4693 if (is_gimple_assign (def)
4694 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4696 tree rhs1 = gimple_assign_rhs1 (def);
4697 if (TREE_CODE (rhs1) == SSA_NAME
4698 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4699 && (TYPE_PRECISION (TREE_TYPE (off))
4700 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4701 def = SSA_NAME_DEF_STMT (rhs1);
4703 if (is_gimple_call (def)
4704 && gimple_call_internal_p (def)
4705 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4707 tree arg = gimple_call_arg (def, 0);
4708 tree reft = TREE_TYPE (DR_REF (newdr));
4709 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4710 arg = SSA_NAME_VAR (arg);
4711 if (arg == loop->simduid
4712 /* For now. */
4713 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4715 DR_BASE_ADDRESS (newdr) = base_address;
4716 DR_OFFSET (newdr) = ssize_int (0);
4717 DR_STEP (newdr) = step;
4718 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4719 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4720 /* Mark as simd-lane access. */
4721 tree arg2 = gimple_call_arg (def, 1);
4722 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4723 free_data_ref (dr);
4724 datarefs->safe_push (newdr);
4725 if (dataref_groups)
4726 dataref_groups->safe_push (group_id);
4727 return opt_result::success ();
4732 free_data_ref (newdr);
4735 datarefs->safe_push (dr);
4736 if (dataref_groups)
4737 dataref_groups->safe_push (group_id);
4738 return opt_result::success ();
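/* The SIMD-lane special case above is aimed at references like the
   following rough sketch, where D.arr, _lane and the constant arguments
   are placeholders:

     _lane = .GOMP_SIMD_LANE (simduid.0_1, 0);
     D.arr[_lane] = ...;

   Such a DR has no usable step, but once the .GOMP_SIMD_LANE definition
   of the index is recognised the replacement DR gets offset 0, a step
   equal to the element size and the simd-lane marker stored in its aux
   field, which vect_analyze_data_refs later turns into
   STMT_VINFO_SIMD_LANE_ACCESS_P.  */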
4741 /* Function vect_analyze_data_refs.
4743 Find all the data references in the loop or basic block.
4745 The general structure of the analysis of data refs in the vectorizer is as
4746 follows:
4747 1- vect_analyze_data_refs(loop/bb): call
4748 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4749 in the loop/bb and their dependences.
4750 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4751 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4752 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4756 opt_result
4757 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4759 class loop *loop = NULL;
4760 unsigned int i;
4761 struct data_reference *dr;
4762 tree scalar_type;
4764 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4766 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4767 loop = LOOP_VINFO_LOOP (loop_vinfo);
4769 /* Go through the data-refs, check that the analysis succeeded. Update
4770 pointer from stmt_vec_info struct to DR and vectype. */
4772 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4773 FOR_EACH_VEC_ELT (datarefs, i, dr)
4775 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4776 poly_uint64 vf;
4778 gcc_assert (DR_REF (dr));
4779 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4780 gcc_assert (!stmt_info->dr_aux.dr);
4781 stmt_info->dr_aux.dr = dr;
4782 stmt_info->dr_aux.stmt = stmt_info;
4784 /* Check that analysis of the data-ref succeeded. */
4785 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4786 || !DR_STEP (dr))
4788 bool maybe_gather
4789 = DR_IS_READ (dr)
4790 && !TREE_THIS_VOLATILE (DR_REF (dr));
4791 bool maybe_scatter
4792 = DR_IS_WRITE (dr)
4793 && !TREE_THIS_VOLATILE (DR_REF (dr));
4795	  /* If the target supports vector gather loads or scatter stores,
4796	     see if they can be used.  */
4797 if (is_a <loop_vec_info> (vinfo)
4798 && !nested_in_vect_loop_p (loop, stmt_info))
4800 if (maybe_gather || maybe_scatter)
4802 if (maybe_gather)
4803 gatherscatter = GATHER;
4804 else
4805 gatherscatter = SCATTER;
4809 if (gatherscatter == SG_NONE)
4811 if (dump_enabled_p ())
4812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4813 "not vectorized: data ref analysis "
4814 "failed %G", stmt_info->stmt);
4815 if (is_a <bb_vec_info> (vinfo))
4817 /* In BB vectorization the ref can still participate
4818 in dependence analysis, we just can't vectorize it. */
4819 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4820 continue;
4822 return opt_result::failure_at (stmt_info->stmt,
4823 "not vectorized:"
4824 " data ref analysis failed: %G",
4825 stmt_info->stmt);
4829 /* See if this was detected as SIMD lane access. */
4830 if (dr->aux == (void *)-1
4831 || dr->aux == (void *)-2
4832 || dr->aux == (void *)-3
4833 || dr->aux == (void *)-4)
4835 if (nested_in_vect_loop_p (loop, stmt_info))
4836 return opt_result::failure_at (stmt_info->stmt,
4837 "not vectorized:"
4838 " data ref analysis failed: %G",
4839 stmt_info->stmt);
4840 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4841 = -(uintptr_t) dr->aux;
4844 tree base = get_base_address (DR_REF (dr));
4845 if (base && VAR_P (base) && DECL_NONALIASED (base))
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4849 "not vectorized: base object not addressable "
4850 "for stmt: %G", stmt_info->stmt);
4851 if (is_a <bb_vec_info> (vinfo))
4853 /* In BB vectorization the ref can still participate
4854 in dependence analysis, we just can't vectorize it. */
4855 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4856 continue;
4858 return opt_result::failure_at (stmt_info->stmt,
4859 "not vectorized: base object not"
4860 " addressable for stmt: %G",
4861 stmt_info->stmt);
4864 if (is_a <loop_vec_info> (vinfo)
4865 && DR_STEP (dr)
4866 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4868 if (nested_in_vect_loop_p (loop, stmt_info))
4869 return opt_result::failure_at (stmt_info->stmt,
4870 "not vectorized: "
4871 "not suitable for strided load %G",
4872 stmt_info->stmt);
4873 STMT_VINFO_STRIDED_P (stmt_info) = true;
4876 /* Update DR field in stmt_vec_info struct. */
4878       /* If the dataref is in an inner-loop of the loop that is considered
4879	  for vectorization, we also want to analyze the access relative to
4880 the outer-loop (DR contains information only relative to the
4881 inner-most enclosing loop). We do that by building a reference to the
4882 first location accessed by the inner-loop, and analyze it relative to
4883 the outer-loop. */
4884 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4886 /* Build a reference to the first location accessed by the
4887 inner loop: *(BASE + INIT + OFFSET). By construction,
4888 this address must be invariant in the inner loop, so we
4889 can consider it as being used in the outer loop. */
4890 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4891 tree offset = unshare_expr (DR_OFFSET (dr));
4892 tree init = unshare_expr (DR_INIT (dr));
4893 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4894 init, offset);
4895 tree init_addr = fold_build_pointer_plus (base, init_offset);
4896 tree init_ref = build_fold_indirect_ref (init_addr);
4898 if (dump_enabled_p ())
4899 dump_printf_loc (MSG_NOTE, vect_location,
4900 "analyze in outer loop: %T\n", init_ref);
4902 opt_result res
4903 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4904 init_ref, loop, stmt_info->stmt);
4905 if (!res)
4906 /* dr_analyze_innermost already explained the failure. */
4907 return res;
4909 if (dump_enabled_p ())
4910 dump_printf_loc (MSG_NOTE, vect_location,
4911 "\touter base_address: %T\n"
4912 "\touter offset from base address: %T\n"
4913 "\touter constant offset from base address: %T\n"
4914 "\touter step: %T\n"
4915 "\touter base alignment: %d\n\n"
4916 "\touter base misalignment: %d\n"
4917 "\touter offset alignment: %d\n"
4918 "\touter step alignment: %d\n",
4919 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4920 STMT_VINFO_DR_OFFSET (stmt_info),
4921 STMT_VINFO_DR_INIT (stmt_info),
4922 STMT_VINFO_DR_STEP (stmt_info),
4923 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4924 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4925 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4926 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4929 /* Set vectype for STMT. */
4930 scalar_type = TREE_TYPE (DR_REF (dr));
4931 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4932 if (!vectype)
4934 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4937 "not vectorized: no vectype for stmt: %G",
4938 stmt_info->stmt);
4939 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4940 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4941 scalar_type);
4942 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4945 if (is_a <bb_vec_info> (vinfo))
4947 /* No vector type is fine, the ref can still participate
4948 in dependence analysis, we just can't vectorize it. */
4949 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4950 continue;
4952 if (fatal)
4953 *fatal = false;
4954 return opt_result::failure_at (stmt_info->stmt,
4955 "not vectorized:"
4956 " no vectype for stmt: %G"
4957 " scalar_type: %T\n",
4958 stmt_info->stmt, scalar_type);
4960 else
4962 if (dump_enabled_p ())
4963 dump_printf_loc (MSG_NOTE, vect_location,
4964 "got vectype for stmt: %G%T\n",
4965 stmt_info->stmt, vectype);
4968 /* Adjust the minimal vectorization factor according to the
4969 vector type. */
4970 vf = TYPE_VECTOR_SUBPARTS (vectype);
4971 *min_vf = upper_bound (*min_vf, vf);
4973 /* Leave the BB vectorizer to pick the vector type later, based on
4974 the final dataref group size and SLP node size. */
4975 if (is_a <loop_vec_info> (vinfo))
4976 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4978 if (gatherscatter != SG_NONE)
4980 gather_scatter_info gs_info;
4981 if (!vect_check_gather_scatter (stmt_info,
4982 as_a <loop_vec_info> (vinfo),
4983 &gs_info)
4984 || !get_vectype_for_scalar_type (vinfo,
4985 TREE_TYPE (gs_info.offset)))
4987 if (fatal)
4988 *fatal = false;
4989 return opt_result::failure_at
4990 (stmt_info->stmt,
4991 (gatherscatter == GATHER)
4992 ? "not vectorized: not suitable for gather load %G"
4993 : "not vectorized: not suitable for scatter store %G",
4994 stmt_info->stmt);
4996 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5000 /* We used to stop processing and prune the list here. Verify we no
5001 longer need to. */
5002 gcc_assert (i == datarefs.length ());
5004 return opt_result::success ();
5008 /* Function vect_get_new_vect_var.
5010 Returns a name for a new variable. The current naming scheme appends the
5011 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
5012 the name of vectorizer generated variables, and appends that to NAME if
5013 provided. */
5015 tree
5016 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5018 const char *prefix;
5019 tree new_vect_var;
5021 switch (var_kind)
5023 case vect_simple_var:
5024 prefix = "vect";
5025 break;
5026 case vect_scalar_var:
5027 prefix = "stmp";
5028 break;
5029 case vect_mask_var:
5030 prefix = "mask";
5031 break;
5032 case vect_pointer_var:
5033 prefix = "vectp";
5034 break;
5035 default:
5036 gcc_unreachable ();
5039 if (name)
5041 char* tmp = concat (prefix, "_", name, NULL);
5042 new_vect_var = create_tmp_reg (type, tmp);
5043 free (tmp);
5045 else
5046 new_vect_var = create_tmp_reg (type, prefix);
5048 return new_vect_var;
5051 /* Like vect_get_new_vect_var but return an SSA name. */
5053 tree
5054 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5056 const char *prefix;
5057 tree new_vect_var;
5059 switch (var_kind)
5061 case vect_simple_var:
5062 prefix = "vect";
5063 break;
5064 case vect_scalar_var:
5065 prefix = "stmp";
5066 break;
5067 case vect_pointer_var:
5068 prefix = "vectp";
5069 break;
5070 default:
5071 gcc_unreachable ();
5074 if (name)
5076 char* tmp = concat (prefix, "_", name, NULL);
5077 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5078 free (tmp);
5080 else
5081 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5083 return new_vect_var;
5086 /* Duplicate points-to info on NAME from DR_INFO. */
5088 static void
5089 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5091 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5092 /* DR_PTR_INFO is for a base SSA name, not including constant or
5093 variable offsets in the ref so its alignment info does not apply. */
5094 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5097 /* Function vect_create_addr_base_for_vector_ref.
5099 Create an expression that computes the address of the first memory location
5100 that will be accessed for a data reference.
5102 Input:
5103 STMT_INFO: The statement containing the data reference.
5104 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5105    OFFSET: Optional.  If supplied, it is added to the initial address.
5106    LOOP:    Specify relative to which loop-nest the address should be computed.
5107 For example, when the dataref is in an inner-loop nested in an
5108 outer-loop that is now being vectorized, LOOP can be either the
5109 outer-loop, or the inner-loop. The first memory location accessed
5110 by the following dataref ('in' points to short):
5112 for (i=0; i<N; i++)
5113 for (j=0; j<M; j++)
5114 s += in[i+j]
5116 is as follows:
5117 if LOOP=i_loop: &in (relative to i_loop)
5118 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5120 Output:
5121 1. Return an SSA_NAME whose value is the address of the memory location of
5122 the first vector of the data reference.
5123 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5124 these statement(s) which define the returned SSA_NAME.
5126 FORNOW: We are only handling array accesses with step 1. */
5128 tree
5129 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5130 gimple_seq *new_stmt_list,
5131 tree offset)
5133 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5134 struct data_reference *dr = dr_info->dr;
5135 const char *base_name;
5136 tree addr_base;
5137 tree dest;
5138 gimple_seq seq = NULL;
5139 tree vect_ptr_type;
5140 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5141 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5143 tree data_ref_base = unshare_expr (drb->base_address);
5144 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5145 tree init = unshare_expr (drb->init);
5147 if (loop_vinfo)
5148 base_name = get_name (data_ref_base);
5149 else
5151 base_offset = ssize_int (0);
5152 init = ssize_int (0);
5153 base_name = get_name (DR_REF (dr));
5156 /* Create base_offset */
5157 base_offset = size_binop (PLUS_EXPR,
5158 fold_convert (sizetype, base_offset),
5159 fold_convert (sizetype, init));
5161 if (offset)
5163 offset = fold_convert (sizetype, offset);
5164 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5165 base_offset, offset);
5168 /* base + base_offset */
5169 if (loop_vinfo)
5170 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5171 else
5172 addr_base = build1 (ADDR_EXPR,
5173 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5174 /* Strip zero offset components since we don't need
5175 them and they can confuse late diagnostics if
5176 we CSE them wrongly. See PR106904 for example. */
5177 unshare_expr (strip_zero_offset_components
5178 (DR_REF (dr))));
5180 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5181 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5182 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5183 gimple_seq_add_seq (new_stmt_list, seq);
5185 if (DR_PTR_INFO (dr)
5186 && TREE_CODE (addr_base) == SSA_NAME
5187 /* We should only duplicate pointer info to newly created SSA names. */
5188 && SSA_NAME_VAR (addr_base) == dest)
5190 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5191 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5194 if (dump_enabled_p ())
5195 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5197 return addr_base;
5201 /* Function vect_create_data_ref_ptr.
5203 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5204 location accessed in the loop by STMT_INFO, along with the def-use update
5205 chain to appropriately advance the pointer through the loop iterations.
5206 Also set aliasing information for the pointer. This pointer is used by
5207 the callers to this function to create a memory reference expression for
5208 vector load/store access.
5210 Input:
5211 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5212 GIMPLE_ASSIGN <name, data-ref> or
5213 GIMPLE_ASSIGN <data-ref, name>.
5214 2. AGGR_TYPE: the type of the reference, which should be either a vector
5215 or an array.
5216 3. AT_LOOP: the loop where the vector memref is to be created.
5217 4. OFFSET (optional): a byte offset to be added to the initial address
5218 accessed by the data-ref in STMT_INFO.
5219 5. BSI: location where the new stmts are to be placed if there is no loop
5220 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
5221 pointing to the initial address.
5222 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5223 to the IV during each iteration of the loop. NULL says to move
5224 by one copy of AGGR_TYPE up or down, depending on the step of the
5225 data reference.
5227 Output:
5228 1. Declare a new ptr to vector_type, and have it point to the base of the
5229 data reference (the initial address accessed by the data reference).
5230 For example, for vector of type V8HI, the following code is generated:
5232 v8hi *ap;
5233 ap = (v8hi *)initial_address;
5235 if OFFSET is not supplied:
5236 initial_address = &a[init];
5237 if OFFSET is supplied:
5238 initial_address = &a[init] + OFFSET;
5242 Return the initial_address in INITIAL_ADDRESS.
5244 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5245 update the pointer in each iteration of the loop.
5247 Return the increment stmt that updates the pointer in PTR_INCR.
5249 3. Return the pointer. */
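/* For illustration, a minimal scalar sketch of the code the caller ends up
   with (assumptions: unit-stride access, V8HI vectors, no masking):

     typedef short v8hi __attribute__ ((vector_size (16)));

     v8hi *ap = (v8hi *) initial_address;   // (2): set up in the preheader
     for (int i = 0; i < n; i += 8)
       {
         v8hi tmp = *ap;                    // vector access built by the caller
         // ... use tmp ...
         ap = ap + 1;                       // (3)/(4): the PTR_INCR update, +16 bytes
       }
 */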
5251 tree
5252 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5253 tree aggr_type, class loop *at_loop, tree offset,
5254 tree *initial_address, gimple_stmt_iterator *gsi,
5255 gimple **ptr_incr, bool only_init,
5256 tree iv_step)
5258 const char *base_name;
5259 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5260 class loop *loop = NULL;
5261 bool nested_in_vect_loop = false;
5262 class loop *containing_loop = NULL;
5263 tree aggr_ptr_type;
5264 tree aggr_ptr;
5265 tree new_temp;
5266 gimple_seq new_stmt_list = NULL;
5267 edge pe = NULL;
5268 basic_block new_bb;
5269 tree aggr_ptr_init;
5270 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5271 struct data_reference *dr = dr_info->dr;
5272 tree aptr;
5273 gimple_stmt_iterator incr_gsi;
5274 bool insert_after;
5275 tree indx_before_incr, indx_after_incr;
5276 gimple *incr;
5277 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5279 gcc_assert (iv_step != NULL_TREE
5280 || TREE_CODE (aggr_type) == ARRAY_TYPE
5281 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5283 if (loop_vinfo)
5285 loop = LOOP_VINFO_LOOP (loop_vinfo);
5286 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5287 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5288 pe = loop_preheader_edge (loop);
5290 else
5292 gcc_assert (bb_vinfo);
5293 only_init = true;
5294 *ptr_incr = NULL;
5297 /* Create an expression for the first address accessed by this load
5298 in LOOP. */
5299 base_name = get_name (DR_BASE_ADDRESS (dr));
5301 if (dump_enabled_p ())
5303 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5304 dump_printf_loc (MSG_NOTE, vect_location,
5305 "create %s-pointer variable to type: %T",
5306 get_tree_code_name (TREE_CODE (aggr_type)),
5307 aggr_type);
5308 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5309 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5310 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5311 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5312 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5313 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5314 else
5315 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5316 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5319 /* (1) Create the new aggregate-pointer variable.
5320 Vector and array types inherit the alias set of their component
5321 type by default, so we need to use a ref-all pointer if the data
5322 reference does not conflict with the created aggregate data
5323 reference because it is not addressable. */
5324 bool need_ref_all = false;
5325 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5326 get_alias_set (DR_REF (dr))))
5327 need_ref_all = true;
5328 /* Likewise for any of the data references in the stmt group. */
5329 else if (DR_GROUP_SIZE (stmt_info) > 1)
5331 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5334 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5335 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5336 get_alias_set (DR_REF (sdr))))
5338 need_ref_all = true;
5339 break;
5341 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5343 while (sinfo);
5345 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5346 need_ref_all);
5347 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5350 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5351 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5352 def-use update cycles for the pointer: one relative to the outer-loop
5353 (LOOP), which is what steps (3) and (4) below do. The other is relative
5354 to the inner-loop (which is the inner-most loop containing the dataref),
5355 and this is done by step (5) below.
5357 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5358 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5359 redundant. Steps (3),(4) create the following:
5361 vp0 = &base_addr;
5362 LOOP: vp1 = phi(vp0,vp2)
5365 vp2 = vp1 + step
5366 goto LOOP
5368 If there is an inner-loop nested in loop, then step (5) will also be
5369 applied, and an additional update in the inner-loop will be created:
5371 vp0 = &base_addr;
5372 LOOP: vp1 = phi(vp0,vp2)
5374 inner: vp3 = phi(vp1,vp4)
5375 vp4 = vp3 + inner_step
5376 if () goto inner
5378 vp2 = vp1 + step
5379 if () goto LOOP */
5381 /* (2) Calculate the initial address of the aggregate-pointer, and set
5382 the aggregate-pointer to point to it before the loop. */
5384 /* Create &(base[init_val]) + offset in the loop preheader. */
5386 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5387 stmt_info, &new_stmt_list,
5388 offset);
5389 if (new_stmt_list)
5391 if (pe)
5393 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5394 gcc_assert (!new_bb);
5396 else
5397 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5400 *initial_address = new_temp;
5401 aggr_ptr_init = new_temp;
5403 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5404 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5405 inner-loop nested in LOOP (during outer-loop vectorization). */
5407 /* No update in loop is required. */
5408 if (only_init && (!loop_vinfo || at_loop == loop))
5409 aptr = aggr_ptr_init;
5410 else
5412 /* Accesses to invariant addresses should be handled specially
5413 by the caller. */
5414 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5415 gcc_assert (!integer_zerop (step));
5417 if (iv_step == NULL_TREE)
5419 /* The step of the aggregate pointer is the type size,
5420 negated for downward accesses. */
5421 iv_step = TYPE_SIZE_UNIT (aggr_type);
5422 if (tree_int_cst_sgn (step) == -1)
5423 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5426 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5428 create_iv (aggr_ptr_init, PLUS_EXPR,
5429 fold_convert (aggr_ptr_type, iv_step),
5430 aggr_ptr, loop, &incr_gsi, insert_after,
5431 &indx_before_incr, &indx_after_incr);
5432 incr = gsi_stmt (incr_gsi);
5434 /* Copy the points-to information if it exists. */
5435 if (DR_PTR_INFO (dr))
5437 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5438 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5440 if (ptr_incr)
5441 *ptr_incr = incr;
5443 aptr = indx_before_incr;
5446 if (!nested_in_vect_loop || only_init)
5447 return aptr;
5450 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5451 nested in LOOP, if it exists. */
5453 gcc_assert (nested_in_vect_loop);
5454 if (!only_init)
5456 standard_iv_increment_position (containing_loop, &incr_gsi,
5457 &insert_after);
5458 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5459 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5460 &indx_before_incr, &indx_after_incr);
5461 incr = gsi_stmt (incr_gsi);
5463 /* Copy the points-to information if it exists. */
5464 if (DR_PTR_INFO (dr))
5466 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5467 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5469 if (ptr_incr)
5470 *ptr_incr = incr;
5472 return indx_before_incr;
5474 else
5475 gcc_unreachable ();
5479 /* Function bump_vector_ptr
5481 Increment a pointer (to a vector type) by vector-size. If requested,
5482 i.e. if PTR_INCR is given, then also connect the new increment stmt
5483 to the existing def-use update-chain of the pointer, by modifying
5484 the PTR_INCR as illustrated below:
5486 The pointer def-use update-chain before this function:
5487 DATAREF_PTR = phi (p_0, p_2)
5488 ....
5489 PTR_INCR: p_2 = DATAREF_PTR + step
5491 The pointer def-use update-chain after this function:
5492 DATAREF_PTR = phi (p_0, p_2)
5493 ....
5494 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5495 ....
5496 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5498 Input:
5499 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5500 in the loop.
5501 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5502 the loop. The increment amount across iterations is expected
5503 to be vector_size.
5504 BSI - location where the new update stmt is to be placed.
5505 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5506 BUMP - optional. The offset by which to bump the pointer. If not given,
5507 the offset is assumed to be vector_size.
5509 Output: Return NEW_DATAREF_PTR as illustrated above.
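/* For illustration, a typical call (a sketch only; the surrounding caller
   code is assumed, not shown here):

     tree second_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
                                        stmt_info, NULL_TREE);
     // With BUMP omitted the pointer advances by one vector, so a MEM_REF
     // built on second_ptr accesses the next vector of the group.
 */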
5513 tree
5514 bump_vector_ptr (vec_info *vinfo,
5515 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5516 stmt_vec_info stmt_info, tree bump)
5518 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5519 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5520 tree update = TYPE_SIZE_UNIT (vectype);
5521 gimple *incr_stmt;
5522 ssa_op_iter iter;
5523 use_operand_p use_p;
5524 tree new_dataref_ptr;
5526 if (bump)
5527 update = bump;
5529 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5530 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5531 else if (is_gimple_min_invariant (dataref_ptr))
5532 /* When possible avoid emitting a separate increment stmt that would
5533 force the addressed object to become addressable. */
5534 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5535 fold_build2 (MEM_REF,
5536 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5537 dataref_ptr,
5538 fold_convert (ptr_type_node, update)));
5539 else
5540 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5541 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5542 dataref_ptr, update);
5543 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5544 /* Fold the increment, avoiding excessively long use-def chains that
5545 would otherwise cause compile-time issues for passes run before the next
5546 forwprop pass, which would do this folding as well. */
5547 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5548 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5550 incr_stmt = gsi_stmt (fold_gsi);
5551 update_stmt (incr_stmt);
5554 /* Copy the points-to information if it exists. */
5555 if (DR_PTR_INFO (dr))
5557 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5558 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5561 if (!ptr_incr)
5562 return new_dataref_ptr;
5564 /* Update the vector-pointer's cross-iteration increment. */
5565 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5567 tree use = USE_FROM_PTR (use_p);
5569 if (use == dataref_ptr)
5570 SET_USE (use_p, new_dataref_ptr);
5571 else
5572 gcc_assert (operand_equal_p (use, update, 0));
5575 return new_dataref_ptr;
5579 /* Copy memory reference info such as base/clique from the SRC reference
5580 to the DEST MEM_REF. */
5582 void
5583 vect_copy_ref_info (tree dest, tree src)
5585 if (TREE_CODE (dest) != MEM_REF)
5586 return;
5588 tree src_base = src;
5589 while (handled_component_p (src_base))
5590 src_base = TREE_OPERAND (src_base, 0);
5591 if (TREE_CODE (src_base) != MEM_REF
5592 && TREE_CODE (src_base) != TARGET_MEM_REF)
5593 return;
5595 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5596 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5600 /* Function vect_create_destination_var.
5602 Create a new temporary of type VECTYPE. */
5604 tree
5605 vect_create_destination_var (tree scalar_dest, tree vectype)
5607 tree vec_dest;
5608 const char *name;
5609 char *new_name;
5610 tree type;
5611 enum vect_var_kind kind;
5613 kind = vectype
5614 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5615 ? vect_mask_var
5616 : vect_simple_var
5617 : vect_scalar_var;
5618 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5620 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5622 name = get_name (scalar_dest);
5623 if (name)
5624 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5625 else
5626 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5627 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5628 free (new_name);
5630 return vec_dest;
5633 /* Function vect_grouped_store_supported.
5635 Returns TRUE if interleave high and interleave low permutations
5636 are supported, and FALSE otherwise. */
5638 bool
5639 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5641 machine_mode mode = TYPE_MODE (vectype);
5643 /* vect_permute_store_chain requires the group size to be equal to 3 or
5644 be a power of two. */
5645 if (count != 3 && exact_log2 (count) == -1)
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "the size of the group of accesses"
5650 " is not a power of 2 or not eqaul to 3\n");
5651 return false;
5654 /* Check that the permutation is supported. */
5655 if (VECTOR_MODE_P (mode))
5657 unsigned int i;
5658 if (count == 3)
5660 unsigned int j0 = 0, j1 = 0, j2 = 0;
5661 unsigned int i, j;
5663 unsigned int nelt;
5664 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668 "cannot handle groups of 3 stores for"
5669 " variable-length vectors\n");
5670 return false;
5673 vec_perm_builder sel (nelt, nelt, 1);
5674 sel.quick_grow (nelt);
5675 vec_perm_indices indices;
5676 for (j = 0; j < 3; j++)
5678 int nelt0 = ((3 - j) * nelt) % 3;
5679 int nelt1 = ((3 - j) * nelt + 1) % 3;
5680 int nelt2 = ((3 - j) * nelt + 2) % 3;
5681 for (i = 0; i < nelt; i++)
5683 if (3 * i + nelt0 < nelt)
5684 sel[3 * i + nelt0] = j0++;
5685 if (3 * i + nelt1 < nelt)
5686 sel[3 * i + nelt1] = nelt + j1++;
5687 if (3 * i + nelt2 < nelt)
5688 sel[3 * i + nelt2] = 0;
5690 indices.new_vector (sel, 2, nelt);
5691 if (!can_vec_perm_const_p (mode, mode, indices))
5693 if (dump_enabled_p ())
5694 dump_printf (MSG_MISSED_OPTIMIZATION,
5695 "permutation op not supported by target.\n");
5696 return false;
5699 for (i = 0; i < nelt; i++)
5701 if (3 * i + nelt0 < nelt)
5702 sel[3 * i + nelt0] = 3 * i + nelt0;
5703 if (3 * i + nelt1 < nelt)
5704 sel[3 * i + nelt1] = 3 * i + nelt1;
5705 if (3 * i + nelt2 < nelt)
5706 sel[3 * i + nelt2] = nelt + j2++;
5708 indices.new_vector (sel, 2, nelt);
5709 if (!can_vec_perm_const_p (mode, mode, indices))
5711 if (dump_enabled_p ())
5712 dump_printf (MSG_MISSED_OPTIMIZATION,
5713 "permutation op not supported by target.\n");
5714 return false;
5717 return true;
5719 else
5721 /* If length is not equal to 3 then only a power of 2 is supported. */
5722 gcc_assert (pow2p_hwi (count));
5723 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5725 /* The encoding has 2 interleaved stepped patterns. */
5726 if (!multiple_p (nelt, 2))
5727 return false;
5728 vec_perm_builder sel (nelt, 2, 3);
5729 sel.quick_grow (6);
5730 for (i = 0; i < 3; i++)
5732 sel[i * 2] = i;
5733 sel[i * 2 + 1] = i + nelt;
5735 vec_perm_indices indices (sel, 2, nelt);
5736 if (can_vec_perm_const_p (mode, mode, indices))
5738 for (i = 0; i < 6; i++)
5739 sel[i] += exact_div (nelt, 2);
5740 indices.new_vector (sel, 2, nelt);
5741 if (can_vec_perm_const_p (mode, mode, indices))
5742 return true;
5747 if (dump_enabled_p ())
5748 dump_printf (MSG_MISSED_OPTIMIZATION,
5749 "permutation op not supported by target.\n");
5750 return false;
5753 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5754 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5756 internal_fn
5757 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5758 bool masked_p)
5760 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5761 vec_mask_len_store_lanes_optab, vectype,
5762 count))
5763 return IFN_MASK_LEN_STORE_LANES;
5764 else if (masked_p)
5766 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5767 vec_mask_store_lanes_optab, vectype,
5768 count))
5769 return IFN_MASK_STORE_LANES;
5771 else
5773 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5774 vec_store_lanes_optab, vectype, count))
5775 return IFN_STORE_LANES;
5777 return IFN_LAST;
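/* For illustration (a sketch, not actual caller code), the checks above
   prefer the most general lanes optab the target provides:

     internal_fn ifn = vect_store_lanes_supported (vectype, 4, false);
     if (ifn != IFN_LAST)
       {
         // The whole 4-vector group can be stored with one (possibly
         // masked or length-controlled) store-lanes internal function.
       }
 */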
5781 /* Function vect_permute_store_chain.
5783 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5784 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5785 the data correctly for the stores. Return the final references for stores
5786 in RESULT_CHAIN.
5788 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5789 The input is 4 vectors each containing 8 elements. We assign a number to
5790 each element; the input sequence is:
5792 1st vec: 0 1 2 3 4 5 6 7
5793 2nd vec: 8 9 10 11 12 13 14 15
5794 3rd vec: 16 17 18 19 20 21 22 23
5795 4th vec: 24 25 26 27 28 29 30 31
5797 The output sequence should be:
5799 1st vec: 0 8 16 24 1 9 17 25
5800 2nd vec: 2 10 18 26 3 11 19 27
5801 3rd vec: 4 12 20 28 5 13 21 29
5802 4th vec: 6 14 22 30 7 15 23 31
5804 i.e., we interleave the contents of the four vectors in their order.
5806 We use interleave_high/low instructions to create such output. The input of
5807 each interleave_high/low operation is two vectors:
5808 1st vec 2nd vec
5809 0 1 2 3 4 5 6 7
5810 the even elements of the result vector are obtained left-to-right from the
5811 high/low elements of the first vector. The odd elements of the result are
5812 obtained left-to-right from the high/low elements of the second vector.
5813 The output of interleave_high will be: 0 4 1 5
5814 and of interleave_low: 2 6 3 7
5817 The permutation is done in log LENGTH stages. In each stage interleave_high
5818 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5819 where the first argument is taken from the first half of DR_CHAIN and the
5820 second argument from its second half.
5821 In our example,
5823 I1: interleave_high (1st vec, 3rd vec)
5824 I2: interleave_low (1st vec, 3rd vec)
5825 I3: interleave_high (2nd vec, 4th vec)
5826 I4: interleave_low (2nd vec, 4th vec)
5828 The output for the first stage is:
5830 I1: 0 16 1 17 2 18 3 19
5831 I2: 4 20 5 21 6 22 7 23
5832 I3: 8 24 9 25 10 26 11 27
5833 I4: 12 28 13 29 14 30 15 31
5835 The output of the second stage, i.e. the final result is:
5837 I1: 0 8 16 24 1 9 17 25
5838 I2: 2 10 18 26 3 11 19 27
5839 I3: 4 12 20 28 5 13 21 29
5840 I4: 6 14 22 30 7 15 23 31. */
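/* For illustration, one interleave stage written as plain C (a sketch,
   assuming 8-element vectors represented as arrays; HIGH/LOW mirror the
   interleave_high/low operations above):

     static void
     interleave_step (const int *a, const int *b, int *high, int *low)
     {
       for (int i = 0; i < 4; i++)
         {
           high[2 * i]     = a[i];        // first half of A
           high[2 * i + 1] = b[i];        // first half of B
           low[2 * i]      = a[i + 4];    // second half of A
           low[2 * i + 1]  = b[i + 4];    // second half of B
         }
     }

   Applying this stage log2(LENGTH) times, pairing vector J with vector
   J + LENGTH/2, produces the interleaved order shown above.  */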
5842 void
5843 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5844 unsigned int length,
5845 stmt_vec_info stmt_info,
5846 gimple_stmt_iterator *gsi,
5847 vec<tree> *result_chain)
5849 tree vect1, vect2, high, low;
5850 gimple *perm_stmt;
5851 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5852 tree perm_mask_low, perm_mask_high;
5853 tree data_ref;
5854 tree perm3_mask_low, perm3_mask_high;
5855 unsigned int i, j, n, log_length = exact_log2 (length);
5857 result_chain->quick_grow (length);
5858 memcpy (result_chain->address (), dr_chain.address (),
5859 length * sizeof (tree));
5861 if (length == 3)
5863 /* vect_grouped_store_supported ensures that this is constant. */
5864 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5865 unsigned int j0 = 0, j1 = 0, j2 = 0;
5867 vec_perm_builder sel (nelt, nelt, 1);
5868 sel.quick_grow (nelt);
5869 vec_perm_indices indices;
5870 for (j = 0; j < 3; j++)
5872 int nelt0 = ((3 - j) * nelt) % 3;
5873 int nelt1 = ((3 - j) * nelt + 1) % 3;
5874 int nelt2 = ((3 - j) * nelt + 2) % 3;
5876 for (i = 0; i < nelt; i++)
5878 if (3 * i + nelt0 < nelt)
5879 sel[3 * i + nelt0] = j0++;
5880 if (3 * i + nelt1 < nelt)
5881 sel[3 * i + nelt1] = nelt + j1++;
5882 if (3 * i + nelt2 < nelt)
5883 sel[3 * i + nelt2] = 0;
5885 indices.new_vector (sel, 2, nelt);
5886 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5888 for (i = 0; i < nelt; i++)
5890 if (3 * i + nelt0 < nelt)
5891 sel[3 * i + nelt0] = 3 * i + nelt0;
5892 if (3 * i + nelt1 < nelt)
5893 sel[3 * i + nelt1] = 3 * i + nelt1;
5894 if (3 * i + nelt2 < nelt)
5895 sel[3 * i + nelt2] = nelt + j2++;
5897 indices.new_vector (sel, 2, nelt);
5898 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5900 vect1 = dr_chain[0];
5901 vect2 = dr_chain[1];
5903 /* Create interleaving stmt:
5904 low = VEC_PERM_EXPR <vect1, vect2,
5905 {j, nelt, *, j + 1, nelt + j + 1, *,
5906 j + 2, nelt + j + 2, *, ...}> */
5907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5909 vect2, perm3_mask_low);
5910 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5912 vect1 = data_ref;
5913 vect2 = dr_chain[2];
5914 /* Create interleaving stmt:
5915 low = VEC_PERM_EXPR <vect1, vect2,
5916 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5917 6, 7, nelt + j + 2, ...}> */
5918 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5919 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5920 vect2, perm3_mask_high);
5921 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5922 (*result_chain)[j] = data_ref;
5925 else
5927 /* If length is not equal to 3 then only a power of 2 is supported. */
5928 gcc_assert (pow2p_hwi (length));
5930 /* The encoding has 2 interleaved stepped patterns. */
5931 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5932 vec_perm_builder sel (nelt, 2, 3);
5933 sel.quick_grow (6);
5934 for (i = 0; i < 3; i++)
5936 sel[i * 2] = i;
5937 sel[i * 2 + 1] = i + nelt;
5939 vec_perm_indices indices (sel, 2, nelt);
5940 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5942 for (i = 0; i < 6; i++)
5943 sel[i] += exact_div (nelt, 2);
5944 indices.new_vector (sel, 2, nelt);
5945 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5947 for (i = 0, n = log_length; i < n; i++)
5949 for (j = 0; j < length/2; j++)
5951 vect1 = dr_chain[j];
5952 vect2 = dr_chain[j+length/2];
5954 /* Create interleaving stmt:
5955 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5956 ...}> */
5957 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5958 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5959 vect2, perm_mask_high);
5960 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5961 (*result_chain)[2*j] = high;
5963 /* Create interleaving stmt:
5964 low = VEC_PERM_EXPR <vect1, vect2,
5965 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5966 ...}> */
5967 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5968 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5969 vect2, perm_mask_low);
5970 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5971 (*result_chain)[2*j+1] = low;
5973 memcpy (dr_chain.address (), result_chain->address (),
5974 length * sizeof (tree));
5979 /* Function vect_setup_realignment
5981 This function is called when vectorizing an unaligned load using
5982 the dr_explicit_realign[_optimized] scheme.
5983 This function generates the following code at the loop prolog:
5985 p = initial_addr;
5986 x msq_init = *(floor(p)); # prolog load
5987 realignment_token = call target_builtin;
5988 loop:
5989 x msq = phi (msq_init, ---)
5991 The stmts marked with x are generated only for the case of
5992 dr_explicit_realign_optimized.
5994 The code above sets up a new (vector) pointer, pointing to the first
5995 location accessed by STMT_INFO, and a "floor-aligned" load using that
5996 pointer. It also generates code to compute the "realignment-token"
5997 (if the relevant target hook was defined), and creates a phi-node at the
5998 loop-header bb whose arguments are the result of the prolog-load (created
5999 by this function) and the result of a load that takes place in the loop
6000 (to be created by the caller to this function).
6002 For the case of dr_explicit_realign_optimized:
6003 The caller to this function uses the phi-result (msq) to create the
6004 realignment code inside the loop, and sets up the missing phi argument,
6005 as follows:
6006 loop:
6007 msq = phi (msq_init, lsq)
6008 lsq = *(floor(p')); # load in loop
6009 result = realign_load (msq, lsq, realignment_token);
6011 For the case of dr_explicit_realign:
6012 loop:
6013 msq = *(floor(p)); # load in loop
6014 p' = p + (VS-1);
6015 lsq = *(floor(p')); # load in loop
6016 result = realign_load (msq, lsq, realignment_token);
6018 Input:
6019 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6020 a memory location that may be unaligned.
6021 BSI - place where new code is to be inserted.
6022 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6023 is used.
6025 Output:
6026 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6027 target hook, if defined.
6028 Return value - the result of the loop-header phi node. */
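/* For illustration, the effect of realign_load in scalar terms (a sketch,
   assuming byte elements, vector size VS and misalignment M = p % VS):

     // msq = aligned load at floor (p)            ==  p - M
     // lsq = aligned load at floor (p + VS - 1)   ==  p - M + VS  (for M > 0)
     for (int i = 0; i < VS; i++)
       result[i] = (M + i < VS) ? msq[M + i] : lsq[M + i - VS];

   i.e. the desired unaligned vector is stitched together from the two
   aligned vectors that cover it, steered by the realignment token that
   encodes the misalignment.  */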
6030 tree
6031 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6032 gimple_stmt_iterator *gsi, tree *realignment_token,
6033 enum dr_alignment_support alignment_support_scheme,
6034 tree init_addr,
6035 class loop **at_loop)
6037 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6038 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6039 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6040 struct data_reference *dr = dr_info->dr;
6041 class loop *loop = NULL;
6042 edge pe = NULL;
6043 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6044 tree vec_dest;
6045 gimple *inc;
6046 tree ptr;
6047 tree data_ref;
6048 basic_block new_bb;
6049 tree msq_init = NULL_TREE;
6050 tree new_temp;
6051 gphi *phi_stmt;
6052 tree msq = NULL_TREE;
6053 gimple_seq stmts = NULL;
6054 bool compute_in_loop = false;
6055 bool nested_in_vect_loop = false;
6056 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6057 class loop *loop_for_initial_load = NULL;
6059 if (loop_vinfo)
6061 loop = LOOP_VINFO_LOOP (loop_vinfo);
6062 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6065 gcc_assert (alignment_support_scheme == dr_explicit_realign
6066 || alignment_support_scheme == dr_explicit_realign_optimized);
6068 /* We need to generate three things:
6069 1. the misalignment computation
6070 2. the extra vector load (for the optimized realignment scheme).
6071 3. the phi node for the two vectors from which the realignment is
6072 done (for the optimized realignment scheme). */
6074 /* 1. Determine where to generate the misalignment computation.
6076 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6077 calculation will be generated by this function, outside the loop (in the
6078 preheader). Otherwise, INIT_ADDR has already been computed for us by the
6079 caller, inside the loop.
6081 Background: If the misalignment remains fixed throughout the iterations of
6082 the loop, then both realignment schemes are applicable, and also the
6083 misalignment computation can be done outside LOOP. This is because we are
6084 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6085 are a multiple of VS (the Vector Size), and therefore the misalignment in
6086 different vectorized LOOP iterations is always the same.
6087 The problem arises only if the memory access is in an inner-loop nested
6088 inside LOOP, which is now being vectorized using outer-loop vectorization.
6089 This is the only case when the misalignment of the memory access may not
6090 remain fixed throughout the iterations of the inner-loop (as explained in
6091 detail in vect_supportable_dr_alignment). In this case, not only is the
6092 optimized realignment scheme not applicable, but the misalignment
6093 computation (and the generation of the realignment token that is passed to
6094 REALIGN_LOAD) also has to be done inside the loop.
6096 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6097 or not, which in turn determines if the misalignment is computed inside
6098 the inner-loop, or outside LOOP. */
6100 if (init_addr != NULL_TREE || !loop_vinfo)
6102 compute_in_loop = true;
6103 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6107 /* 2. Determine where to generate the extra vector load.
6109 For the optimized realignment scheme, instead of generating two vector
6110 loads in each iteration, we generate a single extra vector load in the
6111 preheader of the loop, and in each iteration reuse the result of the
6112 vector load from the previous iteration. In case the memory access is in
6113 an inner-loop nested inside LOOP, which is now being vectorized using
6114 outer-loop vectorization, we need to determine whether this initial vector
6115 load should be generated at the preheader of the inner-loop, or can be
6116 generated at the preheader of LOOP. If the memory access has no evolution
6117 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6118 to be generated inside LOOP (in the preheader of the inner-loop). */
6120 if (nested_in_vect_loop)
6122 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6123 bool invariant_in_outerloop =
6124 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6125 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6127 else
6128 loop_for_initial_load = loop;
6129 if (at_loop)
6130 *at_loop = loop_for_initial_load;
6132 tree vuse = NULL_TREE;
6133 if (loop_for_initial_load)
6135 pe = loop_preheader_edge (loop_for_initial_load);
6136 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6137 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6139 if (!vuse)
6140 vuse = gimple_vuse (gsi_stmt (*gsi));
6142 /* 3. For the case of the optimized realignment, create the first vector
6143 load at the loop preheader. */
6145 if (alignment_support_scheme == dr_explicit_realign_optimized)
6147 /* Create msq_init = *(floor(p1)) in the loop preheader */
6148 gassign *new_stmt;
6150 gcc_assert (!compute_in_loop);
6151 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6152 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6153 loop_for_initial_load, NULL_TREE,
6154 &init_addr, NULL, &inc, true);
6155 if (TREE_CODE (ptr) == SSA_NAME)
6156 new_temp = copy_ssa_name (ptr);
6157 else
6158 new_temp = make_ssa_name (TREE_TYPE (ptr));
6159 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6160 tree type = TREE_TYPE (ptr);
6161 new_stmt = gimple_build_assign
6162 (new_temp, BIT_AND_EXPR, ptr,
6163 fold_build2 (MINUS_EXPR, type,
6164 build_int_cst (type, 0),
6165 build_int_cst (type, align)));
6166 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6167 gcc_assert (!new_bb);
6168 data_ref
6169 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6170 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6171 vect_copy_ref_info (data_ref, DR_REF (dr));
6172 new_stmt = gimple_build_assign (vec_dest, data_ref);
6173 new_temp = make_ssa_name (vec_dest, new_stmt);
6174 gimple_assign_set_lhs (new_stmt, new_temp);
6175 gimple_set_vuse (new_stmt, vuse);
6176 if (pe)
6178 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6179 gcc_assert (!new_bb);
6181 else
6182 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6184 msq_init = gimple_assign_lhs (new_stmt);
6187 /* 4. Create realignment token using a target builtin, if available.
6188 It is done either inside the containing loop, or before LOOP (as
6189 determined above). */
6191 if (targetm.vectorize.builtin_mask_for_load)
6193 gcall *new_stmt;
6194 tree builtin_decl;
6196 /* Compute INIT_ADDR - the initial address accessed by this memref. */
6197 if (!init_addr)
6199 /* Generate the INIT_ADDR computation outside LOOP. */
6200 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6201 stmt_info, &stmts,
6202 NULL_TREE);
6203 if (loop)
6205 pe = loop_preheader_edge (loop);
6206 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6207 gcc_assert (!new_bb);
6209 else
6210 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6213 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6214 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6215 vec_dest =
6216 vect_create_destination_var (scalar_dest,
6217 gimple_call_return_type (new_stmt));
6218 new_temp = make_ssa_name (vec_dest, new_stmt);
6219 gimple_call_set_lhs (new_stmt, new_temp);
6221 if (compute_in_loop)
6222 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6223 else
6225 /* Generate the misalignment computation outside LOOP. */
6226 pe = loop_preheader_edge (loop);
6227 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6228 gcc_assert (!new_bb);
6231 *realignment_token = gimple_call_lhs (new_stmt);
6233 /* The result of the CALL_EXPR to this builtin is determined from
6234 the value of the parameter, and no global variables are touched,
6235 which makes the builtin a "const" function. Requiring the
6236 builtin to have the "const" attribute makes it unnecessary
6237 to call mark_call_clobbered. */
6238 gcc_assert (TREE_READONLY (builtin_decl));
6241 if (alignment_support_scheme == dr_explicit_realign)
6242 return msq;
6244 gcc_assert (!compute_in_loop);
6245 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6248 /* 5. Create msq = phi <msq_init, lsq> in loop */
6250 pe = loop_preheader_edge (containing_loop);
6251 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6252 msq = make_ssa_name (vec_dest);
6253 phi_stmt = create_phi_node (msq, containing_loop->header);
6254 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6256 return msq;
6260 /* Function vect_grouped_load_supported.
6262 COUNT is the size of the load group (the number of statements plus the
6263 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6264 only one statement, with a gap of COUNT - 1.
6266 Returns true if a suitable permute exists. */
6268 bool
6269 vect_grouped_load_supported (tree vectype, bool single_element_p,
6270 unsigned HOST_WIDE_INT count)
6272 machine_mode mode = TYPE_MODE (vectype);
6274 /* If this is single-element interleaving with an element distance
6275 that leaves unused vector loads around punt - we at least create
6276 very sub-optimal code in that case (and blow up memory,
6277 see PR65518). */
6278 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6280 if (dump_enabled_p ())
6281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6282 "single-element interleaving not supported "
6283 "for not adjacent vector loads\n");
6284 return false;
6287 /* vect_permute_load_chain requires the group size to be equal to 3 or
6288 be a power of two. */
6289 if (count != 3 && exact_log2 (count) == -1)
6291 if (dump_enabled_p ())
6292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6293 "the size of the group of accesses"
6294 " is not a power of 2 or not equal to 3\n");
6295 return false;
6298 /* Check that the permutation is supported. */
6299 if (VECTOR_MODE_P (mode))
6301 unsigned int i, j;
6302 if (count == 3)
6304 unsigned int nelt;
6305 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6309 "cannot handle groups of 3 loads for"
6310 " variable-length vectors\n");
6311 return false;
6314 vec_perm_builder sel (nelt, nelt, 1);
6315 sel.quick_grow (nelt);
6316 vec_perm_indices indices;
6317 unsigned int k;
6318 for (k = 0; k < 3; k++)
6320 for (i = 0; i < nelt; i++)
6321 if (3 * i + k < 2 * nelt)
6322 sel[i] = 3 * i + k;
6323 else
6324 sel[i] = 0;
6325 indices.new_vector (sel, 2, nelt);
6326 if (!can_vec_perm_const_p (mode, mode, indices))
6328 if (dump_enabled_p ())
6329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6330 "shuffle of 3 loads is not supported by"
6331 " target\n");
6332 return false;
6334 for (i = 0, j = 0; i < nelt; i++)
6335 if (3 * i + k < 2 * nelt)
6336 sel[i] = i;
6337 else
6338 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6339 indices.new_vector (sel, 2, nelt);
6340 if (!can_vec_perm_const_p (mode, mode, indices))
6342 if (dump_enabled_p ())
6343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6344 "shuffle of 3 loads is not supported by"
6345 " target\n");
6346 return false;
6349 return true;
6351 else
6353 /* If length is not equal to 3 then only a power of 2 is supported. */
6354 gcc_assert (pow2p_hwi (count));
6355 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6357 /* The encoding has a single stepped pattern. */
6358 vec_perm_builder sel (nelt, 1, 3);
6359 sel.quick_grow (3);
6360 for (i = 0; i < 3; i++)
6361 sel[i] = i * 2;
6362 vec_perm_indices indices (sel, 2, nelt);
6363 if (can_vec_perm_const_p (mode, mode, indices))
6365 for (i = 0; i < 3; i++)
6366 sel[i] = i * 2 + 1;
6367 indices.new_vector (sel, 2, nelt);
6368 if (can_vec_perm_const_p (mode, mode, indices))
6369 return true;
6374 if (dump_enabled_p ())
6375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6376 "extract even/odd not supported by target\n");
6377 return false;
6380 /* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6381 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6383 internal_fn
6384 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6385 bool masked_p)
6387 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6388 vec_mask_len_load_lanes_optab, vectype,
6389 count))
6390 return IFN_MASK_LEN_LOAD_LANES;
6391 else if (masked_p)
6393 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6394 vec_mask_load_lanes_optab, vectype,
6395 count))
6396 return IFN_MASK_LOAD_LANES;
6398 else
6400 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6401 vectype, count))
6402 return IFN_LOAD_LANES;
6404 return IFN_LAST;
6407 /* Function vect_permute_load_chain.
6409 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6410 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6411 the input data correctly. Return the final references for loads in
6412 RESULT_CHAIN.
6414 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6415 The input is 4 vectors each containing 8 elements. We assign a number to each
6416 element; the input sequence is:
6418 1st vec: 0 1 2 3 4 5 6 7
6419 2nd vec: 8 9 10 11 12 13 14 15
6420 3rd vec: 16 17 18 19 20 21 22 23
6421 4th vec: 24 25 26 27 28 29 30 31
6423 The output sequence should be:
6425 1st vec: 0 4 8 12 16 20 24 28
6426 2nd vec: 1 5 9 13 17 21 25 29
6427 3rd vec: 2 6 10 14 18 22 26 30
6428 4th vec: 3 7 11 15 19 23 27 31
6430 i.e., the first output vector should contain the first elements of each
6431 interleaving group, etc.
6433 We use extract_even/odd instructions to create such output. The input of
6434 each extract_even/odd operation is two vectors
6435 1st vec 2nd vec
6436 0 1 2 3 4 5 6 7
6438 and the output is the vector of extracted even/odd elements. The output of
6439 extract_even will be: 0 2 4 6
6440 and of extract_odd: 1 3 5 7
6443 The permutation is done in log LENGTH stages. In each stage extract_even
6444 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6445 their order. In our example,
6447 E1: extract_even (1st vec, 2nd vec)
6448 E2: extract_odd (1st vec, 2nd vec)
6449 E3: extract_even (3rd vec, 4th vec)
6450 E4: extract_odd (3rd vec, 4th vec)
6452 The output for the first stage will be:
6454 E1: 0 2 4 6 8 10 12 14
6455 E2: 1 3 5 7 9 11 13 15
6456 E3: 16 18 20 22 24 26 28 30
6457 E4: 17 19 21 23 25 27 29 31
6459 In order to proceed and create the correct sequence for the next stage (or
6460 for the correct output, if the second stage is the last one, as in our
6461 example), we first put the output of extract_even operation and then the
6462 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6463 The input for the second stage is:
6465 1st vec (E1): 0 2 4 6 8 10 12 14
6466 2nd vec (E3): 16 18 20 22 24 26 28 30
6467 3rd vec (E2): 1 3 5 7 9 11 13 15
6468 4th vec (E4): 17 19 21 23 25 27 29 31
6470 The output of the second stage:
6472 E1: 0 4 8 12 16 20 24 28
6473 E2: 2 6 10 14 18 22 26 30
6474 E3: 1 5 9 13 17 21 25 29
6475 E4: 3 7 11 15 19 23 27 31
6477 And RESULT_CHAIN after reordering:
6479 1st vec (E1): 0 4 8 12 16 20 24 28
6480 2nd vec (E3): 1 5 9 13 17 21 25 29
6481 3rd vec (E2): 2 6 10 14 18 22 26 30
6482 4th vec (E4): 3 7 11 15 19 23 27 31. */
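/* For illustration, one extract stage written as plain C (a sketch,
   assuming 8-element vectors represented as arrays):

     static void
     extract_step (const int *a, const int *b, int *even, int *odd)
     {
       for (int i = 0; i < 4; i++)
         {
           even[i]     = a[2 * i];        // even elements of A
           even[i + 4] = b[2 * i];        // even elements of B
           odd[i]      = a[2 * i + 1];    // odd elements of A
           odd[i + 4]  = b[2 * i + 1];    // odd elements of B
         }
     }

   Each stage halves the interleaving stride; after log2(LENGTH) stages the
   loads are fully de-interleaved as shown above.  */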
6484 static void
6485 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6486 unsigned int length,
6487 stmt_vec_info stmt_info,
6488 gimple_stmt_iterator *gsi,
6489 vec<tree> *result_chain)
6491 tree data_ref, first_vect, second_vect;
6492 tree perm_mask_even, perm_mask_odd;
6493 tree perm3_mask_low, perm3_mask_high;
6494 gimple *perm_stmt;
6495 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6496 unsigned int i, j, log_length = exact_log2 (length);
6498 result_chain->quick_grow (length);
6499 memcpy (result_chain->address (), dr_chain.address (),
6500 length * sizeof (tree));
6502 if (length == 3)
6504 /* vect_grouped_load_supported ensures that this is constant. */
6505 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6506 unsigned int k;
6508 vec_perm_builder sel (nelt, nelt, 1);
6509 sel.quick_grow (nelt);
6510 vec_perm_indices indices;
6511 for (k = 0; k < 3; k++)
6513 for (i = 0; i < nelt; i++)
6514 if (3 * i + k < 2 * nelt)
6515 sel[i] = 3 * i + k;
6516 else
6517 sel[i] = 0;
6518 indices.new_vector (sel, 2, nelt);
6519 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6521 for (i = 0, j = 0; i < nelt; i++)
6522 if (3 * i + k < 2 * nelt)
6523 sel[i] = i;
6524 else
6525 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6526 indices.new_vector (sel, 2, nelt);
6527 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6529 first_vect = dr_chain[0];
6530 second_vect = dr_chain[1];
6532 /* Create interleaving stmt (low part of):
6533 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6534 ...}> */
6535 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6536 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6537 second_vect, perm3_mask_low);
6538 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6540 /* Create interleaving stmt (high part of):
6541 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6542 ...}> */
6543 first_vect = data_ref;
6544 second_vect = dr_chain[2];
6545 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6546 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6547 second_vect, perm3_mask_high);
6548 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6549 (*result_chain)[k] = data_ref;
6552 else
6554 /* If length is not equal to 3 then only a power of 2 is supported. */
6555 gcc_assert (pow2p_hwi (length));
6557 /* The encoding has a single stepped pattern. */
6558 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6559 vec_perm_builder sel (nelt, 1, 3);
6560 sel.quick_grow (3);
6561 for (i = 0; i < 3; ++i)
6562 sel[i] = i * 2;
6563 vec_perm_indices indices (sel, 2, nelt);
6564 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6566 for (i = 0; i < 3; ++i)
6567 sel[i] = i * 2 + 1;
6568 indices.new_vector (sel, 2, nelt);
6569 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6571 for (i = 0; i < log_length; i++)
6573 for (j = 0; j < length; j += 2)
6575 first_vect = dr_chain[j];
6576 second_vect = dr_chain[j+1];
6578 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6579 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6580 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6581 first_vect, second_vect,
6582 perm_mask_even);
6583 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6584 (*result_chain)[j/2] = data_ref;
6586 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6587 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6588 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6589 first_vect, second_vect,
6590 perm_mask_odd);
6591 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6592 (*result_chain)[j/2+length/2] = data_ref;
6594 memcpy (dr_chain.address (), result_chain->address (),
6595 length * sizeof (tree));
6600 /* Function vect_shift_permute_load_chain.
6602 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6603 sequence of stmts to reorder the input data accordingly.
6604 Return the final references for loads in RESULT_CHAIN.
6605 Return true if successful, false otherwise.
6607 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6608 The input is 3 vectors each containing 8 elements. We assign a
6609 number to each element; the input sequence is:
6611 1st vec: 0 1 2 3 4 5 6 7
6612 2nd vec: 8 9 10 11 12 13 14 15
6613 3rd vec: 16 17 18 19 20 21 22 23
6615 The output sequence should be:
6617 1st vec: 0 3 6 9 12 15 18 21
6618 2nd vec: 1 4 7 10 13 16 19 22
6619 3rd vec: 2 5 8 11 14 17 20 23
6621 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6623 First we shuffle all 3 vectors to get correct elements order:
6625 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6626 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6627 3rd vec: (16 19 22) (17 20 23) (18 21)
6629 Next we unite and shift vector 3 times:
6631 1st step:
6632 shift right by 6 the concatenation of:
6633 "1st vec" and "2nd vec"
6634 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6635 "2nd vec" and "3rd vec"
6636 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6637 "3rd vec" and "1st vec"
6638 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6639 | New vectors |
6641 So that now new vectors are:
6643 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6644 2nd vec: (10 13) (16 19 22) (17 20 23)
6645 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6647 2nd step:
6648 shift right by 5 the concatenation of:
6649 "1st vec" and "3rd vec"
6650 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6651 "2nd vec" and "1st vec"
6652 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6653 "3rd vec" and "2nd vec"
6654 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6655 | New vectors |
6657 So that now new vectors are:
6659 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6660 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6661 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6663 3rd step:
6664 shift right by 5 the concatenation of:
6665 "1st vec" and "1st vec"
6666 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6667 shift right by 3 the concatenation of:
6668 "2nd vec" and "2nd vec"
6669 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6670 | New vectors |
6672 So that now all vectors are READY:
6673 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6674 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6675 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6677 This algorithm is faster than the one in vect_permute_load_chain if:
6678 1. "shift of a concatenation" is faster than a general permutation.
6679 This is usually so.
6680 2. The TARGET machine can't execute vector instructions in parallel.
6681 This is because each step of the algorithm depends on the previous one.
6682 The algorithm in vect_permute_load_chain is much more parallel.
6684 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
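/* For illustration, each "shift right by S of a concatenation" above is a
   single two-input permutation; in scalar terms (a sketch, NELT-element
   vectors as arrays, 0 <= S < NELT):

     for (int i = 0; i < NELT; i++)
       result[i] = (S + i < NELT) ? a[S + i] : b[S + i - NELT];

   which is exactly the window that the shiftN_mask permutations built
   below select from the concatenated pair.  */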
6687 static bool
6688 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6689 unsigned int length,
6690 stmt_vec_info stmt_info,
6691 gimple_stmt_iterator *gsi,
6692 vec<tree> *result_chain)
6694 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6695 tree perm2_mask1, perm2_mask2, perm3_mask;
6696 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6697 gimple *perm_stmt;
6699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6700 machine_mode vmode = TYPE_MODE (vectype);
6701 unsigned int i;
6702 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6704 unsigned HOST_WIDE_INT nelt, vf;
6705 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6706 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6707 /* Not supported for variable-length vectors. */
6708 return false;
6710 vec_perm_builder sel (nelt, nelt, 1);
6711 sel.quick_grow (nelt);
6713 result_chain->quick_grow (length);
6714 memcpy (result_chain->address (), dr_chain.address (),
6715 length * sizeof (tree));
6717 if (pow2p_hwi (length) && vf > 4)
6719 unsigned int j, log_length = exact_log2 (length);
6720 for (i = 0; i < nelt / 2; ++i)
6721 sel[i] = i * 2;
6722 for (i = 0; i < nelt / 2; ++i)
6723 sel[nelt / 2 + i] = i * 2 + 1;
6724 vec_perm_indices indices (sel, 2, nelt);
6725 if (!can_vec_perm_const_p (vmode, vmode, indices))
6727 if (dump_enabled_p ())
6728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6729 "shuffle of 2 fields structure is not \
6730 supported by target\n");
6731 return false;
6733 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6735 for (i = 0; i < nelt / 2; ++i)
6736 sel[i] = i * 2 + 1;
6737 for (i = 0; i < nelt / 2; ++i)
6738 sel[nelt / 2 + i] = i * 2;
6739 indices.new_vector (sel, 2, nelt);
6740 if (!can_vec_perm_const_p (vmode, vmode, indices))
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "shuffle of 2 fields structure is not \
6745 supported by target\n");
6746 return false;
6748 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6750 /* Generating permutation constant to shift all elements.
6751 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6752 for (i = 0; i < nelt; i++)
6753 sel[i] = nelt / 2 + i;
6754 indices.new_vector (sel, 2, nelt);
6755 if (!can_vec_perm_const_p (vmode, vmode, indices))
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 "shift permutation is not supported by target\n");
6760 return false;
6762 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6764 /* Generating permutation constant to select vector from 2.
6765 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6766 for (i = 0; i < nelt / 2; i++)
6767 sel[i] = i;
6768 for (i = nelt / 2; i < nelt; i++)
6769 sel[i] = nelt + i;
6770 indices.new_vector (sel, 2, nelt);
6771 if (!can_vec_perm_const_p (vmode, vmode, indices))
6773 if (dump_enabled_p ())
6774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6775 "select is not supported by target\n");
6776 return false;
6778 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6780 for (i = 0; i < log_length; i++)
6782 for (j = 0; j < length; j += 2)
6784 first_vect = dr_chain[j];
6785 second_vect = dr_chain[j + 1];
6787 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6788 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6789 first_vect, first_vect,
6790 perm2_mask1);
6791 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6792 vect[0] = data_ref;
6794 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6795 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6796 second_vect, second_vect,
6797 perm2_mask2);
6798 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6799 vect[1] = data_ref;
6801 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6802 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6803 vect[0], vect[1], shift1_mask);
6804 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6805 (*result_chain)[j/2 + length/2] = data_ref;
6807 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6808 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6809 vect[0], vect[1], select_mask);
6810 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6811 (*result_chain)[j/2] = data_ref;
6813 memcpy (dr_chain.address (), result_chain->address (),
6814 length * sizeof (tree));
6816 return true;
6818 if (length == 3 && vf > 2)
6820 unsigned int k = 0, l = 0;
6822 /* Generating permutation constant to get all elements in right order.
6823 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6824 for (i = 0; i < nelt; i++)
6826 if (3 * k + (l % 3) >= nelt)
6828 k = 0;
6829 l += (3 - (nelt % 3));
6831 sel[i] = 3 * k + (l % 3);
6832 k++;
6834 vec_perm_indices indices (sel, 2, nelt);
6835 if (!can_vec_perm_const_p (vmode, vmode, indices))
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6839 "shuffle of 3 fields structure is not \
6840 supported by target\n");
6841 return false;
6843 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6845 /* Generating permutation constant to shift all elements.
6846 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6847 for (i = 0; i < nelt; i++)
6848 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6849 indices.new_vector (sel, 2, nelt);
6850 if (!can_vec_perm_const_p (vmode, vmode, indices))
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "shift permutation is not supported by target\n");
6855 return false;
6857 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6859 /* Generating permutation constant to shift all elements.
6860 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6861 for (i = 0; i < nelt; i++)
6862 sel[i] = 2 * (nelt / 3) + 1 + i;
6863 indices.new_vector (sel, 2, nelt);
6864 if (!can_vec_perm_const_p (vmode, vmode, indices))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "shift permutation is not supported by target\n");
6869 return false;
6871 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6873 /* Generating permutation constant to shift all elements.
6874 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6875 for (i = 0; i < nelt; i++)
6876 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6877 indices.new_vector (sel, 2, nelt);
6878 if (!can_vec_perm_const_p (vmode, vmode, indices))
6880 if (dump_enabled_p ())
6881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882 "shift permutation is not supported by target\n");
6883 return false;
6885 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6887 /* Generating permutation constant to shift all elements.
6888 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6889 for (i = 0; i < nelt; i++)
6890 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6891 indices.new_vector (sel, 2, nelt);
6892 if (!can_vec_perm_const_p (vmode, vmode, indices))
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "shift permutation is not supported by target\n");
6897 return false;
6899 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
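/* For nelt == 8 the masks built above are perm3 = {0 3 6 1 4 7 2 5},
   shift1 = {6 ... 13}, shift2 = {5 ... 12}, shift3 = {3 ... 10} and
   shift4 = {5 ... 12}.  Roughly, the loops below first shuffle each input
   vector in isolation with perm3_mask, then merge neighbouring shuffled
   vectors with shift1_mask and shift2_mask, and finally rotate two of the
   intermediate results with shift3_mask and shift4_mask to obtain the
   three output vectors recorded in RESULT_CHAIN.  */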
6901 for (k = 0; k < 3; k++)
6903 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6904 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6905 dr_chain[k], dr_chain[k],
6906 perm3_mask);
6907 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6908 vect[k] = data_ref;
6911 for (k = 0; k < 3; k++)
6913 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6914 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6915 vect[k % 3], vect[(k + 1) % 3],
6916 shift1_mask);
6917 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6918 vect_shift[k] = data_ref;
6921 for (k = 0; k < 3; k++)
6923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6925 vect_shift[(4 - k) % 3],
6926 vect_shift[(3 - k) % 3],
6927 shift2_mask);
6928 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6929 vect[k] = data_ref;
6932 (*result_chain)[3 - (nelt % 3)] = vect[2];
6934 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6935 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6936 vect[0], shift3_mask);
6937 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6938 (*result_chain)[nelt % 3] = data_ref;
6940 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6941 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6942 vect[1], shift4_mask);
6943 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6944 (*result_chain)[0] = data_ref;
6945 return true;
6947 return false;
6950 /* Function vect_transform_grouped_load.
6952 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6953 to perform their permutation and assign the resulting vectorized statements to
6954 the scalar statements.
6957 void
6958 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6959 vec<tree> dr_chain,
6960 int size, gimple_stmt_iterator *gsi)
6962 machine_mode mode;
6963 vec<tree> result_chain = vNULL;
6965 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6966 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6967 permuted vectors, which are ready for vector computation. */
6968 result_chain.create (size);
6970 /* If the reassociation width for the vector type is 2 or greater, the target
6971 machine can execute 2 or more vector instructions in parallel. Otherwise
6972 try to get the chain for the loads group using vect_shift_permute_load_chain. */
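/* Because of the short-circuit evaluation below,
   vect_shift_permute_load_chain is only attempted when the reassociation
   width for VEC_PERM_EXPR is not greater than 1 and SIZE is not a power
   of two; if it succeeds it has already filled RESULT_CHAIN and
   vect_permute_load_chain is skipped, otherwise we fall back to the
   generic permutation chain.  */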
6973 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6974 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6975 || pow2p_hwi (size)
6976 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6977 gsi, &result_chain))
6978 vect_permute_load_chain (vinfo, dr_chain,
6979 size, stmt_info, gsi, &result_chain);
6980 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6981 result_chain.release ();
6984 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6985 generated as part of the vectorization of STMT_INFO. Assign the statement
6986 for each vector to the associated scalar statement. */
6988 void
6989 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6990 vec<tree> result_chain)
6992 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6993 unsigned int i, gap_count;
6994 tree tmp_data_ref;
6996 /* Record each permuted data-ref in STMT_VINFO_VEC_STMTS of its scalar stmt.
6997 Since we scan the chain starting from its first node, their order
6998 corresponds to the order of data-refs in RESULT_CHAIN. */
6999 stmt_vec_info next_stmt_info = first_stmt_info;
7000 gap_count = 1;
7001 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7003 if (!next_stmt_info)
7004 break;
7006 /* Skip the gaps. Loads created for the gaps will be removed later by the
7007 dead code elimination pass. No need to check for the first stmt in
7008 the group, since it always exists.
7009 DR_GROUP_GAP is the number of steps in elements from the previous
7010 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7011 correspond to the gaps. */
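/* For instance, assuming a group that accesses a[0] and a[2] (so that
   DR_GROUP_GAP of the second statement is 2): the first chain entry is
   recorded for a[0]'s statement, the next entry is skipped here because
   gap_count (1) is smaller than 2, and the entry after that is recorded
   for a[2]'s statement.  */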
7012 if (next_stmt_info != first_stmt_info
7013 && gap_count < DR_GROUP_GAP (next_stmt_info))
7015 gap_count++;
7016 continue;
7019 /* ??? The following needs cleanup after the removal of
7020 DR_GROUP_SAME_DR_STMT. */
7021 if (next_stmt_info)
7023 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7024 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7025 copies, and we put the new vector statement last. */
7026 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7028 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7029 gap_count = 1;
7034 /* Function vect_can_force_dr_alignment_p.
7036 Returns whether the alignment of DECL can be forced so that it is
7037 aligned on an ALIGNMENT-bit boundary. */
7039 bool
7040 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7042 if (!VAR_P (decl))
7043 return false;
7045 if (decl_in_symtab_p (decl)
7046 && !symtab_node::get (decl)->can_increase_alignment_p ())
7047 return false;
7049 if (TREE_STATIC (decl))
7050 return (known_le (alignment,
7051 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7052 else
7053 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7056 /* Return whether the data reference DR_INFO is supported with respect to its
7057 alignment.
7058 VECTYPE is the vector type used for the access and MISALIGNMENT is its known
7059 misalignment, or DR_MISALIGNMENT_UNKNOWN when the misalignment cannot be
7060 determined at compile time. */
7062 enum dr_alignment_support
7063 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7064 tree vectype, int misalignment)
7066 data_reference *dr = dr_info->dr;
7067 stmt_vec_info stmt_info = dr_info->stmt;
7068 machine_mode mode = TYPE_MODE (vectype);
7069 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7070 class loop *vect_loop = NULL;
7071 bool nested_in_vect_loop = false;
7073 if (misalignment == 0)
7074 return dr_aligned;
7076 /* For now assume all conditional loads/stores support unaligned
7077 access without any special code. */
7078 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7079 if (gimple_call_internal_p (stmt)
7080 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7081 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7082 return dr_unaligned_supported;
7084 if (loop_vinfo)
7086 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7087 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7090 /* Possibly unaligned access. */
7092 /* We can choose between using the implicit realignment scheme (generating
7093 a misaligned_move stmt) and the explicit realignment scheme (generating
7094 aligned loads with a REALIGN_LOAD). There are two variants to the
7095 explicit realignment scheme: optimized, and unoptimized.
7096 We can optimize the realignment only if the step between consecutive
7097 vector loads is equal to the vector size. Since the vector memory
7098 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7099 is guaranteed that the misalignment amount remains the same throughout the
7100 execution of the vectorized loop. Therefore, we can create the
7101 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7102 at the loop preheader.
7104 However, in the case of outer-loop vectorization, when vectorizing a
7105 memory access in the inner-loop nested within the LOOP that is now being
7106 vectorized, while it is guaranteed that the misalignment of the
7107 vectorized memory access will remain the same in different outer-loop
7108 iterations, it is *not* guaranteed that it will remain the same throughout
7109 the execution of the inner-loop. This is because the inner-loop advances
7110 with the original scalar step (and not in steps of VS). If the inner-loop
7111 step happens to be a multiple of VS, then the misalignment remains fixed
7112 and we can use the optimized realignment scheme. For example:
7114 for (i=0; i<N; i++)
7115 for (j=0; j<M; j++)
7116 s += a[i+j];
7118 When vectorizing the i-loop in the above example, the step between
7119 consecutive vector loads is 1, and so the misalignment does not remain
7120 fixed across the execution of the inner-loop, and the realignment cannot
7121 be optimized (as illustrated in the following pseudo vectorized loop):
7123 for (i=0; i<N; i+=4)
7124 for (j=0; j<M; j++){
7125 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7126 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7127 // (assuming that we start from an aligned address).
7130 We therefore have to use the unoptimized realignment scheme:
7132 for (i=0; i<N; i+=4)
7133 for (j=k; j<M; j+=4)
7134 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7135 // that the misalignment of the initial address is
7136 // 0).
7138 The loop can then be vectorized as follows:
7140 for (k=0; k<4; k++){
7141 rt = get_realignment_token (&vp[k]);
7142 for (i=0; i<N; i+=4){
7143 v1 = vp[i+k];
7144 for (j=k; j<M; j+=4){
7145 v2 = vp[i+j+VS-1];
7146 va = REALIGN_LOAD <v1,v2,rt>;
7147 vs += va;
7148 v1 = v2;
7151 } */
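/* In short, for a misaligned read on a target that provides
   vec_realign_load (and a usable mask-for-load builtin), the code below
   roughly chooses dr_explicit_realign_optimized when the misalignment can
   be kept invariant across the vectorized loop and dr_explicit_realign
   otherwise; in the SLP case noted below, and in all remaining cases, it
   instead falls through and asks the target whether a plain misaligned
   access is supported.  */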
7153 if (DR_IS_READ (dr))
7155 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7156 && (!targetm.vectorize.builtin_mask_for_load
7157 || targetm.vectorize.builtin_mask_for_load ()))
7159 /* If we are doing SLP then the accesses need not have the
7160 same alignment; instead, it depends on the SLP group size. */
7161 if (loop_vinfo
7162 && STMT_SLP_TYPE (stmt_info)
7163 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7164 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7165 * (DR_GROUP_SIZE
7166 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7167 TYPE_VECTOR_SUBPARTS (vectype))))
7168 ;
7169 else if (!loop_vinfo
7170 || (nested_in_vect_loop
7171 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7172 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7173 return dr_explicit_realign;
7174 else
7175 return dr_explicit_realign_optimized;
7179 bool is_packed = false;
7180 tree type = TREE_TYPE (DR_REF (dr));
7181 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7182 is_packed = not_size_aligned (DR_REF (dr));
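/* Finally defer to the target: the support_vector_misalignment hook decides
   whether an access of this mode and type with the given (possibly unknown)
   misalignment is supported; IS_PACKED is set when the reference is not
   known to be aligned to at least its own size.  */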
7183 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7184 is_packed))
7185 return dr_unaligned_supported;
7187 /* Unsupported. */
7188 return dr_unaligned_unsupported;