gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_MEMORY
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "predict.h"
32 #include "memmodel.h"
33 #include "tm_p.h"
34 #include "ssa.h"
35 #include "optabs-tree.h"
36 #include "cgraph.h"
37 #include "dumpfile.h"
38 #include "alias.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
41 #include "tree-eh.h"
42 #include "gimplify.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "expr.h"
52 #include "builtins.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 #include "gimple-fold.h"
59 /* Return true if load- or store-lanes optab OPTAB is implemented for
60 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 static bool
63 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
64 tree vectype, unsigned HOST_WIDE_INT count)
66 machine_mode mode, array_mode;
67 bool limit_p;
69 mode = TYPE_MODE (vectype);
70 if (!targetm.array_mode (mode, count).exists (&array_mode))
72 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
73 limit_p = !targetm.array_mode_supported_p (mode, count);
74 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
76 if (dump_enabled_p ())
77 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78 "no array mode for %s[%wu]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
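/* Illustrative example (a minimal sketch, not part of this file): a loop
   with interleaved accesses of the kind the load/store-lanes optabs are
   queried for above, e.g. implemented with LD2/ST2 on AArch64 when the
   target provides vec_load_lanes / vec_store_lanes:

     void
     deinterleave (float *restrict re, float *restrict im,
                   const float *restrict c, int n)
     {
       for (int i = 0; i < n; i++)
         {
           re[i] = c[2 * i];
           im[i] = c[2 * i + 1];
         }
     }

   Whether such a lowering is available is exactly what the checks above
   verify.  */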
101 /* Helper function to identify a simd clone call. If this is a call to a
102 function with simd clones then return the corresponding cgraph_node,
103 otherwise return NULL. */
105 static cgraph_node*
106 simd_clone_call_p (gimple *stmt)
108 gcall *call = dyn_cast <gcall *> (stmt);
109 if (!call)
110 return NULL;
112 tree fndecl = NULL_TREE;
113 if (gimple_call_internal_p (call, IFN_MASK_CALL))
114 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
115 else
116 fndecl = gimple_call_fndecl (stmt);
118 if (fndecl == NULL_TREE)
119 return NULL;
121 cgraph_node *node = cgraph_node::get (fndecl);
122 if (node && node->simd_clones != NULL)
123 return node;
125 return NULL;
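/* Illustrative example (a sketch, not part of this file): simd clones are
   created for functions declared with '#pragma omp declare simd', e.g.

     #pragma omp declare simd
     float f (float x);

     void
     apply (float *a, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = f (a[i]);
     }

   The call to f inside the loop is the kind of statement the helper above
   recognizes; when the call is conditional it may instead appear wrapped
   in an IFN_MASK_CALL internal call.  */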
130 /* Return the smallest scalar part of STMT_INFO.
131 This is used to determine the vectype of the stmt. We generally set the
132 vectype according to the type of the result (lhs). For stmts whose
133 result-type is different than the type of the arguments (e.g., demotion,
134 promotion), vectype will be reset appropriately (later). Note that we have
135 to visit the smallest datatype in this function, because that determines the
136 VF. If the smallest datatype in the loop is present only as the rhs of a
137 promotion operation - we'd miss it.
138 Such a case, where a variable of this datatype does not appear in the lhs
139 anywhere in the loop, can only occur if it's an invariant: e.g.:
140 'int_x = (int) short_inv', which we'd expect to have been optimized away by
141 invariant motion. However, we cannot rely on invariant motion to always
142 take invariants out of the loop, and so in the case of promotion we also
143 have to check the rhs.
144 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
145 types. */
147 tree
148 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
150 HOST_WIDE_INT lhs, rhs;
152 /* During the analysis phase, this function is called on arbitrary
153 statements that might not have scalar results. */
154 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
155 return scalar_type;
157 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
159 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
160 if (assign)
162 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
163 if (gimple_assign_cast_p (assign)
164 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
166 || gimple_assign_rhs_code (assign) == SAD_EXPR
167 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
168 || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
169 || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
170 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
171 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
173 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
175 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
176 if (rhs < lhs)
177 scalar_type = rhs_type;
180 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
182 auto clone = node->simd_clones->simdclone;
183 for (unsigned int i = 0; i < clone->nargs; ++i)
185 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
187 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
188 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
189 if (rhs < lhs)
191 scalar_type = arg_scalar_type;
192 lhs = rhs;
197 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
199 unsigned int i = 0;
200 if (gimple_call_internal_p (call))
202 internal_fn ifn = gimple_call_internal_fn (call);
203 if (internal_load_fn_p (ifn))
204 /* For loads the LHS type does the trick. */
205 i = ~0U;
206 else if (internal_store_fn_p (ifn))
208 /* For stores use the type of the stored value. */
209 i = internal_fn_stored_value_index (ifn);
210 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
211 i = ~0U;
213 else if (internal_fn_mask_index (ifn) == 0)
214 i = 1;
216 if (i < gimple_call_num_args (call))
218 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
219 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
221 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
222 if (rhs < lhs)
223 scalar_type = rhs_type;
228 return scalar_type;
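/* Illustrative example (a sketch, not part of this file) of the promotion
   case described in the comment above: if invariant motion did not hoist
   the conversion, the loop body of

     void
     f (int *restrict out, short s, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = (int) s;
     }

   still contains 'int_x = (int) s', where the short type occurs only on
   the rhs, so the rhs has to be inspected to find the smallest scalar
   type.  */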
232 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
233 tested at run-time. Return TRUE if DDR was successfully inserted.
234 Return false if versioning is not supported. */
236 static opt_result
237 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
241 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
242 return opt_result::failure_at (vect_location,
243 "will not create alias checks, as"
244 " --param vect-max-version-for-alias-checks"
245 " == 0\n");
247 opt_result res
248 = runtime_alias_check_p (ddr, loop,
249 optimize_loop_nest_for_speed_p (loop));
250 if (!res)
251 return res;
253 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
254 return opt_result::success ();
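/* Illustrative example (a sketch, not part of this file): without
   'restrict' the two accesses below may overlap, so vectorizing requires
   versioning the loop with the run-time overlap check recorded above:

     void
     add (float *a, const float *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] += b[i];
     }

   Compiling with '--param vect-max-version-for-alias-checks=0' disables
   the versioning and makes this analysis fail instead.  */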
257 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
259 static void
260 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
262 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
263 for (unsigned int i = 0; i < checks.length(); ++i)
264 if (checks[i] == value)
265 return;
267 if (dump_enabled_p ())
268 dump_printf_loc (MSG_NOTE, vect_location,
269 "need run-time check that %T is nonzero\n",
270 value);
271 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
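/* Illustrative example (a sketch, not part of this file): in

     void
     scale (double *a, long stride, int n)
     {
       for (int i = 0; i < n; i++)
         a[i * stride] = a[i * stride] * 2.0;
     }

   the access has a zero step when STRIDE == 0, in which case every
   iteration touches the same element; the vectorizer then records STRIDE
   here so the versioned loop can check at run time that it is nonzero.  */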
274 /* Return true if we know that the order of vectorized DR_INFO_A and
275 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
276 DR_INFO_B. At least one of the accesses is a write. */
278 static bool
279 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
281 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
282 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
284 /* Single statements are always kept in their original order. */
285 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
286 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
287 return true;
289 /* If there is a loop invariant read involved we might vectorize it in
290 the prologue, breaking scalar order with respect to the in-loop store. */
291 if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
292 || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
293 return false;
295 /* STMT_A and STMT_B belong to overlapping groups. All loads are
296 emitted at the position of the first scalar load.
297 Stores in a group are emitted at the position of the last scalar store.
298 Compute that position and check whether the resulting order matches
299 the current one. */
300 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
301 if (il_a)
303 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
304 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
305 s = DR_GROUP_NEXT_ELEMENT (s))
306 il_a = get_later_stmt (il_a, s);
307 else /* DR_IS_READ */
308 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
309 s = DR_GROUP_NEXT_ELEMENT (s))
310 if (get_later_stmt (il_a, s) == il_a)
311 il_a = s;
313 else
314 il_a = stmtinfo_a;
315 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
316 if (il_b)
318 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
319 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
320 s = DR_GROUP_NEXT_ELEMENT (s))
321 il_b = get_later_stmt (il_b, s);
322 else /* DR_IS_READ */
323 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
324 s = DR_GROUP_NEXT_ELEMENT (s))
325 if (get_later_stmt (il_b, s) == il_b)
326 il_b = s;
328 else
329 il_b = stmtinfo_b;
330 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
331 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
334 /* A subroutine of vect_analyze_data_ref_dependence. Handle
335 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
336 distances. These distances are conservatively correct but they don't
337 reflect a guaranteed dependence.
339 Return true if this function does all the work necessary to avoid
340 an alias or false if the caller should use the dependence distances
341 to limit the vectorization factor in the usual way. LOOP_DEPTH is
342 the depth of the loop described by LOOP_VINFO and the other arguments
343 are as for vect_analyze_data_ref_dependence. */
345 static bool
346 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
347 loop_vec_info loop_vinfo,
348 int loop_depth, unsigned int *max_vf)
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
353 int dist = dist_v[loop_depth];
354 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
356 /* If the user asserted safelen >= DIST consecutive iterations
357 can be executed concurrently, assume independence.
359 ??? An alternative would be to add the alias check even
360 in this case, and vectorize the fallback loop with the
361 maximum VF set to safelen. However, if the user has
362 explicitly given a length, it's less likely that that
363 would be a win. */
364 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 continue;
372 /* For dependence distances of 2 or more, we have the option
373 of limiting VF or checking for an alias at runtime.
374 Prefer to check at runtime if we can, to avoid limiting
375 the VF unnecessarily when the bases are in fact independent.
377 Note that the alias checks will be removed if the VF ends up
378 being small enough. */
379 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
380 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
381 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
382 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
383 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
386 return true;
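/* Illustrative example (a sketch, not part of this file): loop->safelen,
   used above and by the apply_safelen lambda below, is typically set from
   an OpenMP simd pragma:

     void
     shift (int *a, int n)
     {
     #pragma omp simd safelen(8)
       for (int i = 0; i < n; i++)
         a[i] = a[i + 8] + 1;
     }

   Here the user asserts that up to 8 consecutive iterations may execute
   concurrently, so a dependence distance of 8 does not force the
   vectorization factor below that bound.  */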
390 /* Function vect_analyze_data_ref_dependence.
392 FIXME: I needed to change the sense of the returned flag.
394 Return FALSE if there (might) exist a dependence between a memory-reference
395 DRA and a memory-reference DRB. When versioning for alias can check the
396 dependence at run-time, return TRUE. Adjust *MAX_VF according to
397 the data dependence. */
399 static opt_result
400 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
401 loop_vec_info loop_vinfo,
402 unsigned int *max_vf)
404 unsigned int i;
405 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
406 struct data_reference *dra = DDR_A (ddr);
407 struct data_reference *drb = DDR_B (ddr);
408 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
409 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
410 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
411 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
412 lambda_vector dist_v;
413 unsigned int loop_depth;
415 /* If user asserted safelen consecutive iterations can be
416 executed concurrently, assume independence. */
417 auto apply_safelen = [&]()
419 if (loop->safelen >= 2)
421 if ((unsigned int) loop->safelen < *max_vf)
422 *max_vf = loop->safelen;
423 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
424 return true;
426 return false;
429 /* In loop analysis all data references should be vectorizable. */
430 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
431 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
432 gcc_unreachable ();
434 /* Independent data accesses. */
435 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
436 return opt_result::success ();
438 if (dra == drb
439 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
440 return opt_result::success ();
442 /* We do not have to consider dependences between accesses that belong
443 to the same group, unless the stride could be smaller than the
444 group size. */
445 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
446 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
447 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
448 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
449 return opt_result::success ();
451 /* Even if we have an anti-dependence then, as the vectorized loop covers at
452 least two scalar iterations, there is always also a true dependence.
453 As the vectorizer does not re-order loads and stores we can ignore
454 the anti-dependence if TBAA can disambiguate both DRs similar to the
455 case with known negative distance anti-dependences (positive
456 distance anti-dependences would violate TBAA constraints). */
457 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
458 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
459 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
460 get_alias_set (DR_REF (drb))))
461 return opt_result::success ();
463 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
464 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
466 if (apply_safelen ())
467 return opt_result::success ();
469 return opt_result::failure_at
470 (stmtinfo_a->stmt,
471 "possible alias involving gather/scatter between %T and %T\n",
472 DR_REF (dra), DR_REF (drb));
475 /* Unknown data dependence. */
476 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
478 if (apply_safelen ())
479 return opt_result::success ();
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
483 "versioning for alias required: "
484 "can't determine dependence between %T and %T\n",
485 DR_REF (dra), DR_REF (drb));
487 /* Add to list of ddrs that need to be tested at run-time. */
488 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
491 /* Known data dependence. */
492 if (DDR_NUM_DIST_VECTS (ddr) == 0)
494 if (apply_safelen ())
495 return opt_result::success ();
497 if (dump_enabled_p ())
498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
499 "versioning for alias required: "
500 "bad dist vector for %T and %T\n",
501 DR_REF (dra), DR_REF (drb));
502 /* Add to list of ddrs that need to be tested at run-time. */
503 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
506 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
508 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
509 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
510 loop_depth, max_vf))
511 return opt_result::success ();
513 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
515 int dist = dist_v[loop_depth];
517 if (dump_enabled_p ())
518 dump_printf_loc (MSG_NOTE, vect_location,
519 "dependence distance = %d.\n", dist);
521 if (dist == 0)
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "dependence distance == 0 between %T and %T\n",
526 DR_REF (dra), DR_REF (drb));
528 /* When we perform grouped accesses and perform implicit CSE
529 by detecting equal accesses and doing disambiguation with
530 runtime alias tests like for
531 .. = a[i];
532 .. = a[i+1];
533 a[i] = ..;
534 a[i+1] = ..;
535 *p = ..;
536 .. = a[i];
537 .. = a[i+1];
538 where we will end up loading { a[i], a[i+1] } once, make
539 sure that inserting group loads before the first load and
540 stores after the last store will do the right thing.
541 Similar for groups like
542 a[i] = ...;
543 ... = a[i];
544 a[i+1] = ...;
545 where loads from the group interleave with the store. */
546 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
547 return opt_result::failure_at (stmtinfo_a->stmt,
548 "READ_WRITE dependence"
549 " in interleaving.\n");
551 if (loop->safelen < 2)
553 tree indicator = dr_zero_step_indicator (dra);
554 if (!indicator || integer_zerop (indicator))
555 return opt_result::failure_at (stmtinfo_a->stmt,
556 "access also has a zero step\n");
557 else if (TREE_CODE (indicator) != INTEGER_CST)
558 vect_check_nonzero_value (loop_vinfo, indicator);
560 continue;
563 if (dist > 0 && DDR_REVERSED_P (ddr))
565 /* If DDR_REVERSED_P the order of the data-refs in DDR was
566 reversed (to make distance vector positive), and the actual
567 distance is negative. */
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "dependence distance negative.\n");
571 /* When doing outer loop vectorization, we need to check if there is
572 a backward dependence at the inner loop level if the dependence
573 at the outer loop is reversed. See PR81740. */
574 if (nested_in_vect_loop_p (loop, stmtinfo_a)
575 || nested_in_vect_loop_p (loop, stmtinfo_b))
577 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
578 DDR_LOOP_NEST (ddr));
579 if (dist_v[inner_depth] < 0)
580 return opt_result::failure_at (stmtinfo_a->stmt,
581 "not vectorized, dependence "
582 "between data-refs %T and %T\n",
583 DR_REF (dra), DR_REF (drb));
585 /* Record a negative dependence distance to later limit the
586 amount of stmt copying / unrolling we can perform.
587 Only need to handle read-after-write dependence. */
588 if (DR_IS_READ (drb)
589 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
590 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
591 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
592 continue;
595 unsigned int abs_dist = abs (dist);
596 if (abs_dist >= 2 && abs_dist < *max_vf)
598 /* The dependence distance requires reduction of the maximal
599 vectorization factor. */
600 *max_vf = abs_dist;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "adjusting maximal vectorization factor to %i\n",
604 *max_vf);
607 if (abs_dist >= *max_vf)
609 /* Dependence distance does not create dependence, as far as
610 vectorization is concerned, in this case. */
611 if (dump_enabled_p ())
612 dump_printf_loc (MSG_NOTE, vect_location,
613 "dependence distance >= VF.\n");
614 continue;
617 return opt_result::failure_at (stmtinfo_a->stmt,
618 "not vectorized, possible dependence "
619 "between data-refs %T and %T\n",
620 DR_REF (dra), DR_REF (drb));
623 return opt_result::success ();
626 /* Function vect_analyze_early_break_dependences.
628 Examine all the data references in the loop and make sure that, if we have
629 multiple exits, we are able to safely move stores such that they become
630 safe for vectorization. The function also calculates the place where to move
631 the instructions to and computes what the new vUSE chain should be.
633 This works in tandem with the CFG that will be produced by
634 slpeel_tree_duplicate_loop_to_edge_cfg later on.
636 This function tries to validate whether an early break vectorization
637 is possible for the current instruction sequence. Returns True if
638 possible, otherwise False.
640 Requirements:
641 - Any memory access must be to a fixed size buffer.
642 - There must not be any loads and stores to the same object.
643 - Multiple loads are allowed as long as they don't alias.
645 NOTE:
646 This implementation is very conservative. Any overlapping loads/stores
647 that take place before the early break statement get rejected, aside from
648 WAR dependencies.
650 i.e.:
652 a[i] = 8
653 c = a[i]
654 if (b[i])
657 is not allowed, but
659 c = a[i]
660 a[i] = 8
661 if (b[i])
664 is, which is the common case. */
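/* Illustrative example (a sketch, not part of this file) of an early-break
   loop this analysis is meant to accept: the only in-loop memory accesses
   are reads from a fixed-size buffer and they happen before the break, so
   no store has to be moved past an aliasing load.

     int a[1024];

     int
     find (int needle, int n)
     {
       for (int i = 0; i < n; i++)
         if (a[i] == needle)
           return i;
       return -1;
     }

   Whether each read stays within the bound of the object is verified with
   ref_within_array_bound below.  */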
666 static opt_result
667 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
669 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
671 /* List of all load data references found during traversal. */
672 auto_vec<data_reference *> bases;
673 basic_block dest_bb = NULL;
675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
676 class loop *loop_nest = loop_outer (loop);
678 if (dump_enabled_p ())
679 dump_printf_loc (MSG_NOTE, vect_location,
680 "loop contains multiple exits, analyzing"
681 " statement dependencies.\n");
683 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
684 if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "alternate exit has been chosen as main exit.\n");
688 /* Since we don't support general control flow, the location we'll move the
689 side-effects to is always the latch connected exit. When we support
690 general control flow we can do better but for now this is fine. Move
691 side-effects to the in-loop destination of the last early exit. For the
692 PEELED case we move the side-effects to the latch block as this is
693 guaranteed to be the last block to be executed when a vector iteration
694 has finished. */
695 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
696 dest_bb = loop->latch;
697 else
698 dest_bb = single_pred (loop->latch);
700 /* We start looking from dest_bb, for the non-PEELED case we don't want to
701 move any stores already present, but we do want to read and validate the
702 loads. */
703 basic_block bb = dest_bb;
705 /* We move stores across all loads to the beginning of dest_bb, so
706 the first block processed below doesn't need dependence checking. */
707 bool check_deps = false;
711 gimple_stmt_iterator gsi = gsi_last_bb (bb);
713 /* Now analyze all the remaining statements and try to determine which
714 instructions are allowed/needed to be moved. */
715 while (!gsi_end_p (gsi))
717 gimple *stmt = gsi_stmt (gsi);
718 gsi_prev (&gsi);
719 if (is_gimple_debug (stmt))
720 continue;
722 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
723 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
724 if (!dr_ref)
725 continue;
727 /* We know everything below dest_bb is safe since we know we
728 had a full vector iteration when reaching it. Either by
729 the loop entry / IV exit test being last or because this
730 is the loop latch itself. */
731 if (!check_deps)
732 continue;
734 /* Check if vector accesses to the object will be within bounds.
735 The object size must be a constant, or we assume the loop will be versioned or niters
736 bounded by VF so accesses are within range. We only need to check
737 the reads since writes are moved to a safe place where if we get
738 there we know they are safe to perform. */
739 if (DR_IS_READ (dr_ref)
740 && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "early breaks not supported: vectorization "
745 "would %s beyond size of obj.\n",
746 DR_IS_READ (dr_ref) ? "read" : "write");
747 return opt_result::failure_at (stmt,
748 "can't safely apply code motion to "
749 "dependencies of %G to vectorize "
750 "the early exit.\n", stmt);
753 if (DR_IS_READ (dr_ref))
754 bases.safe_push (dr_ref);
755 else if (DR_IS_WRITE (dr_ref))
757 /* We are moving writes down in the CFG. To be sure that this
758 is valid after vectorization we have to check all the loads
759 we are sinking the stores past to see if any of them may
760 alias or are the same object.
762 Same objects will not be an issue because unless the store
763 is marked volatile the value can be forwarded. If the
764 store is marked volatile we don't vectorize the loop
765 anyway.
767 That leaves the check for aliasing. We don't really need
768 to care about the stores aliasing with each other since the
769 stores are moved in order so the effects are still observed
770 correctly. This leaves the check for WAR dependencies
771 which we would be introducing here if the DR can alias.
772 The check is quadratic in loads/stores but I have not found
773 a better API to do this. I believe all loads and stores
774 must be checked. We also must check them when we
775 encounter the store, since we don't care about loads past
776 the store. */
778 for (auto dr_read : bases)
779 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
781 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
783 vect_location,
784 "early breaks not supported: "
785 "overlapping loads and stores "
786 "found before the break "
787 "statement.\n");
789 return opt_result::failure_at (stmt,
790 "can't safely apply code motion to dependencies"
791 " to vectorize the early exit. %G may alias with"
792 " %G\n", stmt, dr_read->stmt);
796 if (gimple_vdef (stmt))
798 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE, vect_location,
800 "==> recording stmt %G", stmt);
802 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
804 else if (gimple_vuse (stmt))
806 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
807 if (dump_enabled_p ())
808 dump_printf_loc (MSG_NOTE, vect_location,
809 "marked statement for vUSE update: %G", stmt);
813 if (!single_pred_p (bb))
815 gcc_assert (bb == loop->header);
816 break;
819 /* If we possibly sink through a virtual PHI make sure to elide that. */
820 if (gphi *vphi = get_virtual_phi (bb))
821 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
823 /* All earlier blocks need dependence checking. */
824 check_deps = true;
825 bb = single_pred (bb);
827 while (1);
829 /* We don't allow outer -> inner loop transitions which should have been
830 trapped already during loop form analysis. */
831 gcc_assert (dest_bb->loop_father == loop);
833 /* Check that the destination block we picked has only one pred. To relax this we
834 have to take special care when moving the statements. We don't currently support
835 such control flow; however, this check is there to simplify how we handle
836 labels that may be present anywhere in the IL. This check is to ensure that the
837 labels aren't significant for the CFG. */
838 if (!single_pred (dest_bb))
839 return opt_result::failure_at (vect_location,
840 "chosen loop exit block (BB %d) does not have a "
841 "single predecessor which is currently not "
842 "supported for early break vectorization.\n",
843 dest_bb->index);
845 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
847 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
849 /* All uses shall be updated to that of the first load. Entries are
850 stored in reverse order. */
851 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
852 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
854 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location,
856 "will update use: %T, mem_ref: %G", vuse, g);
860 if (dump_enabled_p ())
861 dump_printf_loc (MSG_NOTE, vect_location,
862 "recorded statements to be moved to BB %d\n",
863 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
865 return opt_result::success ();
868 /* Function vect_analyze_data_ref_dependences.
870 Examine all the data references in the loop, and make sure there do not
871 exist any data dependences between them. Set *MAX_VF according to
872 the maximum vectorization factor the data dependences allow. */
874 opt_result
875 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
876 unsigned int *max_vf)
878 unsigned int i;
879 struct data_dependence_relation *ddr;
881 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
883 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
885 LOOP_VINFO_DDRS (loop_vinfo)
886 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
887 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
888 /* We do not need read-read dependences. */
889 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
890 &LOOP_VINFO_DDRS (loop_vinfo),
891 LOOP_VINFO_LOOP_NEST (loop_vinfo),
892 false);
893 gcc_assert (res);
896 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
898 /* For epilogues we either have no aliases or alias versioning
899 was applied to the original loop. Therefore we may just get max_vf
900 using the VF of the original loop. */
901 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
902 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
903 else
904 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
906 opt_result res
907 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
908 if (!res)
909 return res;
912 /* If we have early break statements in the loop, check to see if they
913 are of a form we can vectorize. */
914 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
915 return vect_analyze_early_break_dependences (loop_vinfo);
917 return opt_result::success ();
921 /* Function vect_slp_analyze_data_ref_dependence.
923 Return TRUE if there (might) exist a dependence between a memory-reference
924 DRA and a memory-reference DRB for VINFO. When versioning for alias
925 can check the dependence at run-time, return FALSE. */
928 static bool
929 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
930 struct data_dependence_relation *ddr)
932 struct data_reference *dra = DDR_A (ddr);
933 struct data_reference *drb = DDR_B (ddr);
934 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
935 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
937 /* We need to check dependences of statements marked as unvectorizable
938 as well; they can still prohibit vectorization. */
940 /* Independent data accesses. */
941 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
942 return false;
944 if (dra == drb)
945 return false;
947 /* Read-read is OK. */
948 if (DR_IS_READ (dra) && DR_IS_READ (drb))
949 return false;
951 /* If dra and drb are part of the same interleaving chain consider
952 them independent. */
953 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
954 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
955 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
956 return false;
958 /* Unknown data dependence. */
959 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
961 if (dump_enabled_p ())
962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
963 "can't determine dependence between %T and %T\n",
964 DR_REF (dra), DR_REF (drb));
966 else if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location,
968 "determined dependence between %T and %T\n",
969 DR_REF (dra), DR_REF (drb));
971 return true;
975 /* Analyze dependences involved in the transform of a store SLP NODE. */
977 static bool
978 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
980 /* This walks over all stmts involved in the SLP store done
981 in NODE verifying we can sink them up to the last stmt in the
982 group. */
983 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
984 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
986 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
988 stmt_vec_info access_info
989 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
990 if (access_info == last_access_info)
991 continue;
992 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
993 ao_ref ref;
994 bool ref_initialized_p = false;
995 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
996 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
998 gimple *stmt = gsi_stmt (gsi);
999 if (! gimple_vuse (stmt))
1000 continue;
1002 /* If we couldn't record a (single) data reference for this
1003 stmt we have to resort to the alias oracle. */
1004 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1005 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1006 if (!dr_b)
1008 /* We are moving a store - this means
1009 we cannot use TBAA for disambiguation. */
1010 if (!ref_initialized_p)
1011 ao_ref_init (&ref, DR_REF (dr_a));
1012 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1013 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1014 return false;
1015 continue;
1018 gcc_assert (!gimple_visited_p (stmt));
1020 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1021 dr_b, vNULL);
1022 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1023 free_dependence_relation (ddr);
1024 if (dependent)
1025 return false;
1028 return true;
1031 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1032 contain the vector of scalar stores of this instance if we are
1033 disambiguating the loads. */
1035 static bool
1036 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1037 vec<stmt_vec_info> stores,
1038 stmt_vec_info last_store_info)
1040 /* This walks over all stmts involved in the SLP load done
1041 in NODE verifying we can hoist them up to the first stmt in the
1042 group. */
1043 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1044 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1046 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1048 if (! SLP_TREE_SCALAR_STMTS (node)[k])
1049 continue;
1050 stmt_vec_info access_info
1051 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1052 if (access_info == first_access_info)
1053 continue;
1054 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1055 ao_ref ref;
1056 bool ref_initialized_p = false;
1057 hash_set<stmt_vec_info> grp_visited;
1058 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1059 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1061 gimple *stmt = gsi_stmt (gsi);
1062 if (! gimple_vdef (stmt))
1063 continue;
1065 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1067 /* If we run into a store of this same instance (we've just
1068 marked those) then delay dependence checking until we run
1069 into the last store because this is where it will have
1070 been sunk to (and we verified that we can do that already). */
1071 if (gimple_visited_p (stmt))
1073 if (stmt_info != last_store_info)
1074 continue;
1076 for (stmt_vec_info &store_info : stores)
1078 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1079 ddr_p ddr = initialize_data_dependence_relation
1080 (dr_a, store_dr, vNULL);
1081 bool dependent
1082 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1083 free_dependence_relation (ddr);
1084 if (dependent)
1085 return false;
1087 continue;
1090 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1092 /* We are hoisting a load - this means we can use TBAA for
1093 disambiguation. */
1094 if (!ref_initialized_p)
1095 ao_ref_init (&ref, DR_REF (dr_a));
1096 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1098 /* If we couldn't record a (single) data reference for this
1099 stmt we have to give up now. */
1100 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1101 if (!dr_b)
1102 return false;
1103 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1104 dr_b, vNULL);
1105 bool dependent
1106 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1107 free_dependence_relation (ddr);
1108 if (dependent)
1109 return false;
1111 /* No dependence. */
1112 return true;
1114 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1116 /* When we run into a store group we have to honor
1117 that earlier stores might be moved here. We don't
1118 know exactly which and where to since we lack a
1119 back-mapping from DR to SLP node, so assume all
1120 earlier stores are sunk here. It's enough to
1121 consider the last stmt of a group for this.
1122 ??? Both this and the fact that we disregard that
1123 the conflicting instance might be removed later
1124 is overly conservative. */
1125 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1126 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1127 store_info != NULL;
1128 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1129 if ((store_info == stmt_info
1130 || get_later_stmt (store_info, stmt_info) == stmt_info)
1131 && !check_hoist (store_info))
1132 return false;
1134 else
1136 if (!check_hoist (stmt_info))
1137 return false;
1141 return true;
1145 /* Function vect_slp_analyze_instance_dependence.
1147 Examine all the data references in SLP instance INSTANCE and make sure
1148 there do not exist any data dependences between them that would prevent
1149 the transform. Return true on success. */
1151 bool
1152 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1154 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1156 /* The stores of this instance are at the root of the SLP tree. */
1157 slp_tree store = NULL;
1158 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1159 store = SLP_INSTANCE_TREE (instance);
1161 /* Verify we can sink stores to the vectorized stmt insert location. */
1162 stmt_vec_info last_store_info = NULL;
1163 if (store)
1165 if (! vect_slp_analyze_store_dependences (vinfo, store))
1166 return false;
1168 /* Mark stores in this instance and remember the last one. */
1169 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1170 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1171 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1174 bool res = true;
1176 /* Verify we can sink loads to the vectorized stmt insert location,
1177 special-casing stores of this instance. */
1178 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1179 if (! vect_slp_analyze_load_dependences (vinfo, load,
1180 store
1181 ? SLP_TREE_SCALAR_STMTS (store)
1182 : vNULL, last_store_info))
1184 res = false;
1185 break;
1188 /* Unset the visited flag. */
1189 if (store)
1190 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1191 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1193 return res;
1196 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1197 applied. */
1200 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1202 HOST_WIDE_INT diff = 0;
1203 /* Alignment is only analyzed for the first element of a DR group,
1204 use that but adjust misalignment by the offset of the access. */
1205 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1207 dr_vec_info *first_dr
1208 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1209 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1210 INTEGER_CSTs and the first element in the group has the lowest
1211 address. */
1212 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1213 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1214 gcc_assert (diff >= 0);
1215 dr_info = first_dr;
1218 int misalign = dr_info->misalignment;
1219 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1220 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1221 return misalign;
1223 /* If the access is only aligned for a vector type with smaller alignment
1224 requirement the access has unknown misalignment. */
1225 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1226 targetm.vectorize.preferred_vector_alignment (vectype)))
1227 return DR_MISALIGNMENT_UNKNOWN;
1229 /* Apply the offset from the DR group start and the externally supplied
1230 offset which can for example result from a negative stride access. */
1231 poly_int64 misalignment = misalign + diff + offset;
1233 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1234 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1235 unsigned HOST_WIDE_INT target_alignment_c
1236 = dr_info->target_alignment.to_constant ();
1237 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1238 return DR_MISALIGNMENT_UNKNOWN;
1239 return misalign;
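/* Worked example (illustrative, not part of this file): with a constant
   target alignment of 16 bytes, a group whose first element has DR_INIT 0
   and a recorded misalignment of 8, and a group member with DR_INIT 4
   queried with OFFSET 0, the code above computes diff = 4 - 0 = 4 and
   returns (8 + 4 + 0) mod 16 = 12.  */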
1242 /* Record the base alignment guarantee given by DRB, which occurs
1243 in STMT_INFO. */
1245 static void
1246 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1247 innermost_loop_behavior *drb)
1249 bool existed;
1250 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1251 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1252 if (!existed || entry.second->base_alignment < drb->base_alignment)
1254 entry = std::make_pair (stmt_info, drb);
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_NOTE, vect_location,
1257 "recording new base alignment for %T\n"
1258 " alignment: %d\n"
1259 " misalignment: %d\n"
1260 " based on: %G",
1261 drb->base_address,
1262 drb->base_alignment,
1263 drb->base_misalignment,
1264 stmt_info->stmt);
1268 /* If the region we're going to vectorize is reached, all unconditional
1269 data references occur at least once. We can therefore pool the base
1270 alignment guarantees from each unconditional reference. Do this by
1271 going through all the data references in VINFO and checking whether
1272 the containing statement makes the reference unconditionally. If so,
1273 record the alignment of the base address in VINFO so that it can be
1274 used for all other references with the same base. */
1276 void
1277 vect_record_base_alignments (vec_info *vinfo)
1279 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1280 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1281 for (data_reference *dr : vinfo->shared->datarefs)
1283 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1284 stmt_vec_info stmt_info = dr_info->stmt;
1285 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1286 && STMT_VINFO_VECTORIZABLE (stmt_info)
1287 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1289 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1291 /* If DR is nested in the loop that is being vectorized, we can also
1292 record the alignment of the base wrt the outer loop. */
1293 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1294 vect_record_base_alignment
1295 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1300 /* Function vect_compute_data_ref_alignment
1302 Compute the misalignment of the data reference DR_INFO when vectorizing
1303 with VECTYPE.
1305 Output:
1306 1. initialized misalignment info for DR_INFO
1308 FOR NOW: No analysis is actually performed. Misalignment is calculated
1309 only for trivial cases. TODO. */
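/* Illustrative example (a sketch, not part of this file): for

     static float buf[1024];

     void
     inc (void)
     {
       for (int i = 0; i < 1024; i++)
         buf[i] += 1.0f;
     }

   the base decl BUF can be forced to the preferred vector alignment (see
   the base_misaligned handling below), so the misalignment of the access
   is known to be zero at compile time.  */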
1311 static void
1312 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1313 tree vectype)
1315 stmt_vec_info stmt_info = dr_info->stmt;
1316 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1317 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1318 class loop *loop = NULL;
1319 tree ref = DR_REF (dr_info->dr);
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE, vect_location,
1323 "vect_compute_data_ref_alignment:\n");
1325 if (loop_vinfo)
1326 loop = LOOP_VINFO_LOOP (loop_vinfo);
1328 /* Initialize misalignment to unknown. */
1329 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1331 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1332 return;
1334 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1335 bool step_preserves_misalignment_p;
1337 poly_uint64 vector_alignment
1338 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1339 BITS_PER_UNIT);
1340 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1342 /* If the main loop has peeled for alignment we have no way of knowing
1343 whether the data accesses in the epilogues are aligned. We can't at
1344 compile time answer the question whether we have entered the main loop or
1345 not. Fixes PR 92351. */
1346 if (loop_vinfo)
1348 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1349 if (orig_loop_vinfo
1350 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1351 return;
1354 unsigned HOST_WIDE_INT vect_align_c;
1355 if (!vector_alignment.is_constant (&vect_align_c))
1356 return;
1358 /* No step for BB vectorization. */
1359 if (!loop)
1361 gcc_assert (integer_zerop (drb->step));
1362 step_preserves_misalignment_p = true;
1365 else
1367 /* We can only use base and misalignment information relative to
1368 an innermost loop if the misalignment stays the same throughout the
1369 execution of the loop. As above, this is the case if the stride of
1370 the dataref is evenly divisible by the alignment. */
1371 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1372 step_preserves_misalignment_p
1373 = multiple_p (drb->step_alignment * vf, vect_align_c);
1375 if (!step_preserves_misalignment_p && dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "step doesn't divide the vector alignment.\n");
1379 /* In case the dataref is in an inner-loop of the loop that is being
1380 vectorized (LOOP), we use the base and misalignment information
1381 relative to the outer-loop (LOOP). This is ok only if the
1382 misalignment stays the same throughout the execution of the
1383 inner-loop, which is why we have to check that the stride of the
1384 dataref in the inner-loop is evenly divisible by the vector alignment. */
1385 if (step_preserves_misalignment_p
1386 && nested_in_vect_loop_p (loop, stmt_info))
1388 step_preserves_misalignment_p
1389 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1391 if (dump_enabled_p ())
1393 if (step_preserves_misalignment_p)
1394 dump_printf_loc (MSG_NOTE, vect_location,
1395 "inner step divides the vector alignment.\n");
1396 else
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "inner step doesn't divide the vector"
1399 " alignment.\n");
1404 unsigned int base_alignment = drb->base_alignment;
1405 unsigned int base_misalignment = drb->base_misalignment;
1407 /* Calculate the maximum of the pooled base address alignment and the
1408 alignment that we can compute for DR itself. */
1409 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1410 = base_alignments->get (drb->base_address);
1411 if (entry
1412 && base_alignment < (*entry).second->base_alignment
1413 && (loop_vinfo
1414 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1415 gimple_bb (entry->first->stmt))
1416 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1417 || (entry->first->dr_aux.group <= dr_info->group)))))
1419 base_alignment = entry->second->base_alignment;
1420 base_misalignment = entry->second->base_misalignment;
1423 if (drb->offset_alignment < vect_align_c
1424 || !step_preserves_misalignment_p
1425 /* We need to know whether the step wrt the vectorized loop is
1426 negative when computing the starting misalignment below. */
1427 || TREE_CODE (drb->step) != INTEGER_CST)
1429 if (dump_enabled_p ())
1430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1431 "Unknown alignment for access: %T\n", ref);
1432 return;
1435 if (base_alignment < vect_align_c)
1437 unsigned int max_alignment;
1438 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1439 if (max_alignment < vect_align_c
1440 || !vect_can_force_dr_alignment_p (base,
1441 vect_align_c * BITS_PER_UNIT))
1443 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_NOTE, vect_location,
1445 "can't force alignment of ref: %T\n", ref);
1446 return;
1449 /* Force the alignment of the decl.
1450 NOTE: This is the only change to the code we make during
1451 the analysis phase, before deciding to vectorize the loop. */
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_NOTE, vect_location,
1454 "force alignment of %T\n", ref);
1456 dr_info->base_decl = base;
1457 dr_info->base_misaligned = true;
1458 base_misalignment = 0;
1460 poly_int64 misalignment
1461 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1463 unsigned int const_misalignment;
1464 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1466 if (dump_enabled_p ())
1467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1468 "Non-constant misalignment for access: %T\n", ref);
1469 return;
1472 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1474 if (dump_enabled_p ())
1475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476 "misalign = %d bytes of ref %T\n",
1477 const_misalignment, ref);
1479 return;
1482 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1483 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1484 is made aligned via peeling. */
1486 static bool
1487 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1488 dr_vec_info *dr_peel_info)
1490 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1491 DR_TARGET_ALIGNMENT (dr_info)))
1493 poly_offset_int diff
1494 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1495 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1496 if (known_eq (diff, 0)
1497 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1498 return true;
1500 return false;
1503 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1504 aligned via peeling. */
1506 static bool
1507 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1508 dr_vec_info *dr_peel_info)
1510 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1511 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1512 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1513 DR_OFFSET (dr_peel_info->dr), 0)
1514 || !operand_equal_p (DR_STEP (dr_info->dr),
1515 DR_STEP (dr_peel_info->dr), 0))
1516 return false;
1518 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1521 /* Compute the value for dr_info->misalign so that the access appears
1522 aligned. This is used by peeling to compensate for dr_misalignment
1523 applying the offset for negative step. */
1526 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1528 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1529 return 0;
1531 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1532 poly_int64 misalignment
1533 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1534 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1536 unsigned HOST_WIDE_INT target_alignment_c;
1537 int misalign;
1538 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1539 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1540 return DR_MISALIGNMENT_UNKNOWN;
1541 return misalign;
1544 /* Function vect_update_misalignment_for_peel.
1545 Sets DR_INFO's misalignment
1546 - to 0 if it has the same alignment as DR_PEEL_INFO,
1547 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1548 - to -1 (unknown) otherwise.
1550 DR_INFO - the data reference whose misalignment is to be adjusted.
1551 DR_PEEL_INFO - the data reference whose misalignment is being made
1552 zero in the vector loop by the peel.
1553 NPEEL - the number of iterations in the peel loop if the misalignment
1554 of DR_PEEL_INFO is known at compile time. */
1556 static void
1557 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1558 dr_vec_info *dr_peel_info, int npeel)
1560 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1561 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1563 SET_DR_MISALIGNMENT (dr_info,
1564 vect_dr_misalign_for_aligned_access (dr_peel_info));
1565 return;
1568 unsigned HOST_WIDE_INT alignment;
1569 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1570 && known_alignment_for_access_p (dr_info,
1571 STMT_VINFO_VECTYPE (dr_info->stmt))
1572 && known_alignment_for_access_p (dr_peel_info,
1573 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1575 int misal = dr_info->misalignment;
1576 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1577 misal &= alignment - 1;
1578 set_dr_misalignment (dr_info, misal);
1579 return;
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1584 "to unknown (-1).\n");
1585 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
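/* Worked example (illustrative, not part of this file): with a constant
   target alignment of 16 bytes, DR_STEP of 4 bytes, a known misalignment
   of 8 and NPEEL = 2, the computation above gives
   (8 + 2 * 4) & (16 - 1) = 0, i.e. the access becomes aligned after the
   peel.  */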
1588 /* Return true if alignment is relevant for DR_INFO. */
1590 static bool
1591 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1593 stmt_vec_info stmt_info = dr_info->stmt;
1595 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1596 return false;
1598 /* For interleaving, only the alignment of the first access matters. */
1599 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1600 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1601 return false;
1603 /* Scatter-gather and invariant accesses continue to address individual
1604 scalars, so vector-level alignment is irrelevant. */
1605 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1606 || integer_zerop (DR_STEP (dr_info->dr)))
1607 return false;
1609 /* Strided accesses perform only component accesses, alignment is
1610 irrelevant for them. */
1611 if (STMT_VINFO_STRIDED_P (stmt_info)
1612 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1613 return false;
1615 return true;
1618 /* Given a memory reference EXP return whether its alignment is less
1619 than its size. */
1621 static bool
1622 not_size_aligned (tree exp)
1624 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1625 return true;
1627 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1628 > get_object_alignment (exp));
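/* Illustrative example (a sketch, not part of this file): for a 4-byte int
   member of a struct declared with __attribute__((packed)), TYPE_SIZE is
   32 bits while get_object_alignment may only guarantee 8 bits, so the
   reference is considered not size aligned.  */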
1631 /* Function vector_alignment_reachable_p
1633 Return true if vector alignment for DR_INFO is reachable by peeling
1634 a few loop iterations. Return false otherwise. */
1636 static bool
1637 vector_alignment_reachable_p (dr_vec_info *dr_info)
1639 stmt_vec_info stmt_info = dr_info->stmt;
1640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1642 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1644 /* For interleaved accesses we peel only if the number of iterations in
1645 the prolog loop ({VF - misalignment}) is a multiple of the
1646 number of the interleaved accesses. */
1647 int elem_size, mis_in_elements;
1649 /* FORNOW: handle only known alignment. */
1650 if (!known_alignment_for_access_p (dr_info, vectype))
1651 return false;
1653 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1654 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1655 elem_size = vector_element_size (vector_size, nelements);
1656 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1658 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1659 return false;
1662 /* If the misalignment is known at compile time then allow peeling
1663 only if natural alignment is reachable through peeling. */
1664 if (known_alignment_for_access_p (dr_info, vectype)
1665 && !aligned_access_p (dr_info, vectype))
1667 HOST_WIDE_INT elmsize =
1668 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1669 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE, vect_location,
1672 "data size = %wd. misalignment = %d.\n", elmsize,
1673 dr_misalignment (dr_info, vectype));
1675 if (dr_misalignment (dr_info, vectype) % elmsize)
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "data size does not divide the misalignment.\n");
1680 return false;
1684 if (!known_alignment_for_access_p (dr_info, vectype))
1686 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1687 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690 "Unknown misalignment, %snaturally aligned\n",
1691 is_packed ? "not " : "");
1692 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1695 return true;
1699 /* Calculate the cost of the memory access represented by DR_INFO. */
1701 static void
1702 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1703 dr_alignment_support alignment_support_scheme,
1704 int misalignment,
1705 unsigned int *inside_cost,
1706 unsigned int *outside_cost,
1707 stmt_vector_for_cost *body_cost_vec,
1708 stmt_vector_for_cost *prologue_cost_vec)
1710 stmt_vec_info stmt_info = dr_info->stmt;
1711 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1712 int ncopies;
1714 if (PURE_SLP_STMT (stmt_info))
1715 ncopies = 1;
1716 else
1717 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1719 if (DR_IS_READ (dr_info->dr))
1720 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1721 misalignment, true, inside_cost,
1722 outside_cost, prologue_cost_vec, body_cost_vec, false);
1723 else
1724 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1725 misalignment, inside_cost, body_cost_vec);
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "vect_get_data_access_cost: inside_cost = %d, "
1730 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1734 typedef struct _vect_peel_info
1736 dr_vec_info *dr_info;
1737 int npeel;
1738 unsigned int count;
1739 } *vect_peel_info;
1741 typedef struct _vect_peel_extended_info
1743 vec_info *vinfo;
1744 struct _vect_peel_info peel_info;
1745 unsigned int inside_cost;
1746 unsigned int outside_cost;
1747 } *vect_peel_extended_info;
1750 /* Peeling hashtable helpers. */
1752 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1754 static inline hashval_t hash (const _vect_peel_info *);
1755 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1758 inline hashval_t
1759 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1761 return (hashval_t) peel_info->npeel;
1764 inline bool
1765 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1767 return (a->npeel == b->npeel);
1771 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1773 static void
1774 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1775 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1776 int npeel, bool supportable_if_not_aligned)
1778 struct _vect_peel_info elem, *slot;
1779 _vect_peel_info **new_slot;
1781 elem.npeel = npeel;
1782 slot = peeling_htab->find (&elem);
1783 if (slot)
1784 slot->count++;
1785 else
1787 slot = XNEW (struct _vect_peel_info);
1788 slot->npeel = npeel;
1789 slot->dr_info = dr_info;
1790 slot->count = 1;
1791 new_slot = peeling_htab->find_slot (slot, INSERT);
1792 *new_slot = slot;
1795 /* If this DR is not supported with unknown misalignment then bias
1796 this slot when the cost model is disabled. */
1797 if (!supportable_if_not_aligned
1798 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1799 slot->count += VECT_MAX_COST;
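/* Sketch of how the table is used (illustrative, not from the original
   source): inserting npeel = 2 for three different data references
   leaves one slot with count = 3.  If one of those references cannot be
   vectorized when unaligned and the cost model is unlimited, the slot is
   additionally biased by VECT_MAX_COST so that the "most frequent"
   traversal below strongly prefers a peeling amount that aligns it.  */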
1803 /* Traverse the peeling hash table to find the peeling option that aligns
1804 the maximum number of data accesses. */
1807 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1808 _vect_peel_extended_info *max)
1810 vect_peel_info elem = *slot;
1812 if (elem->count > max->peel_info.count
1813 || (elem->count == max->peel_info.count
1814 && max->peel_info.npeel > elem->npeel))
1816 max->peel_info.npeel = elem->npeel;
1817 max->peel_info.count = elem->count;
1818 max->peel_info.dr_info = elem->dr_info;
1821 return 1;
1824 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1825 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1826 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1827 after peeling. */
1829 static void
1830 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1831 dr_vec_info *dr0_info,
1832 unsigned int *inside_cost,
1833 unsigned int *outside_cost,
1834 stmt_vector_for_cost *body_cost_vec,
1835 stmt_vector_for_cost *prologue_cost_vec,
1836 unsigned int npeel)
1838 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1840 bool dr0_alignment_known_p
1841 = (dr0_info
1842 && known_alignment_for_access_p (dr0_info,
1843 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1845 for (data_reference *dr : datarefs)
1847 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1848 if (!vect_relevant_for_alignment_p (dr_info))
1849 continue;
1851 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1852 dr_alignment_support alignment_support_scheme;
1853 int misalignment;
1854 unsigned HOST_WIDE_INT alignment;
1856 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1857 size_zero_node) < 0;
1858 poly_int64 off = 0;
1859 if (negative)
1860 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1861 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1863 if (npeel == 0)
1864 misalignment = dr_misalignment (dr_info, vectype, off);
1865 else if (dr_info == dr0_info
1866 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1867 misalignment = 0;
1868 else if (!dr0_alignment_known_p
1869 || !known_alignment_for_access_p (dr_info, vectype)
1870 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1871 misalignment = DR_MISALIGNMENT_UNKNOWN;
1872 else
1874 misalignment = dr_misalignment (dr_info, vectype, off);
1875 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1876 misalignment &= alignment - 1;
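/* Worked example (illustrative, not from the original source): another
   data reference with DR_STEP = 4, a current misalignment of 4 bytes
   and a 16-byte target alignment ends up with (4 + 3 * 4) & 15 == 0
   after peeling npeel = 3 iterations, i.e. it happens to become aligned
   as well.  */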
1878 alignment_support_scheme
1879 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1880 misalignment);
1882 vect_get_data_access_cost (loop_vinfo, dr_info,
1883 alignment_support_scheme, misalignment,
1884 inside_cost, outside_cost,
1885 body_cost_vec, prologue_cost_vec);
1889 /* Traverse peeling hash table and calculate cost for each peeling option.
1890 Find the one with the lowest cost. */
1893 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1894 _vect_peel_extended_info *min)
1896 vect_peel_info elem = *slot;
1897 int dummy;
1898 unsigned int inside_cost = 0, outside_cost = 0;
1899 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1900 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1901 epilogue_cost_vec;
1903 prologue_cost_vec.create (2);
1904 body_cost_vec.create (2);
1905 epilogue_cost_vec.create (2);
1907 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1908 &outside_cost, &body_cost_vec,
1909 &prologue_cost_vec, elem->npeel);
1911 body_cost_vec.release ();
1913 outside_cost += vect_get_known_peeling_cost
1914 (loop_vinfo, elem->npeel, &dummy,
1915 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1916 &prologue_cost_vec, &epilogue_cost_vec);
1918 /* Prologue and epilogue costs are added to the target model later.
1919 These costs depend only on the scalar iteration cost, the
1920 number of peeling iterations finally chosen, and the number of
1921 misaligned statements. So discard the information found here. */
1922 prologue_cost_vec.release ();
1923 epilogue_cost_vec.release ();
1925 if (inside_cost < min->inside_cost
1926 || (inside_cost == min->inside_cost
1927 && outside_cost < min->outside_cost))
1929 min->inside_cost = inside_cost;
1930 min->outside_cost = outside_cost;
1931 min->peel_info.dr_info = elem->dr_info;
1932 min->peel_info.npeel = elem->npeel;
1933 min->peel_info.count = elem->count;
1936 return 1;
1940 /* Choose best peeling option by traversing peeling hash table and either
1941 choosing an option with the lowest cost (if cost model is enabled) or the
1942 option that aligns as many accesses as possible. */
1944 static struct _vect_peel_extended_info
1945 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1946 loop_vec_info loop_vinfo)
1948 struct _vect_peel_extended_info res;
1950 res.peel_info.dr_info = NULL;
1951 res.vinfo = loop_vinfo;
1953 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1955 res.inside_cost = INT_MAX;
1956 res.outside_cost = INT_MAX;
1957 peeling_htab->traverse <_vect_peel_extended_info *,
1958 vect_peeling_hash_get_lowest_cost> (&res);
1960 else
1962 res.peel_info.count = 0;
1963 peeling_htab->traverse <_vect_peel_extended_info *,
1964 vect_peeling_hash_get_most_frequent> (&res);
1965 res.inside_cost = 0;
1966 res.outside_cost = 0;
1969 return res;
1972 /* Return true if peeling by NPEEL iterations is supported. */
1974 static bool
1975 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1976 unsigned npeel)
1978 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1979 enum dr_alignment_support supportable_dr_alignment;
1981 bool dr0_alignment_known_p
1982 = known_alignment_for_access_p (dr0_info,
1983 STMT_VINFO_VECTYPE (dr0_info->stmt));
1985 /* Ensure that all data refs can be vectorized after the peel. */
1986 for (data_reference *dr : datarefs)
1988 if (dr == dr0_info->dr)
1989 continue;
1991 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1992 if (!vect_relevant_for_alignment_p (dr_info)
1993 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1994 continue;
1996 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1997 int misalignment;
1998 unsigned HOST_WIDE_INT alignment;
1999 if (!dr0_alignment_known_p
2000 || !known_alignment_for_access_p (dr_info, vectype)
2001 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2002 misalignment = DR_MISALIGNMENT_UNKNOWN;
2003 else
2005 misalignment = dr_misalignment (dr_info, vectype);
2006 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2007 misalignment &= alignment - 1;
2009 supportable_dr_alignment
2010 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2011 misalignment);
2012 if (supportable_dr_alignment == dr_unaligned_unsupported)
2013 return false;
2016 return true;
2019 /* Compare two data-references DRA and DRB to group them into chunks
2020 with related alignment. */
2022 static int
2023 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2025 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2026 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2027 int cmp;
2029 /* Stabilize sort. */
2030 if (dra == drb)
2031 return 0;
2033 /* Ordering of DRs according to base. */
2034 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2035 DR_BASE_ADDRESS (drb));
2036 if (cmp != 0)
2037 return cmp;
2039 /* And according to DR_OFFSET. */
2040 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2041 if (cmp != 0)
2042 return cmp;
2044 /* And after step. */
2045 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2046 if (cmp != 0)
2047 return cmp;
2049 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2050 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2051 if (cmp == 0)
2052 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2053 return cmp;
2056 /* Function vect_enhance_data_refs_alignment
2058 This pass will use loop versioning and loop peeling in order to enhance
2059 the alignment of data references in the loop.
2061 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2062 original loop is to be vectorized. Any other loops that are created by
2063 the transformations performed in this pass are not supposed to be
2064 vectorized. This restriction will be relaxed.
2066 This pass will require a cost model to guide it in deciding whether to apply
2067 peeling or versioning or a combination of the two. For example, the scheme that
2068 Intel uses when given a loop with several memory accesses is as follows:
2069 choose one memory access ('p') whose alignment you want to force by doing
2070 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2071 other accesses are not necessarily aligned, or (2) use loop versioning to
2072 generate one loop in which all accesses are aligned, and another loop in
2073 which only 'p' is necessarily aligned.
2075 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2076 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2077 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2079 Devising a cost model is the most critical aspect of this work. It will
2080 guide us on which access to peel for, whether to use loop versioning, how
2081 many versions to create, etc. The cost model will probably consist of
2082 generic considerations as well as target specific considerations (on
2083 powerpc for example, misaligned stores are more painful than misaligned
2084 loads).
2086 Here are the general steps involved in alignment enhancements:
2088 -- original loop, before alignment analysis:
2089 for (i=0; i<N; i++){
2090 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- After vect_compute_data_refs_alignment:
2095 for (i=0; i<N; i++){
2096 x = q[i]; # DR_MISALIGNMENT(q) = 3
2097 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2100 -- Possibility 1: we do loop versioning:
2101 if (p is aligned) {
2102 for (i=0; i<N; i++){ # loop 1A
2103 x = q[i]; # DR_MISALIGNMENT(q) = 3
2104 p[i] = y; # DR_MISALIGNMENT(p) = 0
2107 else {
2108 for (i=0; i<N; i++){ # loop 1B
2109 x = q[i]; # DR_MISALIGNMENT(q) = 3
2110 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2114 -- Possibility 2: we do loop peeling:
2115 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2116 x = q[i];
2117 p[i] = y;
2119 for (i = 3; i < N; i++){ # loop 2A
2120 x = q[i]; # DR_MISALIGNMENT(q) = 0
2121 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2124 -- Possibility 3: combination of loop peeling and versioning:
2125 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2126 x = q[i];
2127 p[i] = y;
2129 if (p is aligned) {
2130 for (i = 3; i<N; i++){ # loop 3A
2131 x = q[i]; # DR_MISALIGNMENT(q) = 0
2132 p[i] = y; # DR_MISALIGNMENT(p) = 0
2135 else {
2136 for (i = 3; i<N; i++){ # loop 3B
2137 x = q[i]; # DR_MISALIGNMENT(q) = 0
2138 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2142 These loops are later passed to loop_transform to be vectorized. The
2143 vectorizer will use the alignment information to guide the transformation
2144 (whether to generate regular loads/stores, or with special handling for
2145 misalignment). */
2147 opt_result
2148 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2150 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2151 dr_vec_info *first_store = NULL;
2152 dr_vec_info *dr0_info = NULL;
2153 struct data_reference *dr;
2154 unsigned int i;
2155 bool do_peeling = false;
2156 bool do_versioning = false;
2157 unsigned int npeel = 0;
2158 bool one_misalignment_known = false;
2159 bool one_misalignment_unknown = false;
2160 bool one_dr_unsupportable = false;
2161 dr_vec_info *unsupportable_dr_info = NULL;
2162 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2163 hash_table<peel_info_hasher> peeling_htab (1);
2165 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2167 /* Reset data so we can safely be called multiple times. */
2168 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2169 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2171 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2172 return opt_result::success ();
2174 /* Sort the vector of datarefs so DRs that have the same or dependent
2175 alignment are next to each other. */
2176 auto_vec<data_reference_p> datarefs
2177 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2178 datarefs.qsort (dr_align_group_sort_cmp);
2180 /* For each dataref, compute the number of DRs that become aligned
2181 when we peel so that this dataref becomes aligned. */
2182 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2183 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2184 unsigned i0;
2185 for (i0 = 0; i0 < datarefs.length (); ++i0)
2186 if (DR_BASE_ADDRESS (datarefs[i0]))
2187 break;
2188 for (i = i0 + 1; i <= datarefs.length (); ++i)
2190 if (i == datarefs.length ()
2191 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2192 DR_BASE_ADDRESS (datarefs[i]), 0)
2193 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2194 DR_OFFSET (datarefs[i]), 0)
2195 || !operand_equal_p (DR_STEP (datarefs[i0]),
2196 DR_STEP (datarefs[i]), 0))
2198 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2199 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2200 will get known misalignment if we align one of the refs
2201 with the largest DR_TARGET_ALIGNMENT. */
2202 for (unsigned j = i0; j < i; ++j)
2204 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2205 for (unsigned k = i0; k < i; ++k)
2207 if (k == j)
2208 continue;
2209 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2210 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2211 dr_infoj))
2212 n_same_align_refs[j]++;
2215 i0 = i;
2219 /* While cost model enhancements are expected in the future, the high level
2220 view of the code at this time is as follows:
2222 A) If there is a misaligned access then see if peeling to align
2223 this access can make all data references satisfy
2224 vect_supportable_dr_alignment. If so, update data structures
2225 as needed and return true.
2227 B) If peeling wasn't possible and there is a data reference with an
2228 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2229 then see if loop versioning checks can be used to make all data
2230 references satisfy vect_supportable_dr_alignment. If so, update
2231 data structures as needed and return true.
2233 C) If neither peeling nor versioning were successful then return false if
2234 any data reference does not satisfy vect_supportable_dr_alignment.
2236 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2238 Note, Possibility 3 above (which is peeling and versioning together) is not
2239 being done at this time. */
2241 /* (1) Peeling to force alignment. */
2243 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2244 Considerations:
2245 + How many accesses will become aligned due to the peeling
2246 - How many accesses will become unaligned due to the peeling,
2247 and the cost of misaligned accesses.
2248 - The cost of peeling (the extra runtime checks, the increase
2249 in code size). */
2251 FOR_EACH_VEC_ELT (datarefs, i, dr)
2253 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2254 if (!vect_relevant_for_alignment_p (dr_info))
2255 continue;
2257 stmt_vec_info stmt_info = dr_info->stmt;
2258 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2259 do_peeling = vector_alignment_reachable_p (dr_info);
2260 if (do_peeling)
2262 if (known_alignment_for_access_p (dr_info, vectype))
2264 unsigned int npeel_tmp = 0;
2265 bool negative = tree_int_cst_compare (DR_STEP (dr),
2266 size_zero_node) < 0;
1268 /* If known_alignment_for_access_p then we have set
1269 DR_MISALIGNMENT, which is only done if we know it at compile
1270 time, so it is safe to assume the target alignment is constant. */
2272 unsigned int target_align =
2273 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2274 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2275 poly_int64 off = 0;
2276 if (negative)
2277 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2278 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2279 mis = negative ? mis : -mis;
2280 if (mis != 0)
2281 npeel_tmp = (mis & (target_align - 1)) / dr_size;
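/* Worked example (illustrative, not from the original source): for a
   forward access with dr_size = 4, target_align = 16 and a misalignment
   of 12 bytes, (-12) & 15 == 4, so npeel_tmp = 4 / 4 = 1; peeling one
   scalar iteration advances the address by 4 bytes and makes it
   16-byte aligned.  */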
2283 /* For multiple types, it is possible that the bigger type access
2284 will have more than one peeling option. E.g., a loop with two
2285 types: one of size (vector size / 4), and the other one of
2286 size (vector size / 8). The vectorization factor will be 8. If both
2287 accesses are misaligned by 3, the first one needs one scalar
2288 iteration to be aligned, and the second one needs 5. But the
2289 first one will also be aligned by peeling 5 scalar
2290 iterations, and in that case both accesses will be aligned.
2291 Hence, in addition to the immediate peeling amount, we also want
2292 to try adding a full vector size, as long as we don't exceed the
2293 vectorization factor.
2294 We do this automatically when the cost model is used, since we
2295 calculate the cost for every peeling option. */
2296 poly_uint64 nscalars = npeel_tmp;
2297 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2299 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2300 unsigned group_size = 1;
2301 if (STMT_SLP_TYPE (stmt_info)
2302 && STMT_VINFO_GROUPED_ACCESS (stmt_info))
2303 group_size = DR_GROUP_SIZE (stmt_info);
2304 nscalars = vf * group_size;
2307 /* Save info about DR in the hash table. Also include peeling
2308 amounts according to the explanation above. Indicate
2309 the alignment status when the ref is not aligned.
2310 ??? Rather than using unknown alignment here we should
2311 prune all entries from the peeling hashtable which cause
2312 DRs to be not supported. */
2313 bool supportable_if_not_aligned
2314 = vect_supportable_dr_alignment
2315 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2316 while (known_le (npeel_tmp, nscalars))
2318 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2319 dr_info, npeel_tmp,
2320 supportable_if_not_aligned);
2321 npeel_tmp += MAX (1, target_align / dr_size);
2324 one_misalignment_known = true;
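/* Illustrative sketch (not from the original source): with an immediate
   peeling amount of 1, target_align / dr_size = 16 / 4 = 4, and an
   unlimited cost model where VF = 8 and group_size = 1, the loop above
   records the candidate peeling amounts 1 and 5 in the hash table.
   With the cost model enabled, nscalars stays equal to npeel_tmp and
   only the single immediate amount is recorded.  */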
2326 else
2328 /* If we don't know any misalignment values, we prefer
2329 peeling for the data-ref that has the maximum number of data-refs
2330 with the same alignment, unless the target prefers to align
2331 stores over loads. */
2332 unsigned same_align_drs = n_same_align_refs[i];
2333 if (!dr0_info
2334 || dr0_same_align_drs < same_align_drs)
2336 dr0_same_align_drs = same_align_drs;
2337 dr0_info = dr_info;
2339 /* For data-refs with the same number of related
2340 accesses prefer the one where the misalign
2341 computation will be invariant in the outermost loop. */
2342 else if (dr0_same_align_drs == same_align_drs)
2344 class loop *ivloop0, *ivloop;
2345 ivloop0 = outermost_invariant_loop_for_expr
2346 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2347 ivloop = outermost_invariant_loop_for_expr
2348 (loop, DR_BASE_ADDRESS (dr));
2349 if ((ivloop && !ivloop0)
2350 || (ivloop && ivloop0
2351 && flow_loop_nested_p (ivloop, ivloop0)))
2352 dr0_info = dr_info;
2355 one_misalignment_unknown = true;
2357 /* Check for data refs with unsupportable alignment that
2358 can be peeled. */
2359 enum dr_alignment_support supportable_dr_alignment
2360 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2361 DR_MISALIGNMENT_UNKNOWN);
2362 if (supportable_dr_alignment == dr_unaligned_unsupported)
2364 one_dr_unsupportable = true;
2365 unsupportable_dr_info = dr_info;
2368 if (!first_store && DR_IS_WRITE (dr))
2370 first_store = dr_info;
2371 first_store_same_align_drs = same_align_drs;
2375 else
2377 if (!aligned_access_p (dr_info, vectype))
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "vector alignment may not be reachable\n");
2382 break;
2387 /* Check if we can possibly peel the loop. */
2388 if (!vect_can_advance_ivs_p (loop_vinfo)
2389 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2390 loop_preheader_edge (loop))
2391 || loop->inner
2392 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2393 do_peeling = false;
2395 struct _vect_peel_extended_info peel_for_known_alignment;
2396 struct _vect_peel_extended_info peel_for_unknown_alignment;
2397 struct _vect_peel_extended_info best_peel;
2399 peel_for_unknown_alignment.inside_cost = INT_MAX;
2400 peel_for_unknown_alignment.outside_cost = INT_MAX;
2401 peel_for_unknown_alignment.peel_info.count = 0;
2403 if (do_peeling
2404 && one_misalignment_unknown)
2406 /* Check if the target wants us to prefer stores over loads, i.e., if
2407 misaligned stores are more expensive than misaligned loads (taking
2408 drs with the same alignment into account). */
2409 unsigned int load_inside_cost = 0;
2410 unsigned int load_outside_cost = 0;
2411 unsigned int store_inside_cost = 0;
2412 unsigned int store_outside_cost = 0;
2413 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2415 stmt_vector_for_cost dummy;
2416 dummy.create (2);
2417 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2418 &load_inside_cost,
2419 &load_outside_cost,
2420 &dummy, &dummy, estimated_npeels);
2421 dummy.release ();
2423 if (first_store)
2425 dummy.create (2);
2426 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2427 &store_inside_cost,
2428 &store_outside_cost,
2429 &dummy, &dummy,
2430 estimated_npeels);
2431 dummy.release ();
2433 else
2435 store_inside_cost = INT_MAX;
2436 store_outside_cost = INT_MAX;
2439 if (load_inside_cost > store_inside_cost
2440 || (load_inside_cost == store_inside_cost
2441 && load_outside_cost > store_outside_cost))
2443 dr0_info = first_store;
2444 dr0_same_align_drs = first_store_same_align_drs;
2445 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2446 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2448 else
2450 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2451 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2454 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2455 prologue_cost_vec.create (2);
2456 epilogue_cost_vec.create (2);
2458 int dummy2;
2459 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2460 (loop_vinfo, estimated_npeels, &dummy2,
2461 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2462 &prologue_cost_vec, &epilogue_cost_vec);
2464 prologue_cost_vec.release ();
2465 epilogue_cost_vec.release ();
2467 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
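/* Illustrative sketch (not from the original source): with VF = 8 the
   costs above are estimated for peeling VF / 2 = 4 iterations.  If
   peeling for the first store is cheaper inside the loop than peeling
   for the reference chosen so far (or equally cheap inside but cheaper
   outside), dr0_info is switched to that store.  */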
2470 peel_for_unknown_alignment.peel_info.npeel = 0;
2471 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2473 best_peel = peel_for_unknown_alignment;
2475 peel_for_known_alignment.inside_cost = INT_MAX;
2476 peel_for_known_alignment.outside_cost = INT_MAX;
2477 peel_for_known_alignment.peel_info.count = 0;
2478 peel_for_known_alignment.peel_info.dr_info = NULL;
2480 if (do_peeling && one_misalignment_known)
2482 /* Peeling is possible, but no data access requires alignment in order
2483 to be supported. So we try to choose the best possible peeling from
2484 the hash table. */
2485 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2486 (&peeling_htab, loop_vinfo);
2489 /* Compare costs of peeling for known and unknown alignment. */
2490 if (peel_for_known_alignment.peel_info.dr_info != NULL
2491 && peel_for_unknown_alignment.inside_cost
2492 >= peel_for_known_alignment.inside_cost)
2494 best_peel = peel_for_known_alignment;
2496 /* If the best peeling for known alignment has NPEEL == 0, perform no
2497 peeling at all except if there is an unsupportable dr that we can
2498 align. */
2499 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2500 do_peeling = false;
2503 /* If there is an unsupportable data ref, prefer this over all choices so far
2504 since we'd have to discard a chosen peeling except when it accidentally
2505 aligned the unsupportable data ref. */
2506 if (one_dr_unsupportable)
2507 dr0_info = unsupportable_dr_info;
2508 else if (do_peeling)
2510 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2511 TODO: Use nopeel_outside_cost or get rid of it? */
2512 unsigned nopeel_inside_cost = 0;
2513 unsigned nopeel_outside_cost = 0;
2515 stmt_vector_for_cost dummy;
2516 dummy.create (2);
2517 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2518 &nopeel_outside_cost, &dummy, &dummy, 0);
2519 dummy.release ();
2521 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2522 costs will be recorded. */
2523 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2524 prologue_cost_vec.create (2);
2525 epilogue_cost_vec.create (2);
2527 int dummy2;
2528 nopeel_outside_cost += vect_get_known_peeling_cost
2529 (loop_vinfo, 0, &dummy2,
2530 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2531 &prologue_cost_vec, &epilogue_cost_vec);
2533 prologue_cost_vec.release ();
2534 epilogue_cost_vec.release ();
2536 npeel = best_peel.peel_info.npeel;
2537 dr0_info = best_peel.peel_info.dr_info;
2539 /* If not peeling is no more expensive than the best peeling we
2540 have found so far, don't perform any peeling. */
2541 if (nopeel_inside_cost <= best_peel.inside_cost)
2542 do_peeling = false;
2545 if (do_peeling)
2547 stmt_vec_info stmt_info = dr0_info->stmt;
2548 if (known_alignment_for_access_p (dr0_info,
2549 STMT_VINFO_VECTYPE (stmt_info)))
2551 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2552 size_zero_node) < 0;
2553 if (!npeel)
2555 /* Since it's known at compile time, compute the number of
2556 iterations in the peeled loop (the peeling factor) for use in
2557 updating DR_MISALIGNMENT values. The peeling factor is the
2558 vectorization factor minus the misalignment as an element
2559 count. */
2560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2561 poly_int64 off = 0;
2562 if (negative)
2563 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2564 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2565 unsigned int mis
2566 = dr_misalignment (dr0_info, vectype, off);
2567 mis = negative ? mis : -mis;
2568 /* If known_alignment_for_access_p then we have set
2569 DR_MISALIGNMENT, which is only done if we know it at compile
2570 time, so it is safe to assume the target alignment is constant. */
2572 unsigned int target_align =
2573 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2574 npeel = ((mis & (target_align - 1))
2575 / vect_get_scalar_dr_size (dr0_info));
2578 /* For interleaved data access every iteration accesses all the
2579 members of the group, therefore we divide the number of iterations
2580 by the group size. */
2581 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2582 npeel /= DR_GROUP_SIZE (stmt_info);
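/* Worked example (illustrative, not from the original source): with a
   16-byte target alignment, a scalar access size of 4 bytes and a known
   misalignment of 8 bytes, (-8) & 15 == 8 gives npeel = 2; if dr0 leads
   an interleaving group of size 2, each scalar iteration covers both
   members, so the final npeel is 1.  */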
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "Try peeling by %d\n", npeel);
2589 /* Ensure that all datarefs can be vectorized after the peel. */
2590 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2591 do_peeling = false;
2593 /* Check if all datarefs are supportable and log. */
2594 if (do_peeling
2595 && npeel == 0
2596 && known_alignment_for_access_p (dr0_info,
2597 STMT_VINFO_VECTYPE (stmt_info)))
2598 return opt_result::success ();
2600 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2601 if (do_peeling)
2603 unsigned max_allowed_peel
2604 = param_vect_max_peeling_for_alignment;
2605 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2606 max_allowed_peel = 0;
2607 if (max_allowed_peel != (unsigned)-1)
2609 unsigned max_peel = npeel;
2610 if (max_peel == 0)
2612 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2613 unsigned HOST_WIDE_INT target_align_c;
2614 if (target_align.is_constant (&target_align_c))
2615 max_peel =
2616 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2617 else
2619 do_peeling = false;
2620 if (dump_enabled_p ())
2621 dump_printf_loc (MSG_NOTE, vect_location,
2622 "Disable peeling, max peels set and vector"
2623 " alignment unknown\n");
2626 if (max_peel > max_allowed_peel)
2628 do_peeling = false;
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "Disable peeling, max peels reached: %d\n", max_peel);
2636 /* Cost model #2 - if peeling may result in a remaining loop not
2637 iterating enough to be vectorized then do not peel. Since this
2638 is a cost heuristic rather than a correctness decision, use the
2639 most likely runtime value for variable vectorization factors. */
2640 if (do_peeling
2641 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2643 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2644 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2645 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2646 < assumed_vf + max_peel)
2647 do_peeling = false;
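/* Worked example (illustrative, not from the original source): with an
   assumed VF of 8 and npeel == 0 (runtime peeling of up to VF - 1 = 7
   iterations), a loop known to run only 12 iterations fails this check
   because 12 < 8 + 7, so peeling is abandoned.  */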
2650 if (do_peeling)
2652 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2653 If the misalignment of DR_i is identical to that of dr0 then set
2654 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2655 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2656 by the peeling factor times the element size of DR_i (MOD the
2657 vectorization factor times the size). Otherwise, the
2658 misalignment of DR_i must be set to unknown. */
2659 FOR_EACH_VEC_ELT (datarefs, i, dr)
2660 if (dr != dr0_info->dr)
2662 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2663 if (!vect_relevant_for_alignment_p (dr_info))
2664 continue;
2666 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2669 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2670 if (npeel)
2671 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2672 else
2673 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2674 SET_DR_MISALIGNMENT (dr0_info,
2675 vect_dr_misalign_for_aligned_access (dr0_info));
2676 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "Alignment of access forced using peeling.\n");
2680 dump_printf_loc (MSG_NOTE, vect_location,
2681 "Peeling for alignment will be applied.\n");
2684 /* The inside-loop cost will be accounted for in vectorizable_load
2685 and vectorizable_store correctly with adjusted alignments.
2686 Drop the body_cost_vec on the floor here. */
2687 return opt_result::success ();
2691 /* (2) Versioning to force alignment. */
2693 /* Try versioning if:
2694 1) the loop is optimized for speed and the cost model is not cheap, and
2695 2) there is at least one unsupported misaligned data ref with an unknown
2696 misalignment, and
2697 3) all misaligned data refs with a known misalignment are supported, and
2698 4) the number of runtime alignment checks is within reason. */
2700 do_versioning
2701 = (optimize_loop_nest_for_speed_p (loop)
2702 && !loop->inner /* FORNOW */
2703 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2705 if (do_versioning)
2707 FOR_EACH_VEC_ELT (datarefs, i, dr)
2709 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2710 if (!vect_relevant_for_alignment_p (dr_info))
2711 continue;
2713 stmt_vec_info stmt_info = dr_info->stmt;
2714 if (STMT_VINFO_STRIDED_P (stmt_info))
2716 do_versioning = false;
2717 break;
2720 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2721 bool negative = tree_int_cst_compare (DR_STEP (dr),
2722 size_zero_node) < 0;
2723 poly_int64 off = 0;
2724 if (negative)
2725 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2726 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2727 int misalignment;
2728 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2729 continue;
2731 enum dr_alignment_support supportable_dr_alignment
2732 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2733 misalignment);
2734 if (supportable_dr_alignment == dr_unaligned_unsupported)
2736 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2737 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2738 >= (unsigned) param_vect_max_version_for_alignment_checks))
2740 do_versioning = false;
2741 break;
2744 /* At present we don't support versioning for alignment
2745 with variable VF, since there's no guarantee that the
2746 VF is a power of two. We could relax this if we added
2747 a way of enforcing a power-of-two size. */
2748 unsigned HOST_WIDE_INT size;
2749 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2751 do_versioning = false;
2752 break;
2755 /* Forcing alignment in the first iteration is no good if
2756 we don't keep it across iterations. For now, just disable
2757 versioning in this case.
2758 ?? We could actually unroll the loop to achieve the required
2759 overall step alignment, and forcing the alignment could be
2760 done by doing some iterations of the non-vectorized loop. */
2761 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2762 * DR_STEP_ALIGNMENT (dr),
2763 DR_TARGET_ALIGNMENT (dr_info)))
2765 do_versioning = false;
2766 break;
2769 /* The rightmost bits of an aligned address must be zeros.
2770 Construct the mask needed for this test. For example,
2771 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2772 mask must be 15 = 0xf. */
2773 int mask = size - 1;
2775 /* FORNOW: use the same mask to test all potentially unaligned
2776 references in the loop. */
2777 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2778 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2780 do_versioning = false;
2781 break;
2784 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2785 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
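/* Illustrative sketch (not from the original source): for 16-byte
   vectors the mask is 0xf, and the versioned loop is guarded by a
   runtime test of roughly the form

     if (((a1 | a2 | ...) & 0xf) == 0)
       ... aligned (vectorized) version ...
     else
       ... original scalar version ...

   where a1, a2, ... stand for the addresses of the data references
   recorded in LOOP_VINFO_MAY_MISALIGN_STMTS.  */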
2789 /* Versioning requires at least one misaligned data reference. */
2790 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2791 do_versioning = false;
2792 else if (!do_versioning)
2793 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2796 if (do_versioning)
2798 const vec<stmt_vec_info> &may_misalign_stmts
2799 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2800 stmt_vec_info stmt_info;
2802 /* It can now be assumed that the data references in the statements
2803 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2804 of the loop being vectorized. */
2805 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2807 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2808 SET_DR_MISALIGNMENT (dr_info,
2809 vect_dr_misalign_for_aligned_access (dr_info));
2810 if (dump_enabled_p ())
2811 dump_printf_loc (MSG_NOTE, vect_location,
2812 "Alignment of access forced using versioning.\n");
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "Versioning for alignment will be applied.\n");
2819 /* Peeling and versioning can't be done together at this time. */
2820 gcc_assert (! (do_peeling && do_versioning));
2822 return opt_result::success ();
2825 /* This point is reached if neither peeling nor versioning is being done. */
2826 gcc_assert (! (do_peeling || do_versioning));
2828 return opt_result::success ();
2832 /* Function vect_analyze_data_refs_alignment
2834 Analyze the alignment of the data-references in the loop.
2835 Return FALSE if a data reference is found that cannot be vectorized. */
2837 opt_result
2838 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2840 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2842 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2843 struct data_reference *dr;
2844 unsigned int i;
2846 vect_record_base_alignments (loop_vinfo);
2847 FOR_EACH_VEC_ELT (datarefs, i, dr)
2849 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2850 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2852 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2853 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2854 continue;
2855 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2856 STMT_VINFO_VECTYPE (dr_info->stmt));
2860 return opt_result::success ();
2864 /* Analyze alignment of DRs of stmts in NODE. */
2866 static bool
2867 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2869 /* Alignment is maintained in the first element of the group. */
2870 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2871 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2872 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2873 tree vectype = SLP_TREE_VECTYPE (node);
2874 poly_uint64 vector_alignment
2875 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2876 BITS_PER_UNIT);
2877 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2878 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2879 /* Re-analyze alignment when we're facing a vectorization with a bigger
2880 alignment requirement. */
2881 else if (known_lt (dr_info->target_alignment, vector_alignment))
2883 poly_uint64 old_target_alignment = dr_info->target_alignment;
2884 int old_misalignment = dr_info->misalignment;
2885 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2886 /* But keep knowledge about a smaller alignment. */
2887 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2888 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2890 dr_info->target_alignment = old_target_alignment;
2891 dr_info->misalignment = old_misalignment;
2894 /* If we ever face unordered target alignments the first one wins for the
2895 analysis and the others will become unknown in dr_misalignment. */
2896 return true;
2899 /* Function vect_slp_analyze_instance_alignment
2901 Analyze the alignment of the data-references in the SLP instance.
2902 Return FALSE if a data reference is found that cannot be vectorized. */
2904 bool
2905 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2906 slp_instance instance)
2908 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2910 slp_tree node;
2911 unsigned i;
2912 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2913 if (! vect_slp_analyze_node_alignment (vinfo, node))
2914 return false;
2916 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2917 && ! vect_slp_analyze_node_alignment
2918 (vinfo, SLP_INSTANCE_TREE (instance)))
2919 return false;
2921 return true;
2925 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2926 accesses of legal size, step, etc. Detect gaps, single element
2927 interleaving, and other special cases. Set grouped access info.
2928 Collect groups of strided stores for further use in SLP analysis.
2929 Worker for vect_analyze_group_access. */
2931 static bool
2932 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2934 data_reference *dr = dr_info->dr;
2935 tree step = DR_STEP (dr);
2936 tree scalar_type = TREE_TYPE (DR_REF (dr));
2937 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2938 stmt_vec_info stmt_info = dr_info->stmt;
2939 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2940 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2941 HOST_WIDE_INT dr_step = -1;
2942 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2943 bool slp_impossible = false;
2945 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2946 size of the interleaving group (including gaps). */
2947 if (tree_fits_shwi_p (step))
2949 dr_step = tree_to_shwi (step);
2950 /* Check that STEP is a multiple of type size. Otherwise there is
2951 a non-element-sized gap at the end of the group which we
2952 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2953 ??? As we can handle non-constant step fine here we should
2954 simply remove uses of DR_GROUP_GAP between the last and first
2955 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2956 simply not include that gap. */
2957 if ((dr_step % type_size) != 0)
2959 if (dump_enabled_p ())
2960 dump_printf_loc (MSG_NOTE, vect_location,
2961 "Step %T is not a multiple of the element size"
2962 " for %T\n",
2963 step, DR_REF (dr));
2964 return false;
2966 groupsize = absu_hwi (dr_step) / type_size;
2968 else
2969 groupsize = 0;
2971 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2972 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2974 /* Check if this DR is part of an interleaving group of which it is the
2975 only element accessed in the loop. */
2977 /* Gaps are supported only for loads. STEP must be a multiple of the type
2978 size. */
2979 if (DR_IS_READ (dr)
2980 && (dr_step % type_size) == 0
2981 && groupsize > 0
2982 /* This could be UINT_MAX but as we are generating code in a very
2983 inefficient way we have to cap earlier.
2984 See PR91403 for example. */
2985 && groupsize <= 4096)
2987 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2988 DR_GROUP_SIZE (stmt_info) = groupsize;
2989 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2990 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "Detected single element interleaving %T"
2993 " step %T\n",
2994 DR_REF (dr), step);
2996 return true;
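/* Illustrative example (not from the original source):

     for (i = 0; i < n; i++)
       sum += a[4*i];        # int a[]: DR_STEP = 16, type_size = 4

   Only one element out of each group of four is read, so the load
   forms its own group with DR_GROUP_SIZE = 4 and DR_GROUP_GAP = 3.  */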
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "not consecutive access %G", stmt_info->stmt);
3003 if (bb_vinfo)
3005 /* Mark the statement as unvectorizable. */
3006 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3007 return true;
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3012 STMT_VINFO_STRIDED_P (stmt_info) = true;
3013 return true;
3016 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3018 /* First stmt in the interleaving chain. Check the chain. */
3019 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3020 struct data_reference *data_ref = dr;
3021 unsigned int count = 1;
3022 tree prev_init = DR_INIT (data_ref);
3023 HOST_WIDE_INT diff, gaps = 0;
3025 /* By construction, all group members have INTEGER_CST DR_INITs. */
3026 while (next)
3028 /* We never have the same DR multiple times. */
3029 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3030 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3032 data_ref = STMT_VINFO_DATA_REF (next);
3034 /* All group members have the same STEP by construction. */
3035 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3037 /* Check that the distance between two accesses is equal to the type
3038 size. Otherwise, we have gaps. */
3039 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3040 - TREE_INT_CST_LOW (prev_init)) / type_size;
3041 if (diff < 1 || diff > UINT_MAX)
3043 /* For artificial testcases with array accesses with large
3044 constant indices we can run into overflow issues which
3045 can end up fooling the groupsize constraint below, so
3046 check the individual gaps (which are represented as
3047 unsigned int) as well. */
3048 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3050 "interleaved access with gap larger "
3051 "than representable\n");
3052 return false;
3054 if (diff != 1)
3056 /* FORNOW: SLP of accesses with gaps is not supported. */
3057 slp_impossible = true;
3058 if (DR_IS_WRITE (data_ref))
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062 "interleaved store with gaps\n");
3063 return false;
3066 gaps += diff - 1;
3069 last_accessed_element += diff;
3071 /* Store the gap from the previous member of the group. If there is no
3072 gap in the access, DR_GROUP_GAP is always 1. */
3073 DR_GROUP_GAP (next) = diff;
3075 prev_init = DR_INIT (data_ref);
3076 next = DR_GROUP_NEXT_ELEMENT (next);
3077 /* Count the number of data-refs in the chain. */
3078 count++;
3081 if (groupsize == 0)
3082 groupsize = count + gaps;
3084 /* This could be UINT_MAX but as we are generating code in a very
3085 inefficient way we have to cap earlier. See PR78699 for example. */
3086 if (groupsize > 4096)
3088 if (dump_enabled_p ())
3089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090 "group is too large\n");
3091 return false;
3094 /* Check that the size of the interleaving is equal to count for stores,
3095 i.e., that there are no gaps. */
3096 if (groupsize != count
3097 && !DR_IS_READ (dr))
3099 groupsize = count;
3100 STMT_VINFO_STRIDED_P (stmt_info) = true;
3103 /* If there is a gap after the last load in the group it is the
3104 difference between the groupsize and the last accessed
3105 element.
3106 When there is no gap, this difference should be 0. */
3107 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3109 DR_GROUP_SIZE (stmt_info) = groupsize;
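/* Illustrative example (not from the original source):

     for (i = 0; i < n; i++)
       sum += b[3*i] + b[3*i+1];   # int b[]: DR_STEP = 12, type_size = 4

   Here groupsize = 3 and count = 2; the two loads are adjacent
   (diff = 1), and b[3*i+2] is never read, so the DR_GROUP_GAP of the
   first statement ends up as 3 - 2 = 1.  */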
3110 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "Detected interleaving ");
3114 if (DR_IS_READ (dr))
3115 dump_printf (MSG_NOTE, "load ");
3116 else if (STMT_VINFO_STRIDED_P (stmt_info))
3117 dump_printf (MSG_NOTE, "strided store ");
3118 else
3119 dump_printf (MSG_NOTE, "store ");
3120 dump_printf (MSG_NOTE, "of size %u\n",
3121 (unsigned)groupsize);
3122 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3123 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3124 while (next)
3126 if (DR_GROUP_GAP (next) != 1)
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "\t<gap of %d elements>\n",
3129 DR_GROUP_GAP (next) - 1);
3130 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3131 next = DR_GROUP_NEXT_ELEMENT (next);
3133 if (DR_GROUP_GAP (stmt_info) != 0)
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "\t<gap of %d elements>\n",
3136 DR_GROUP_GAP (stmt_info));
3139 /* SLP: create an SLP data structure for every interleaving group of
3140 stores for further analysis in vect_analyze_slp. */
3141 if (DR_IS_WRITE (dr) && !slp_impossible)
3143 if (loop_vinfo)
3144 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3145 if (bb_vinfo)
3146 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3150 return true;
3153 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3154 accesses of legal size, step, etc. Detect gaps, single element
3155 interleaving, and other special cases. Set grouped access info.
3156 Collect groups of strided stores for further use in SLP analysis. */
3158 static bool
3159 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3161 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3163 /* Dissolve the group if present. */
3164 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3165 while (stmt_info)
3167 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3168 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3169 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3170 stmt_info = next;
3172 return false;
3174 return true;
3177 /* Analyze the access pattern of the data-reference DR_INFO.
3178 In case of non-consecutive accesses call vect_analyze_group_access() to
3179 analyze groups of accesses. */
3181 static bool
3182 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3184 data_reference *dr = dr_info->dr;
3185 tree step = DR_STEP (dr);
3186 tree scalar_type = TREE_TYPE (DR_REF (dr));
3187 stmt_vec_info stmt_info = dr_info->stmt;
3188 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3189 class loop *loop = NULL;
3191 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3192 return true;
3194 if (loop_vinfo)
3195 loop = LOOP_VINFO_LOOP (loop_vinfo);
3197 if (loop_vinfo && !step)
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201 "bad data-ref access in loop\n");
3202 return false;
3205 /* Allow loads with zero step in inner-loop vectorization. */
3206 if (loop_vinfo && integer_zerop (step))
3208 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3209 if (!nested_in_vect_loop_p (loop, stmt_info))
3210 return DR_IS_READ (dr);
3211 /* Allow references with zero step only for outer loops marked
3212 with pragma omp simd, since that guarantees the absence of
3213 loop-carried dependencies between inner loop iterations. */
3214 if (loop->safelen < 2)
3216 if (dump_enabled_p ())
3217 dump_printf_loc (MSG_NOTE, vect_location,
3218 "zero step in inner loop of nest\n");
3219 return false;
3223 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3225 /* Interleaved accesses are not yet supported within outer-loop
3226 vectorization for references in the inner-loop. */
3227 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3229 /* For the rest of the analysis we use the outer-loop step. */
3230 step = STMT_VINFO_DR_STEP (stmt_info);
3231 if (integer_zerop (step))
3233 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_NOTE, vect_location,
3235 "zero step in outer loop.\n");
3236 return DR_IS_READ (dr);
3240 /* Consecutive? */
3241 if (TREE_CODE (step) == INTEGER_CST)
3243 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3244 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3245 || (dr_step < 0
3246 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3248 /* Mark that it is not interleaving. */
3249 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3250 return true;
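/* Illustrative example (not from the original source): an int access
   a[i] has DR_STEP = 4 == TYPE_SIZE_UNIT (int) and is treated as
   consecutive here, as is the reverse access a[n-i] with DR_STEP = -4;
   an access like a[2*i] (DR_STEP = 8) falls through to the group
   analysis below.  */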
3254 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3256 if (dump_enabled_p ())
3257 dump_printf_loc (MSG_NOTE, vect_location,
3258 "grouped access in outer loop.\n");
3259 return false;
3263 /* Assume this is a DR handled by the non-constant strided load case. */
3264 if (TREE_CODE (step) != INTEGER_CST)
3265 return (STMT_VINFO_STRIDED_P (stmt_info)
3266 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3267 || vect_analyze_group_access (vinfo, dr_info)));
3269 /* Non-consecutive access - check if it is part of an interleaving group. */
3270 return vect_analyze_group_access (vinfo, dr_info);
3273 /* Compare two data-references DRA and DRB to sort them into chunks
3274 suitable for forming interleaving groups. */
3276 static int
3277 dr_group_sort_cmp (const void *dra_, const void *drb_)
3279 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3280 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3281 data_reference_p dra = dra_info->dr;
3282 data_reference_p drb = drb_info->dr;
3283 int cmp;
3285 /* Stabilize sort. */
3286 if (dra == drb)
3287 return 0;
3289 /* DRs with different group IDs never belong to the same group. */
3290 if (dra_info->group != drb_info->group)
3291 return dra_info->group < drb_info->group ? -1 : 1;
3293 /* Ordering of DRs according to base. */
3294 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3295 DR_BASE_ADDRESS (drb));
3296 if (cmp != 0)
3297 return cmp;
3299 /* And according to DR_OFFSET. */
3300 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3301 if (cmp != 0)
3302 return cmp;
3304 /* Put reads before writes. */
3305 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3306 return DR_IS_READ (dra) ? -1 : 1;
3308 /* Then sort after access size. */
3309 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3310 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3311 if (cmp != 0)
3312 return cmp;
3314 /* And after step. */
3315 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3316 if (cmp != 0)
3317 return cmp;
3319 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3320 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3321 if (cmp == 0)
3322 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3323 return cmp;
3326 /* If OP is the result of a conversion, return the unconverted value,
3327 otherwise return null. */
3329 static tree
3330 strip_conversion (tree op)
3332 if (TREE_CODE (op) != SSA_NAME)
3333 return NULL_TREE;
3334 gimple *stmt = SSA_NAME_DEF_STMT (op);
3335 if (!is_gimple_assign (stmt)
3336 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3337 return NULL_TREE;
3338 return gimple_assign_rhs1 (stmt);
3341 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3342 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3343 be grouped in SLP mode. */
3345 static bool
3346 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3347 bool allow_slp_p)
3349 if (gimple_assign_single_p (stmt1_info->stmt))
3350 return gimple_assign_single_p (stmt2_info->stmt);
3352 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3353 if (call1 && gimple_call_internal_p (call1))
3355 /* Check for two masked loads or two masked stores. */
3356 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3357 if (!call2 || !gimple_call_internal_p (call2))
3358 return false;
3359 internal_fn ifn = gimple_call_internal_fn (call1);
3360 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3361 return false;
3362 if (ifn != gimple_call_internal_fn (call2))
3363 return false;
3365 /* Check that the masks are the same. Cope with casts of masks,
3366 like those created by build_mask_conversion. */
3367 tree mask1 = gimple_call_arg (call1, 2);
3368 tree mask2 = gimple_call_arg (call2, 2);
3369 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3371 mask1 = strip_conversion (mask1);
3372 if (!mask1)
3373 return false;
3374 mask2 = strip_conversion (mask2);
3375 if (!mask2)
3376 return false;
3377 if (!operand_equal_p (mask1, mask2, 0))
3378 return false;
3380 return true;
3383 return false;
3386 /* Function vect_analyze_data_ref_accesses.
3388 Analyze the access pattern of all the data references in the loop.
3390 FORNOW: the only access pattern that is considered vectorizable is a
3391 simple step 1 (consecutive) access.
3393 FORNOW: handle only arrays and pointer accesses. */
3395 opt_result
3396 vect_analyze_data_ref_accesses (vec_info *vinfo,
3397 vec<int> *dataref_groups)
3399 unsigned int i;
3400 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3402 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3404 if (datarefs.is_empty ())
3405 return opt_result::success ();
3407 /* Sort the array of datarefs to make building the interleaving chains
3408 linear. Don't modify the original vector's order; it is needed for
3409 determining what dependencies are reversed. */
3410 vec<dr_vec_info *> datarefs_copy;
3411 datarefs_copy.create (datarefs.length ());
3412 for (unsigned i = 0; i < datarefs.length (); i++)
3414 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3415 /* If the caller computed DR grouping use that, otherwise group by
3416 basic blocks. */
3417 if (dataref_groups)
3418 dr_info->group = (*dataref_groups)[i];
3419 else
3420 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3421 datarefs_copy.quick_push (dr_info);
3423 datarefs_copy.qsort (dr_group_sort_cmp);
3424 hash_set<stmt_vec_info> to_fixup;
3426 /* Build the interleaving chains. */
3427 for (i = 0; i < datarefs_copy.length () - 1;)
3429 dr_vec_info *dr_info_a = datarefs_copy[i];
3430 data_reference_p dra = dr_info_a->dr;
3431 int dra_group_id = dr_info_a->group;
3432 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3433 stmt_vec_info lastinfo = NULL;
3434 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3435 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3437 ++i;
3438 continue;
3440 for (i = i + 1; i < datarefs_copy.length (); ++i)
3442 dr_vec_info *dr_info_b = datarefs_copy[i];
3443 data_reference_p drb = dr_info_b->dr;
3444 int drb_group_id = dr_info_b->group;
3445 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3446 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3447 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3448 break;
3450 /* ??? Imperfect sorting (non-compatible types, non-modulo
3451 accesses, same accesses) can lead to a group being artificially
3452 split here, as we don't just skip over those. If it really
3453 matters we can push those to a worklist and re-iterate
3454 over them. Then we can just skip ahead to the next DR here. */
3456 /* DRs in a different DR group should not be put into the same
3457 interleaving group. */
3458 if (dra_group_id != drb_group_id)
3459 break;
3461 /* Check that the data-refs have the same first location (except init)
3462 and that they are both either stores or loads (not one load and one
3463 store, and not mismatched masked loads or stores). */
3464 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3465 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3466 DR_BASE_ADDRESS (drb)) != 0
3467 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3468 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3469 break;
3471 /* Check that the data-refs have the same constant size. */
3472 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3473 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3474 if (!tree_fits_uhwi_p (sza)
3475 || !tree_fits_uhwi_p (szb)
3476 || !tree_int_cst_equal (sza, szb))
3477 break;
3479 /* Check that the data-refs have the same step. */
3480 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3481 break;
3483 /* Check the types are compatible.
3484 ??? We don't distinguish this during sorting. */
3485 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3486 TREE_TYPE (DR_REF (drb))))
3487 break;
3489 /* Check that the DR_INITs are compile-time constants. */
3490 if (!tree_fits_shwi_p (DR_INIT (dra))
3491 || !tree_fits_shwi_p (DR_INIT (drb)))
3492 break;
3494 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3495 just hold extra information. */
3496 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3497 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3498 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3499 break;
3501 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3502 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3503 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3504 HOST_WIDE_INT init_prev
3505 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3506 gcc_assert (init_a <= init_b
3507 && init_a <= init_prev
3508 && init_prev <= init_b);
3510 /* Do not place the same access in the interleaving chain twice. */
3511 if (init_b == init_prev)
3513 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3514 < gimple_uid (DR_STMT (drb)));
3515 /* Simply link in duplicates and fix up the chain below. */
3517 else
3519 /* If init_b == init_a + the size of the type * k, we have an
3520 interleaving, and DRA is accessed before DRB. */
3521 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3522 if (type_size_a == 0
3523 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3524 % type_size_a != 0))
3525 break;
3527 /* If we have a store, the accesses are adjacent. This splits
3528 groups into chunks we support (we don't support vectorization
3529 of stores with gaps). */
3530 if (!DR_IS_READ (dra)
3531 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3532 != type_size_a))
3533 break;
3535 /* If the step (when constant and nonzero) is smaller than the
3536 difference between the data-refs' inits, this splits groups into
3537 suitable sizes. */
3538 if (tree_fits_shwi_p (DR_STEP (dra)))
3540 unsigned HOST_WIDE_INT step
3541 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3542 if (step != 0
3543 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3544 break;
3548 if (dump_enabled_p ())
3549 dump_printf_loc (MSG_NOTE, vect_location,
3550 DR_IS_READ (dra)
3551 ? "Detected interleaving load %T and %T\n"
3552 : "Detected interleaving store %T and %T\n",
3553 DR_REF (dra), DR_REF (drb));
3555 /* Link the found element into the group list. */
3556 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3558 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3559 lastinfo = stmtinfo_a;
3561 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3562 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3563 lastinfo = stmtinfo_b;
3565 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3566 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3568 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3569 dump_printf_loc (MSG_NOTE, vect_location,
3570 "Load suitable for SLP vectorization only.\n");
3572 if (init_b == init_prev
3573 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3574 && dump_enabled_p ())
3575 dump_printf_loc (MSG_NOTE, vect_location,
3576 "Queuing group with duplicate access for fixup\n");
3580 /* Fix up groups with duplicate entries by splitting them. */
3581 while (1)
3583 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3584 if (!(it != to_fixup.end ()))
3585 break;
3586 stmt_vec_info grp = *it;
3587 to_fixup.remove (grp);
3589 /* Find the earliest duplicate group member. */
3590 unsigned first_duplicate = -1u;
3591 stmt_vec_info next, g = grp;
3592 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3594 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3595 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3596 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3597 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3598 g = next;
3600 if (first_duplicate == -1U)
3601 continue;
3603 /* Then move all stmts after the first duplicate to a new group.
3604 Note this is a heuristic but one with the property that *it
3605 is fixed up completely. */
3606 g = grp;
3607 stmt_vec_info newgroup = NULL, ng = grp;
3608 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3610 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3612 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3613 if (!newgroup)
3614 newgroup = next;
3615 else
3616 DR_GROUP_NEXT_ELEMENT (ng) = next;
3617 ng = next;
3618 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3620 else
3621 g = DR_GROUP_NEXT_ELEMENT (g);
3623 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3625 /* Fixup the new group which still may contain duplicates. */
3626 to_fixup.add (newgroup);
3629 dr_vec_info *dr_info;
3630 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3632 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3633 && !vect_analyze_data_ref_access (vinfo, dr_info))
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637 "not vectorized: complicated access pattern.\n");
3639 if (is_a <bb_vec_info> (vinfo))
3641 /* Mark the statement as not vectorizable. */
3642 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3643 continue;
3645 else
3647 datarefs_copy.release ();
3648 return opt_result::failure_at (dr_info->stmt->stmt,
3649 "not vectorized:"
3650 " complicated access pattern.\n");
3655 datarefs_copy.release ();
3656 return opt_result::success ();
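/* A rough example of the group-splitting rules applied above
   (hypothetical loop): in

     for (int i = 0; i < n; i++)
       {
	 a[3*i]     = x;
	 a[3*i + 2] = y;
       }

   the two stores share base, offset and step, but the gap between their
   DR_INITs (8 bytes) is larger than the 4-byte element size, so the
   "stores with gaps" check breaks the chain and each store stays in its
   own group rather than forming one interleaved store group.  */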
3659 /* Function vect_vfa_segment_size.
3661 Input:
3662 DR_INFO: The data reference.
3663 LENGTH_FACTOR: segment length to consider.
3665 Return a value suitable for the dr_with_seg_len::seg_len field.
3666 This is the "distance travelled" by the pointer from the first
3667 iteration in the segment to the last. Note that it does not include
3668 the size of the access; in effect it only describes the first byte. */
3670 static tree
3671 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3673 length_factor = size_binop (MINUS_EXPR,
3674 fold_convert (sizetype, length_factor),
3675 size_one_node);
3676 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3677 length_factor);
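/* Worked example with made-up numbers: for DR_STEP equal to 4 bytes and
   LENGTH_FACTOR equal to a vectorization factor of 8, the segment length
   computed above is (8 - 1) * 4 == 28 bytes, the distance from the first
   byte accessed in the segment's first iteration to the first byte
   accessed in its last iteration; the size of the access itself is
   accounted for separately by vect_vfa_access_size.  */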
3680 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3681 gives the worst-case number of bytes covered by the segment. */
3683 static unsigned HOST_WIDE_INT
3684 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3686 stmt_vec_info stmt_vinfo = dr_info->stmt;
3687 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3688 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3689 unsigned HOST_WIDE_INT access_size = ref_size;
3690 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3692 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3693 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3695 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3696 int misalignment;
3697 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3698 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3699 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3700 == dr_explicit_realign_optimized))
3702 /* We might access a full vector's worth. */
3703 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3705 return access_size;
3708 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3709 describes. */
3711 static unsigned int
3712 vect_vfa_align (dr_vec_info *dr_info)
3714 return dr_alignment (dr_info->dr);
3717 /* Function vect_no_alias_p.
3719 Given data references A and B with equal base and offset, see whether
3720 the alias relation can be decided at compilation time. Return 1 if
3721 it can and the references alias, 0 if it can and the references do
3722 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3723 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3724 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3726 static int
3727 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3728 tree segment_length_a, tree segment_length_b,
3729 unsigned HOST_WIDE_INT access_size_a,
3730 unsigned HOST_WIDE_INT access_size_b)
3732 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3733 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3734 poly_uint64 const_length_a;
3735 poly_uint64 const_length_b;
3737 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3738 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3739 [a, a+12) */
3740 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3742 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3743 offset_a -= const_length_a;
3745 else
3746 const_length_a = tree_to_poly_uint64 (segment_length_a);
3747 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3749 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3750 offset_b -= const_length_b;
3752 else
3753 const_length_b = tree_to_poly_uint64 (segment_length_b);
3755 const_length_a += access_size_a;
3756 const_length_b += access_size_b;
3758 if (ranges_known_overlap_p (offset_a, const_length_a,
3759 offset_b, const_length_b))
3760 return 1;
3762 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3763 offset_b, const_length_b))
3764 return 0;
3766 return -1;
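/* Worked example with made-up values: for refs with equal base and
   offset, DR_INIT (a) == 0, DR_INIT (b) == 32, positive steps, segment
   lengths of 16 and access sizes of 8, ref A covers bytes [0, 24) and
   ref B covers [32, 56).  The ranges provably do not overlap, so the
   function returns 0 and the runtime alias check can be dropped.  */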
3769 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3770 in DDR is >= VF. */
3772 static bool
3773 dependence_distance_ge_vf (data_dependence_relation *ddr,
3774 unsigned int loop_depth, poly_uint64 vf)
3776 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3777 || DDR_NUM_DIST_VECTS (ddr) == 0)
3778 return false;
3780 /* If the dependence is exact, we should have limited the VF instead. */
3781 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3783 unsigned int i;
3784 lambda_vector dist_v;
3785 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3787 HOST_WIDE_INT dist = dist_v[loop_depth];
3788 if (dist != 0
3789 && !(dist > 0 && DDR_REVERSED_P (ddr))
3790 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3791 return false;
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "dependence distance between %T and %T is >= VF\n",
3797 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3799 return true;
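/* Numeric illustration (hypothetical DDR): if dependence analysis,
   working under the assumption that the two bases coincide, recorded a
   single nonzero distance of 8 at this loop depth, then with a
   vectorization factor of 4 the distance is >= VF, the function returns
   true and the caller can skip the runtime alias test for this pair.  */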
3802 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3804 static void
3805 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3807 dump_printf (dump_kind, "%s (%T) >= ",
3808 lower_bound.unsigned_p ? "unsigned" : "abs",
3809 lower_bound.expr);
3810 dump_dec (dump_kind, lower_bound.min_value);
3813 /* Record that the vectorized loop requires the vec_lower_bound described
3814 by EXPR, UNSIGNED_P and MIN_VALUE. */
3816 static void
3817 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3818 poly_uint64 min_value)
3820 vec<vec_lower_bound> &lower_bounds
3821 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3822 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3823 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3825 unsigned_p &= lower_bounds[i].unsigned_p;
3826 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3827 if (lower_bounds[i].unsigned_p != unsigned_p
3828 || maybe_lt (lower_bounds[i].min_value, min_value))
3830 lower_bounds[i].unsigned_p = unsigned_p;
3831 lower_bounds[i].min_value = min_value;
3832 if (dump_enabled_p ())
3834 dump_printf_loc (MSG_NOTE, vect_location,
3835 "updating run-time check to ");
3836 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3837 dump_printf (MSG_NOTE, "\n");
3840 return;
3843 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3844 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3847 dump_lower_bound (MSG_NOTE, lower_bound);
3848 dump_printf (MSG_NOTE, "\n");
3850 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3853 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3854 will span fewer than GAP bytes. */
3856 static bool
3857 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3858 poly_int64 gap)
3860 stmt_vec_info stmt_info = dr_info->stmt;
3861 HOST_WIDE_INT count
3862 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3863 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3864 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3865 return (estimated_poly_value (gap)
3866 <= count * vect_get_scalar_dr_size (dr_info));
3869 /* Return true if we know that there is no alias between DR_INFO_A and
3870 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3871 When returning true, set *LOWER_BOUND_OUT to this N. */
3873 static bool
3874 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3875 poly_uint64 *lower_bound_out)
3877 /* Check that there is a constant gap of known sign between DR_A
3878 and DR_B. */
3879 data_reference *dr_a = dr_info_a->dr;
3880 data_reference *dr_b = dr_info_b->dr;
3881 poly_int64 init_a, init_b;
3882 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3883 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3884 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3885 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3886 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3887 || !ordered_p (init_a, init_b))
3888 return false;
3890 /* Sort DR_A and DR_B by the address they access. */
3891 if (maybe_lt (init_b, init_a))
3893 std::swap (init_a, init_b);
3894 std::swap (dr_info_a, dr_info_b);
3895 std::swap (dr_a, dr_b);
3898 /* If the two accesses could be dependent within a scalar iteration,
3899 make sure that we'd retain their order. */
3900 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3901 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3902 return false;
3904 /* There is no alias if abs (DR_STEP) is greater than or equal to
3905 the bytes spanned by the combination of the two accesses. */
3906 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3907 return true;
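/* Worked example with hypothetical refs: if DR_A and DR_B share base,
   offset and step, DR_INIT (a) == 0, DR_INIT (b) == 4 and each scalar
   access is 4 bytes wide, *LOWER_BOUND_OUT becomes 4 + 4 - 0 == 8;
   whenever abs (DR_STEP) >= 8 the per-iteration accesses of the two refs
   fit inside one step without ever touching the same bytes, so no
   runtime alias check is required.  */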
3910 /* Function vect_prune_runtime_alias_test_list.
3912 Prune a list of ddrs to be tested at run-time by versioning for alias.
3913 Merge several alias checks into one if possible.
3914 Return FALSE if the resulting list of ddrs is longer than allowed by
3915 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3917 opt_result
3918 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3920 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3921 hash_set <tree_pair_hash> compared_objects;
3923 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3924 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3925 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3926 const vec<vec_object_pair> &check_unequal_addrs
3927 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3928 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3929 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3931 ddr_p ddr;
3932 unsigned int i;
3933 tree length_factor;
3935 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3937 /* Step values are irrelevant for aliasing if the number of vector
3938 iterations is equal to the number of scalar iterations (which can
3939 happen for fully-SLP loops). */
3940 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3942 if (!vf_one_p)
3944 /* Convert the checks for nonzero steps into bound tests. */
3945 tree value;
3946 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3947 vect_check_lower_bound (loop_vinfo, value, true, 1);
3950 if (may_alias_ddrs.is_empty ())
3951 return opt_result::success ();
3953 comp_alias_ddrs.create (may_alias_ddrs.length ());
3955 unsigned int loop_depth
3956 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3957 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3959 /* First, we collect all data ref pairs for aliasing checks. */
3960 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3962 poly_uint64 lower_bound;
3963 tree segment_length_a, segment_length_b;
3964 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3965 unsigned int align_a, align_b;
3967 /* Ignore the alias if the VF we chose ended up being no greater
3968 than the dependence distance. */
3969 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3970 continue;
3972 if (DDR_OBJECT_A (ddr))
3974 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3975 if (!compared_objects.add (new_pair))
3977 if (dump_enabled_p ())
3978 dump_printf_loc (MSG_NOTE, vect_location,
3979 "checking that %T and %T"
3980 " have different addresses\n",
3981 new_pair.first, new_pair.second);
3982 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3984 continue;
3987 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3988 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3990 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3991 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3993 bool preserves_scalar_order_p
3994 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3995 bool ignore_step_p
3996 = (vf_one_p
3997 && (preserves_scalar_order_p
3998 || operand_equal_p (DR_STEP (dr_info_a->dr),
3999 DR_STEP (dr_info_b->dr))));
4001 /* Skip the pair if inter-iteration dependencies are irrelevant
4002 and intra-iteration dependencies are guaranteed to be honored. */
4003 if (ignore_step_p
4004 && (preserves_scalar_order_p
4005 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4006 &lower_bound)))
4008 if (dump_enabled_p ())
4009 dump_printf_loc (MSG_NOTE, vect_location,
4010 "no need for alias check between "
4011 "%T and %T when VF is 1\n",
4012 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4013 continue;
4016 /* See whether we can handle the alias using a bounds check on
4017 the step, and whether that's likely to be the best approach.
4018 (It might not be, for example, if the minimum step is much larger
4019 than the number of bytes handled by one vector iteration.) */
4020 if (!ignore_step_p
4021 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4022 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4023 &lower_bound)
4024 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4025 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4027 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4028 if (dump_enabled_p ())
4030 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4031 "%T and %T when the step %T is outside ",
4032 DR_REF (dr_info_a->dr),
4033 DR_REF (dr_info_b->dr),
4034 DR_STEP (dr_info_a->dr));
4035 if (unsigned_p)
4036 dump_printf (MSG_NOTE, "[0");
4037 else
4039 dump_printf (MSG_NOTE, "(");
4040 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4042 dump_printf (MSG_NOTE, ", ");
4043 dump_dec (MSG_NOTE, lower_bound);
4044 dump_printf (MSG_NOTE, ")\n");
4046 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4047 unsigned_p, lower_bound);
4048 continue;
4051 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4052 if (dr_group_first_a)
4054 stmt_info_a = dr_group_first_a;
4055 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4058 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4059 if (dr_group_first_b)
4061 stmt_info_b = dr_group_first_b;
4062 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4065 if (ignore_step_p)
4067 segment_length_a = size_zero_node;
4068 segment_length_b = size_zero_node;
4070 else
4072 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4073 DR_STEP (dr_info_b->dr), 0))
4074 length_factor = scalar_loop_iters;
4075 else
4076 length_factor = size_int (vect_factor);
4077 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4078 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4080 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4081 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4082 align_a = vect_vfa_align (dr_info_a);
4083 align_b = vect_vfa_align (dr_info_b);
4085 /* See whether the alias is known at compilation time. */
4086 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4087 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4088 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4089 DR_OFFSET (dr_info_b->dr), 0)
4090 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4091 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4092 && poly_int_tree_p (segment_length_a)
4093 && poly_int_tree_p (segment_length_b))
4095 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4096 segment_length_a,
4097 segment_length_b,
4098 access_size_a,
4099 access_size_b);
4100 if (res >= 0 && dump_enabled_p ())
4102 dump_printf_loc (MSG_NOTE, vect_location,
4103 "can tell at compile time that %T and %T",
4104 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4105 if (res == 0)
4106 dump_printf (MSG_NOTE, " do not alias\n");
4107 else
4108 dump_printf (MSG_NOTE, " alias\n");
4111 if (res == 0)
4112 continue;
4114 if (res == 1)
4115 return opt_result::failure_at (stmt_info_b->stmt,
4116 "not vectorized:"
4117 " compilation time alias: %G%G",
4118 stmt_info_a->stmt,
4119 stmt_info_b->stmt);
4122 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4123 access_size_a, align_a);
4124 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4125 access_size_b, align_b);
4126 /* Canonicalize the order to be the one that's needed for accurate
4127 RAW, WAR and WAW flags, in cases where the data references are
4128 well-ordered. The order doesn't really matter otherwise,
4129 but we might as well be consistent. */
4130 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4131 std::swap (dr_a, dr_b);
4133 dr_with_seg_len_pair_t dr_with_seg_len_pair
4134 (dr_a, dr_b, (preserves_scalar_order_p
4135 ? dr_with_seg_len_pair_t::WELL_ORDERED
4136 : dr_with_seg_len_pair_t::REORDERED));
4138 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4141 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4143 unsigned int count = (comp_alias_ddrs.length ()
4144 + check_unequal_addrs.length ());
4146 if (count
4147 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4148 == VECT_COST_MODEL_VERY_CHEAP))
4149 return opt_result::failure_at
4150 (vect_location, "would need a runtime alias check\n");
4152 if (dump_enabled_p ())
4153 dump_printf_loc (MSG_NOTE, vect_location,
4154 "improved number of alias checks from %d to %d\n",
4155 may_alias_ddrs.length (), count);
4156 unsigned limit = param_vect_max_version_for_alias_checks;
4157 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4158 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4159 if (count > limit)
4160 return opt_result::failure_at
4161 (vect_location,
4162 "number of versioning for alias run-time tests exceeds %d "
4163 "(--param vect-max-version-for-alias-checks)\n", limit);
4165 return opt_result::success ();
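/* Sketch of what a surviving check amounts to at runtime (simplified;
   the actual condition is emitted later by the alias-check machinery):
   for data-refs DR_A and DR_B with start addresses ADDR_A/ADDR_B,
   segment lengths SEG_A/SEG_B and access sizes SIZE_A/SIZE_B the
   vectorized loop is guarded by roughly

     bool ok = (ADDR_A + SEG_A + SIZE_A <= ADDR_B
		|| ADDR_B + SEG_B + SIZE_B <= ADDR_A);

   so the vector code only runs when the two segments cannot overlap.  */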
4168 /* Check whether we can use an internal function for a gather load
4169 or scatter store. READ_P is true for loads and false for stores.
4170 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4171 the type of the memory elements being loaded or stored. OFFSET_TYPE
4172 is the type of the offset that is being applied to the invariant
4173 base address. SCALE is the amount by which the offset should
4174 be multiplied *after* it has been converted to address width.
4176 Return true if the function is supported, storing the function id in
4177 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
4179 bool
4180 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4181 tree vectype, tree memory_type, tree offset_type,
4182 int scale, internal_fn *ifn_out,
4183 tree *offset_vectype_out)
4185 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4186 unsigned int element_bits = vector_element_bits (vectype);
4187 if (element_bits != memory_bits)
4188 /* For now the vector elements must be the same width as the
4189 memory elements. */
4190 return false;
4192 /* Work out which function we need. */
4193 internal_fn ifn, alt_ifn, alt_ifn2;
4194 if (read_p)
4196 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4197 alt_ifn = IFN_MASK_GATHER_LOAD;
4198 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4199 use MASK_LEN_GATHER_LOAD regardless of whether len and
4200 mask are valid or not. */
4201 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4203 else
4205 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4206 alt_ifn = IFN_MASK_SCATTER_STORE;
4207 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4208 use MASK_LEN_SCATTER_STORE regardless of whether len and
4209 mask are valid or not. */
4210 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4213 for (;;)
4215 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4216 if (!offset_vectype)
4217 return false;
4219 /* Test whether the target supports this combination. */
4220 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4221 offset_vectype, scale))
4223 *ifn_out = ifn;
4224 *offset_vectype_out = offset_vectype;
4225 return true;
4227 else if (!masked_p
4228 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4229 memory_type,
4230 offset_vectype,
4231 scale))
4233 *ifn_out = alt_ifn;
4234 *offset_vectype_out = offset_vectype;
4235 return true;
4237 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4238 memory_type,
4239 offset_vectype, scale))
4241 *ifn_out = alt_ifn2;
4242 *offset_vectype_out = offset_vectype;
4243 return true;
4246 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4247 && TYPE_PRECISION (offset_type) >= element_bits)
4248 return false;
4250 offset_type = build_nonstandard_integer_type
4251 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
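/* Illustrative walk through the retry loop above, assuming a 64-bit
   target whose gather instructions only accept 64-bit offsets: starting
   from a 32-bit OFFSET_TYPE the first probe fails, the type is widened
   to a 64-bit integer type of the same signedness, the second probe
   succeeds, and *OFFSET_VECTYPE_OUT ends up describing a vector of
   64-bit offsets.  If even the pointer-sized offset type is not
   supported, the loop gives up and the function returns false.  */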
4255 /* STMT_INFO is a call to an internal gather load or scatter store function.
4256 Describe the operation in INFO. */
4258 static void
4259 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4260 gather_scatter_info *info)
4262 gcall *call = as_a <gcall *> (stmt_info->stmt);
4263 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4264 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4266 info->ifn = gimple_call_internal_fn (call);
4267 info->decl = NULL_TREE;
4268 info->base = gimple_call_arg (call, 0);
4269 info->offset = gimple_call_arg (call, 1);
4270 info->offset_dt = vect_unknown_def_type;
4271 info->offset_vectype = NULL_TREE;
4272 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4273 info->element_type = TREE_TYPE (vectype);
4274 info->memory_type = TREE_TYPE (DR_REF (dr));
4277 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4278 gather load or scatter store. Describe the operation in *INFO if so. */
4280 bool
4281 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4282 gather_scatter_info *info)
4284 HOST_WIDE_INT scale = 1;
4285 poly_int64 pbitpos, pbitsize;
4286 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4287 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4288 tree offtype = NULL_TREE;
4289 tree decl = NULL_TREE, base, off;
4290 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4291 tree memory_type = TREE_TYPE (DR_REF (dr));
4292 machine_mode pmode;
4293 int punsignedp, reversep, pvolatilep = 0;
4294 internal_fn ifn;
4295 tree offset_vectype;
4296 bool masked_p = false;
4298 /* See whether this is already a call to a gather/scatter internal function.
4299 If not, see whether it's a masked load or store. */
4300 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4301 if (call && gimple_call_internal_p (call))
4303 ifn = gimple_call_internal_fn (call);
4304 if (internal_gather_scatter_fn_p (ifn))
4306 vect_describe_gather_scatter_call (stmt_info, info);
4307 return true;
4309 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4312 /* True if we should aim to use internal functions rather than
4313 built-in functions. */
4314 bool use_ifn_p = (DR_IS_READ (dr)
4315 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4316 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4318 base = DR_REF (dr);
4319 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4320 see if we can use the def stmt of the address. */
4321 if (masked_p
4322 && TREE_CODE (base) == MEM_REF
4323 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4324 && integer_zerop (TREE_OPERAND (base, 1))
4325 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4327 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4328 if (is_gimple_assign (def_stmt)
4329 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4330 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4333 /* The gather and scatter builtins need address of the form
4334 loop_invariant + vector * {1, 2, 4, 8}
4335 or
4336 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4337 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4338 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4339 multiplications and additions in it. To get a vector, we need
4340 a single SSA_NAME that will be defined in the loop and will
4341 contain everything that is not loop invariant and that can be
4342 vectorized. The following code attempts to find such a preexisting
4343 SSA_NAME OFF and put the loop invariants into a tree BASE
4344 that can be gimplified before the loop. */
4345 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4346 &punsignedp, &reversep, &pvolatilep);
4347 if (reversep)
4348 return false;
4350 /* PR 107346. Packed structs can have fields at offsets that are not
4351 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4352 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4353 return false;
4355 /* We need to be able to form an address to the base which for example
4356 isn't possible for hard registers. */
4357 if (may_be_nonaddressable_p (base))
4358 return false;
4360 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4362 if (TREE_CODE (base) == MEM_REF)
4364 if (!integer_zerop (TREE_OPERAND (base, 1)))
4366 if (off == NULL_TREE)
4367 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4368 else
4369 off = size_binop (PLUS_EXPR, off,
4370 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4372 base = TREE_OPERAND (base, 0);
4374 else
4375 base = build_fold_addr_expr (base);
4377 if (off == NULL_TREE)
4378 off = size_zero_node;
4380 /* If base is not loop invariant, then if off is 0 we start with just
4381 the constant offset in the loop invariant BASE and continue with base
4382 as OFF, otherwise give up.
4383 We could handle that case by gimplifying the addition of base + off
4384 into some SSA_NAME and using that as off, but for now punt. */
4385 if (!expr_invariant_in_loop_p (loop, base))
4387 if (!integer_zerop (off))
4388 return false;
4389 off = base;
4390 base = size_int (pbytepos);
4392 /* Otherwise put base + constant offset into the loop invariant BASE
4393 and continue with OFF. */
4394 else
4396 base = fold_convert (sizetype, base);
4397 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4400 /* OFF at this point may be either a SSA_NAME or some tree expression
4401 from get_inner_reference. Try to peel off loop invariants from it
4402 into BASE as long as possible. */
4403 STRIP_NOPS (off);
4404 while (offtype == NULL_TREE)
4406 enum tree_code code;
4407 tree op0, op1, add = NULL_TREE;
4409 if (TREE_CODE (off) == SSA_NAME)
4411 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4413 if (expr_invariant_in_loop_p (loop, off))
4414 return false;
4416 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4417 break;
4419 op0 = gimple_assign_rhs1 (def_stmt);
4420 code = gimple_assign_rhs_code (def_stmt);
4421 op1 = gimple_assign_rhs2 (def_stmt);
4423 else
4425 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4426 return false;
4427 code = TREE_CODE (off);
4428 extract_ops_from_tree (off, &code, &op0, &op1);
4430 switch (code)
4432 case POINTER_PLUS_EXPR:
4433 case PLUS_EXPR:
4434 if (expr_invariant_in_loop_p (loop, op0))
4436 add = op0;
4437 off = op1;
4438 do_add:
4439 add = fold_convert (sizetype, add);
4440 if (scale != 1)
4441 add = size_binop (MULT_EXPR, add, size_int (scale));
4442 base = size_binop (PLUS_EXPR, base, add);
4443 continue;
4445 if (expr_invariant_in_loop_p (loop, op1))
4447 add = op1;
4448 off = op0;
4449 goto do_add;
4451 break;
4452 case MINUS_EXPR:
4453 if (expr_invariant_in_loop_p (loop, op1))
4455 add = fold_convert (sizetype, op1);
4456 add = size_binop (MINUS_EXPR, size_zero_node, add);
4457 off = op0;
4458 goto do_add;
4460 break;
4461 case MULT_EXPR:
4462 if (scale == 1 && tree_fits_shwi_p (op1))
4464 int new_scale = tree_to_shwi (op1);
4465 /* Only treat this as a scaling operation if the target
4466 supports it for at least some offset type. */
4467 if (use_ifn_p
4468 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4469 masked_p, vectype, memory_type,
4470 signed_char_type_node,
4471 new_scale, &ifn,
4472 &offset_vectype)
4473 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4474 masked_p, vectype, memory_type,
4475 unsigned_char_type_node,
4476 new_scale, &ifn,
4477 &offset_vectype))
4478 break;
4479 scale = new_scale;
4480 off = op0;
4481 continue;
4483 break;
4484 case SSA_NAME:
4485 off = op0;
4486 continue;
4487 CASE_CONVERT:
4488 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4489 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4490 break;
4492 /* Don't include the conversion if the target is happy with
4493 the current offset type. */
4494 if (use_ifn_p
4495 && TREE_CODE (off) == SSA_NAME
4496 && !POINTER_TYPE_P (TREE_TYPE (off))
4497 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4498 masked_p, vectype, memory_type,
4499 TREE_TYPE (off), scale, &ifn,
4500 &offset_vectype))
4501 break;
4503 if (TYPE_PRECISION (TREE_TYPE (op0))
4504 == TYPE_PRECISION (TREE_TYPE (off)))
4506 off = op0;
4507 continue;
4510 /* Include the conversion if it is widening and we're using
4511 the IFN path, or the target can handle the converted-from
4512 offset, or the current size is not already the same as the
4513 data vector element size. */
4514 if ((TYPE_PRECISION (TREE_TYPE (op0))
4515 < TYPE_PRECISION (TREE_TYPE (off)))
4516 && (use_ifn_p
4517 || (DR_IS_READ (dr)
4518 ? (targetm.vectorize.builtin_gather
4519 && targetm.vectorize.builtin_gather (vectype,
4520 TREE_TYPE (op0),
4521 scale))
4522 : (targetm.vectorize.builtin_scatter
4523 && targetm.vectorize.builtin_scatter (vectype,
4524 TREE_TYPE (op0),
4525 scale)))
4526 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4527 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4529 off = op0;
4530 offtype = TREE_TYPE (off);
4531 STRIP_NOPS (off);
4532 continue;
4534 break;
4535 default:
4536 break;
4538 break;
4541 /* If at the end OFF still isn't a SSA_NAME or isn't
4542 defined in the loop, punt. */
4543 if (TREE_CODE (off) != SSA_NAME
4544 || expr_invariant_in_loop_p (loop, off))
4545 return false;
4547 if (offtype == NULL_TREE)
4548 offtype = TREE_TYPE (off);
4550 if (use_ifn_p)
4552 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4553 vectype, memory_type, offtype, scale,
4554 &ifn, &offset_vectype))
4555 ifn = IFN_LAST;
4556 decl = NULL_TREE;
4558 else
4560 if (DR_IS_READ (dr))
4562 if (targetm.vectorize.builtin_gather)
4563 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4565 else
4567 if (targetm.vectorize.builtin_scatter)
4568 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4570 ifn = IFN_LAST;
4571 /* The offset vector type will be read from DECL when needed. */
4572 offset_vectype = NULL_TREE;
4575 info->ifn = ifn;
4576 info->decl = decl;
4577 info->base = base;
4578 info->offset = off;
4579 info->offset_dt = vect_unknown_def_type;
4580 info->offset_vectype = offset_vectype;
4581 info->scale = scale;
4582 info->element_type = TREE_TYPE (vectype);
4583 info->memory_type = memory_type;
4584 return true;
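/* Rough example of the decomposition performed above, for a hypothetical
   gather-style load in a vectorizable loop:

     for (int i = 0; i < n; i++)
       sum += data[idx[i]];

   the non-affine reference data[idx[i]] is split into a loop-invariant
   BASE (&data), an in-loop SSA_NAME OFF (the value loaded from idx[i])
   and SCALE == sizeof (data[0]), matching the
   loop_invariant + vector * {1, 2, 4, 8} form required by the gather
   and scatter builtins and internal functions.  */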
4587 /* Find the data references in STMT, analyze them with respect to LOOP and
4588 append them to DATAREFS. Return false if datarefs in this stmt cannot
4589 be handled. */
4591 opt_result
4592 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4593 vec<data_reference_p> *datarefs,
4594 vec<int> *dataref_groups, int group_id)
4596 /* We can ignore clobbers for dataref analysis - they are removed during
4597 loop vectorization and BB vectorization checks dependences with a
4598 stmt walk. */
4599 if (gimple_clobber_p (stmt))
4600 return opt_result::success ();
4602 if (gimple_has_volatile_ops (stmt))
4603 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4604 stmt);
4606 if (stmt_can_throw_internal (cfun, stmt))
4607 return opt_result::failure_at (stmt,
4608 "not vectorized:"
4609 " statement can throw an exception: %G",
4610 stmt);
4612 auto_vec<data_reference_p, 2> refs;
4613 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4614 if (!res)
4615 return res;
4617 if (refs.is_empty ())
4618 return opt_result::success ();
4620 if (refs.length () > 1)
4622 while (!refs.is_empty ())
4623 free_data_ref (refs.pop ());
4624 return opt_result::failure_at (stmt,
4625 "not vectorized: more than one "
4626 "data ref in stmt: %G", stmt);
4629 data_reference_p dr = refs.pop ();
4630 if (gcall *call = dyn_cast <gcall *> (stmt))
4631 if (!gimple_call_internal_p (call)
4632 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4633 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4635 free_data_ref (dr);
4636 return opt_result::failure_at (stmt,
4637 "not vectorized: dr in a call %G", stmt);
4640 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4641 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4643 free_data_ref (dr);
4644 return opt_result::failure_at (stmt,
4645 "not vectorized:"
4646 " statement is an unsupported"
4647 " bitfield access %G", stmt);
4650 if (DR_BASE_ADDRESS (dr)
4651 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4653 free_data_ref (dr);
4654 return opt_result::failure_at (stmt,
4655 "not vectorized:"
4656 " base addr of dr is a constant\n");
4659 /* Check whether this may be a SIMD lane access and adjust the
4660 DR to make it easier for us to handle it. */
4661 if (loop
4662 && loop->simduid
4663 && (!DR_BASE_ADDRESS (dr)
4664 || !DR_OFFSET (dr)
4665 || !DR_INIT (dr)
4666 || !DR_STEP (dr)))
4668 struct data_reference *newdr
4669 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4670 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4671 if (DR_BASE_ADDRESS (newdr)
4672 && DR_OFFSET (newdr)
4673 && DR_INIT (newdr)
4674 && DR_STEP (newdr)
4675 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4676 && integer_zerop (DR_STEP (newdr)))
4678 tree base_address = DR_BASE_ADDRESS (newdr);
4679 tree off = DR_OFFSET (newdr);
4680 tree step = ssize_int (1);
4681 if (integer_zerop (off)
4682 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4684 off = TREE_OPERAND (base_address, 1);
4685 base_address = TREE_OPERAND (base_address, 0);
4687 STRIP_NOPS (off);
4688 if (TREE_CODE (off) == MULT_EXPR
4689 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4691 step = TREE_OPERAND (off, 1);
4692 off = TREE_OPERAND (off, 0);
4693 STRIP_NOPS (off);
4695 if (CONVERT_EXPR_P (off)
4696 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4697 < TYPE_PRECISION (TREE_TYPE (off))))
4698 off = TREE_OPERAND (off, 0);
4699 if (TREE_CODE (off) == SSA_NAME)
4701 gimple *def = SSA_NAME_DEF_STMT (off);
4702 /* Look through widening conversion. */
4703 if (is_gimple_assign (def)
4704 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4706 tree rhs1 = gimple_assign_rhs1 (def);
4707 if (TREE_CODE (rhs1) == SSA_NAME
4708 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4709 && (TYPE_PRECISION (TREE_TYPE (off))
4710 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4711 def = SSA_NAME_DEF_STMT (rhs1);
4713 if (is_gimple_call (def)
4714 && gimple_call_internal_p (def)
4715 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4717 tree arg = gimple_call_arg (def, 0);
4718 tree reft = TREE_TYPE (DR_REF (newdr));
4719 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4720 arg = SSA_NAME_VAR (arg);
4721 if (arg == loop->simduid
4722 /* For now. */
4723 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4725 DR_BASE_ADDRESS (newdr) = base_address;
4726 DR_OFFSET (newdr) = ssize_int (0);
4727 DR_STEP (newdr) = step;
4728 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4729 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4730 /* Mark as simd-lane access. */
4731 tree arg2 = gimple_call_arg (def, 1);
4732 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4733 free_data_ref (dr);
4734 datarefs->safe_push (newdr);
4735 if (dataref_groups)
4736 dataref_groups->safe_push (group_id);
4737 return opt_result::success ();
4742 free_data_ref (newdr);
4745 datarefs->safe_push (dr);
4746 if (dataref_groups)
4747 dataref_groups->safe_push (group_id);
4748 return opt_result::success ();
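/* Roughly, the SIMD lane handling above targets references such as the
   privatized variables that OpenMP SIMD lowering places into per-lane
   arrays indexed by the result of .GOMP_SIMD_LANE (simduid), e.g. a ref
   of the form D.simdarray[_lane] (hypothetical names).  When the pattern
   is recognized, the data-ref is rewritten to use the array base, a zero
   offset and a step equal to the element size, and is marked as a SIMD
   lane access so later analysis treats it like an ordinary access.  */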
4751 /* Function vect_analyze_data_refs.
4753 Find all the data references in the loop or basic block.
4755 The general structure of the analysis of data refs in the vectorizer is as
4756 follows:
4757 1- vect_analyze_data_refs(loop/bb): call
4758 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4759 in the loop/bb and their dependences.
4760 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4761 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4762 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4766 opt_result
4767 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4769 class loop *loop = NULL;
4770 unsigned int i;
4771 struct data_reference *dr;
4772 tree scalar_type;
4774 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4776 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4777 loop = LOOP_VINFO_LOOP (loop_vinfo);
4779 /* Go through the data-refs, check that the analysis succeeded. Update
4780 pointer from stmt_vec_info struct to DR and vectype. */
4782 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4783 FOR_EACH_VEC_ELT (datarefs, i, dr)
4785 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4786 poly_uint64 vf;
4788 gcc_assert (DR_REF (dr));
4789 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4790 gcc_assert (!stmt_info->dr_aux.dr);
4791 stmt_info->dr_aux.dr = dr;
4792 stmt_info->dr_aux.stmt = stmt_info;
4794 /* Check that analysis of the data-ref succeeded. */
4795 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4796 || !DR_STEP (dr))
4798 bool maybe_gather
4799 = DR_IS_READ (dr)
4800 && !TREE_THIS_VOLATILE (DR_REF (dr));
4801 bool maybe_scatter
4802 = DR_IS_WRITE (dr)
4803 && !TREE_THIS_VOLATILE (DR_REF (dr));
4805 /* If the target supports vector gather loads or scatter stores,
4806 see if they can be used. */
4807 if (is_a <loop_vec_info> (vinfo)
4808 && !nested_in_vect_loop_p (loop, stmt_info))
4810 if (maybe_gather || maybe_scatter)
4812 if (maybe_gather)
4813 gatherscatter = GATHER;
4814 else
4815 gatherscatter = SCATTER;
4819 if (gatherscatter == SG_NONE)
4821 if (dump_enabled_p ())
4822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4823 "not vectorized: data ref analysis "
4824 "failed %G", stmt_info->stmt);
4825 if (is_a <bb_vec_info> (vinfo))
4827 /* In BB vectorization the ref can still participate
4828 in dependence analysis, we just can't vectorize it. */
4829 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4830 continue;
4832 return opt_result::failure_at (stmt_info->stmt,
4833 "not vectorized:"
4834 " data ref analysis failed: %G",
4835 stmt_info->stmt);
4839 /* See if this was detected as SIMD lane access. */
4840 if (dr->aux == (void *)-1
4841 || dr->aux == (void *)-2
4842 || dr->aux == (void *)-3
4843 || dr->aux == (void *)-4)
4845 if (nested_in_vect_loop_p (loop, stmt_info))
4846 return opt_result::failure_at (stmt_info->stmt,
4847 "not vectorized:"
4848 " data ref analysis failed: %G",
4849 stmt_info->stmt);
4850 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4851 = -(uintptr_t) dr->aux;
4854 tree base = get_base_address (DR_REF (dr));
4855 if (base && VAR_P (base) && DECL_NONALIASED (base))
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4859 "not vectorized: base object not addressable "
4860 "for stmt: %G", stmt_info->stmt);
4861 if (is_a <bb_vec_info> (vinfo))
4863 /* In BB vectorization the ref can still participate
4864 in dependence analysis, we just can't vectorize it. */
4865 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4866 continue;
4868 return opt_result::failure_at (stmt_info->stmt,
4869 "not vectorized: base object not"
4870 " addressable for stmt: %G",
4871 stmt_info->stmt);
4874 if (is_a <loop_vec_info> (vinfo)
4875 && DR_STEP (dr)
4876 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4878 if (nested_in_vect_loop_p (loop, stmt_info))
4879 return opt_result::failure_at (stmt_info->stmt,
4880 "not vectorized: "
4881 "not suitable for strided load %G",
4882 stmt_info->stmt);
4883 STMT_VINFO_STRIDED_P (stmt_info) = true;
4886 /* Update DR field in stmt_vec_info struct. */
4888 /* If the dataref is in an inner-loop of the loop that is considered
4889 for vectorization, we also want to analyze the access relative to
4890 the outer-loop (DR contains information only relative to the
4891 inner-most enclosing loop). We do that by building a reference to the
4892 first location accessed by the inner-loop, and analyze it relative to
4893 the outer-loop. */
4894 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4896 /* Build a reference to the first location accessed by the
4897 inner loop: *(BASE + INIT + OFFSET). By construction,
4898 this address must be invariant in the inner loop, so we
4899 can consider it as being used in the outer loop. */
4900 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4901 tree offset = unshare_expr (DR_OFFSET (dr));
4902 tree init = unshare_expr (DR_INIT (dr));
4903 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4904 init, offset);
4905 tree init_addr = fold_build_pointer_plus (base, init_offset);
4906 tree init_ref = build_fold_indirect_ref (init_addr);
4908 if (dump_enabled_p ())
4909 dump_printf_loc (MSG_NOTE, vect_location,
4910 "analyze in outer loop: %T\n", init_ref);
4912 opt_result res
4913 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4914 init_ref, loop, stmt_info->stmt);
4915 if (!res)
4916 /* dr_analyze_innermost already explained the failure. */
4917 return res;
4919 if (dump_enabled_p ())
4920 dump_printf_loc (MSG_NOTE, vect_location,
4921 "\touter base_address: %T\n"
4922 "\touter offset from base address: %T\n"
4923 "\touter constant offset from base address: %T\n"
4924 "\touter step: %T\n"
4925 "\touter base alignment: %d\n\n"
4926 "\touter base misalignment: %d\n"
4927 "\touter offset alignment: %d\n"
4928 "\touter step alignment: %d\n",
4929 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4930 STMT_VINFO_DR_OFFSET (stmt_info),
4931 STMT_VINFO_DR_INIT (stmt_info),
4932 STMT_VINFO_DR_STEP (stmt_info),
4933 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4934 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4935 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4936 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4939 /* Set vectype for STMT. */
4940 scalar_type = TREE_TYPE (DR_REF (dr));
4941 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4942 if (!vectype)
4944 if (dump_enabled_p ())
4946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4947 "not vectorized: no vectype for stmt: %G",
4948 stmt_info->stmt);
4949 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4951 scalar_type);
4952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4955 if (is_a <bb_vec_info> (vinfo))
4957 /* No vector type is fine, the ref can still participate
4958 in dependence analysis, we just can't vectorize it. */
4959 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4960 continue;
4962 if (fatal)
4963 *fatal = false;
4964 return opt_result::failure_at (stmt_info->stmt,
4965 "not vectorized:"
4966 " no vectype for stmt: %G"
4967 " scalar_type: %T\n",
4968 stmt_info->stmt, scalar_type);
4970 else
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_NOTE, vect_location,
4974 "got vectype for stmt: %G%T\n",
4975 stmt_info->stmt, vectype);
4978 /* Adjust the minimal vectorization factor according to the
4979 vector type. */
4980 vf = TYPE_VECTOR_SUBPARTS (vectype);
4981 *min_vf = upper_bound (*min_vf, vf);
4983 /* Leave the BB vectorizer to pick the vector type later, based on
4984 the final dataref group size and SLP node size. */
4985 if (is_a <loop_vec_info> (vinfo))
4986 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4988 if (gatherscatter != SG_NONE)
4990 gather_scatter_info gs_info;
4991 if (!vect_check_gather_scatter (stmt_info,
4992 as_a <loop_vec_info> (vinfo),
4993 &gs_info)
4994 || !get_vectype_for_scalar_type (vinfo,
4995 TREE_TYPE (gs_info.offset)))
4997 if (fatal)
4998 *fatal = false;
4999 return opt_result::failure_at
5000 (stmt_info->stmt,
5001 (gatherscatter == GATHER)
5002 ? "not vectorized: not suitable for gather load %G"
5003 : "not vectorized: not suitable for scatter store %G",
5004 stmt_info->stmt);
5006 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5010 /* We used to stop processing and prune the list here. Verify we no
5011 longer need to. */
5012 gcc_assert (i == datarefs.length ());
5014 return opt_result::success ();
5018 /* Function vect_get_new_vect_var.
5020 Returns a name for a new variable. The current naming scheme uses a
5021 prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
5022 for vectorizer-generated variables, and appends NAME to that prefix if
5023 provided. */
5025 tree
5026 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5028 const char *prefix;
5029 tree new_vect_var;
5031 switch (var_kind)
5033 case vect_simple_var:
5034 prefix = "vect";
5035 break;
5036 case vect_scalar_var:
5037 prefix = "stmp";
5038 break;
5039 case vect_mask_var:
5040 prefix = "mask";
5041 break;
5042 case vect_pointer_var:
5043 prefix = "vectp";
5044 break;
5045 default:
5046 gcc_unreachable ();
5049 if (name)
5051 char* tmp = concat (prefix, "_", name, NULL);
5052 new_vect_var = create_tmp_reg (type, tmp);
5053 free (tmp);
5055 else
5056 new_vect_var = create_tmp_reg (type, prefix);
5058 return new_vect_var;
5061 /* Like vect_get_new_vect_var but return an SSA name. */
5063 tree
5064 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5066 const char *prefix;
5067 tree new_vect_var;
5069 switch (var_kind)
5071 case vect_simple_var:
5072 prefix = "vect";
5073 break;
5074 case vect_scalar_var:
5075 prefix = "stmp";
5076 break;
5077 case vect_pointer_var:
5078 prefix = "vectp";
5079 break;
5080 default:
5081 gcc_unreachable ();
5084 if (name)
5086 char* tmp = concat (prefix, "_", name, NULL);
5087 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5088 free (tmp);
5090 else
5091 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5093 return new_vect_var;
5096 /* Duplicate points-to info on NAME from DR_INFO. */
5098 static void
5099 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5101 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5102 /* DR_PTR_INFO is for a base SSA name, not including constant or
5103 variable offsets in the ref so its alignment info does not apply. */
5104 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5107 /* Function vect_create_addr_base_for_vector_ref.
5109 Create an expression that computes the address of the first memory location
5110 that will be accessed for a data reference.
5112 Input:
5113 STMT_INFO: The statement containing the data reference.
5114 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5115 OFFSET: Optional. If supplied, it is added to the initial address.
5116 LOOP: Specify relative to which loop-nest should the address be computed.
5117 For example, when the dataref is in an inner-loop nested in an
5118 outer-loop that is now being vectorized, LOOP can be either the
5119 outer-loop, or the inner-loop. The first memory location accessed
5120 by the following dataref ('in' points to short):
5122 for (i=0; i<N; i++)
5123 for (j=0; j<M; j++)
5124 s += in[i+j]
5126 is as follows:
5127 if LOOP=i_loop: &in (relative to i_loop)
5128 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5130 Output:
5131 1. Return an SSA_NAME whose value is the address of the memory location of
5132 the first vector of the data reference.
5133 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5134 these statement(s) which define the returned SSA_NAME.
5136 FORNOW: We are only handling array accesses with step 1. */
5138 tree
5139 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5140 gimple_seq *new_stmt_list,
5141 tree offset)
5143 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5144 struct data_reference *dr = dr_info->dr;
5145 const char *base_name;
5146 tree addr_base;
5147 tree dest;
5148 gimple_seq seq = NULL;
5149 tree vect_ptr_type;
5150 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5151 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5153 tree data_ref_base = unshare_expr (drb->base_address);
5154 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5155 tree init = unshare_expr (drb->init);
5157 if (loop_vinfo)
5158 base_name = get_name (data_ref_base);
5159 else
5161 base_offset = ssize_int (0);
5162 init = ssize_int (0);
5163 base_name = get_name (DR_REF (dr));
5166 /* Create base_offset */
5167 base_offset = size_binop (PLUS_EXPR,
5168 fold_convert (sizetype, base_offset),
5169 fold_convert (sizetype, init));
5171 if (offset)
5173 offset = fold_convert (sizetype, offset);
5174 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5175 base_offset, offset);
5178 /* base + base_offset */
5179 if (loop_vinfo)
5180 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5181 else
5182 addr_base = build1 (ADDR_EXPR,
5183 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5184 /* Strip zero offset components since we don't need
5185 them and they can confuse late diagnostics if
5186 we CSE them wrongly. See PR106904 for example. */
5187 unshare_expr (strip_zero_offset_components
5188 (DR_REF (dr))));
5190 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5191 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5192 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5193 gimple_seq_add_seq (new_stmt_list, seq);
5195 if (DR_PTR_INFO (dr)
5196 && TREE_CODE (addr_base) == SSA_NAME
5197 /* We should only duplicate pointer info to newly created SSA names. */
5198 && SSA_NAME_VAR (addr_base) == dest)
5200 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5201 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5204 if (dump_enabled_p ())
5205 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5207 return addr_base;
5211 /* Function vect_create_data_ref_ptr.
5213 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5214 location accessed in the loop by STMT_INFO, along with the def-use update
5215 chain to appropriately advance the pointer through the loop iterations.
5216 Also set aliasing information for the pointer. This pointer is used by
5217 the callers to this function to create a memory reference expression for
5218 vector load/store access.
5220 Input:
5221 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5222 GIMPLE_ASSIGN <name, data-ref> or
5223 GIMPLE_ASSIGN <data-ref, name>.
5224 2. AGGR_TYPE: the type of the reference, which should be either a vector
5225 or an array.
5226 3. AT_LOOP: the loop where the vector memref is to be created.
5227 4. OFFSET (optional): a byte offset to be added to the initial address
5228 accessed by the data-ref in STMT_INFO.
5229 5. BSI: location where the new stmts are to be placed if there is no loop
5230 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5231 pointing to the initial address.
5232 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5233 to the IV during each iteration of the loop. NULL says to move
5234 by one copy of AGGR_TYPE up or down, depending on the step of the
5235 data reference.
5237 Output:
5238 1. Declare a new ptr to vector_type, and have it point to the base of the
5239 data reference (the initial address accessed by the data reference).
5240 For example, for vector of type V8HI, the following code is generated:
5242 v8hi *ap;
5243 ap = (v8hi *)initial_address;
5245 if OFFSET is not supplied:
5246 initial_address = &a[init];
5247 if OFFSET is supplied:
5248 initial_address = &a[init] + OFFSET;
5252 Return the initial_address in INITIAL_ADDRESS.
5254 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5255 update the pointer in each iteration of the loop.
5257 Return the increment stmt that updates the pointer in PTR_INCR.
5259 3. Return the pointer. */
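/* A sketch of a typical call (hypothetical caller code for illustration;
   the real callers are the load/store vectorization routines in
   tree-vect-stmts.cc):

     tree dataref_ptr
       = vect_create_data_ref_ptr (vinfo, stmt_info, vectype, at_loop,
                                   NULL_TREE, &init_addr, gsi, &ptr_incr,
                                   false, NULL_TREE);

   The caller then wraps DATAREF_PTR in a MEM_REF for the vector access and
   advances it between vector copies with bump_vector_ptr.  */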
5261 tree
5262 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5263 tree aggr_type, class loop *at_loop, tree offset,
5264 tree *initial_address, gimple_stmt_iterator *gsi,
5265 gimple **ptr_incr, bool only_init,
5266 tree iv_step)
5268 const char *base_name;
5269 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5270 class loop *loop = NULL;
5271 bool nested_in_vect_loop = false;
5272 class loop *containing_loop = NULL;
5273 tree aggr_ptr_type;
5274 tree aggr_ptr;
5275 tree new_temp;
5276 gimple_seq new_stmt_list = NULL;
5277 edge pe = NULL;
5278 basic_block new_bb;
5279 tree aggr_ptr_init;
5280 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5281 struct data_reference *dr = dr_info->dr;
5282 tree aptr;
5283 gimple_stmt_iterator incr_gsi;
5284 bool insert_after;
5285 tree indx_before_incr, indx_after_incr;
5286 gimple *incr;
5287 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5289 gcc_assert (iv_step != NULL_TREE
5290 || TREE_CODE (aggr_type) == ARRAY_TYPE
5291 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5293 if (loop_vinfo)
5295 loop = LOOP_VINFO_LOOP (loop_vinfo);
5296 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5297 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5298 pe = loop_preheader_edge (loop);
5300 else
5302 gcc_assert (bb_vinfo);
5303 only_init = true;
5304 *ptr_incr = NULL;
5307 /* Create an expression for the first address accessed by this load
5308 in LOOP. */
5309 base_name = get_name (DR_BASE_ADDRESS (dr));
5311 if (dump_enabled_p ())
5313 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5314 dump_printf_loc (MSG_NOTE, vect_location,
5315 "create %s-pointer variable to type: %T",
5316 get_tree_code_name (TREE_CODE (aggr_type)),
5317 aggr_type);
5318 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5319 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5320 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5321 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5322 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5323 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5324 else
5325 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5326 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5329 /* (1) Create the new aggregate-pointer variable.
5330 Vector and array types inherit the alias set of their component
5331 type by default so we need to use a ref-all pointer if the data
5332 reference does not conflict with the created aggregated data
5333 reference because it is not addressable. */
5334 bool need_ref_all = false;
5335 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5336 get_alias_set (DR_REF (dr))))
5337 need_ref_all = true;
5338 /* Likewise for any of the data references in the stmt group. */
5339 else if (DR_GROUP_SIZE (stmt_info) > 1)
5341 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5344 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5345 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5346 get_alias_set (DR_REF (sdr))))
5348 need_ref_all = true;
5349 break;
5351 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5353 while (sinfo);
5355 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5356 need_ref_all);
5357 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5360 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5361 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5362 def-use update cycles for the pointer: one relative to the outer-loop
5363 (LOOP), which is what steps (2) and (3) below do. The other is relative
5364 to the inner-loop (which is the inner-most loop containing the dataref),
5365 and this is done by step (4) below.
5367 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5368 inner-most loop, and so steps (2),(3) work the same, and step (4) is
5369 redundant. Steps (2),(3) create the following:
5371 vp0 = &base_addr;
5372 LOOP: vp1 = phi(vp0,vp2)
5375 vp2 = vp1 + step
5376 goto LOOP
5378 If there is an inner-loop nested in loop, then step (4) will also be
5379 applied, and an additional update in the inner-loop will be created:
5381 vp0 = &base_addr;
5382 LOOP: vp1 = phi(vp0,vp2)
5384 inner: vp3 = phi(vp1,vp4)
5385 vp4 = vp3 + inner_step
5386 if () goto inner
5388 vp2 = vp1 + step
5389 if () goto LOOP */
5391 /* (2) Calculate the initial address of the aggregate-pointer, and set
5392 the aggregate-pointer to point to it before the loop. */
5394 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5396 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5397 stmt_info, &new_stmt_list,
5398 offset);
5399 if (new_stmt_list)
5401 if (pe)
5403 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5404 gcc_assert (!new_bb);
5406 else
5407 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5410 *initial_address = new_temp;
5411 aggr_ptr_init = new_temp;
5413 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5414 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5415 inner-loop nested in LOOP (during outer-loop vectorization). */
5417 /* No update in loop is required. */
5418 if (only_init && (!loop_vinfo || at_loop == loop))
5419 aptr = aggr_ptr_init;
5420 else
5422 /* Accesses to invariant addresses should be handled specially
5423 by the caller. */
5424 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5425 gcc_assert (!integer_zerop (step));
5427 if (iv_step == NULL_TREE)
5429 /* The step of the aggregate pointer is the type size,
5430 negated for downward accesses. */
5431 iv_step = TYPE_SIZE_UNIT (aggr_type);
5432 if (tree_int_cst_sgn (step) == -1)
5433 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5436 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5438 create_iv (aggr_ptr_init, PLUS_EXPR,
5439 fold_convert (aggr_ptr_type, iv_step),
5440 aggr_ptr, loop, &incr_gsi, insert_after,
5441 &indx_before_incr, &indx_after_incr);
5442 incr = gsi_stmt (incr_gsi);
5444 /* Copy the points-to information if it exists. */
5445 if (DR_PTR_INFO (dr))
5447 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5448 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5450 if (ptr_incr)
5451 *ptr_incr = incr;
5453 aptr = indx_before_incr;
5456 if (!nested_in_vect_loop || only_init)
5457 return aptr;
5460 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5461 nested in LOOP, if it exists. */
5463 gcc_assert (nested_in_vect_loop);
5464 if (!only_init)
5466 standard_iv_increment_position (containing_loop, &incr_gsi,
5467 &insert_after);
5468 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5469 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5470 &indx_before_incr, &indx_after_incr);
5471 incr = gsi_stmt (incr_gsi);
5473 /* Copy the points-to information if it exists. */
5474 if (DR_PTR_INFO (dr))
5476 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5477 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5479 if (ptr_incr)
5480 *ptr_incr = incr;
5482 return indx_before_incr;
5484 else
5485 gcc_unreachable ();
5489 /* Function bump_vector_ptr
5491 Increment a pointer (to a vector type) by vector-size. If requested,
5492 i.e. if PTR_INCR is given, then also connect the new increment stmt
5493 to the existing def-use update-chain of the pointer, by modifying
5494 the PTR_INCR as illustrated below:
5496 The pointer def-use update-chain before this function:
5497 DATAREF_PTR = phi (p_0, p_2)
5498 ....
5499 PTR_INCR: p_2 = DATAREF_PTR + step
5501 The pointer def-use update-chain after this function:
5502 DATAREF_PTR = phi (p_0, p_2)
5503 ....
5504 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5505 ....
5506 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5508 Input:
5509 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5510 in the loop.
5511 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5512 the loop. The increment amount across iterations is expected
5513 to be vector_size.
5514 BSI - location where the new update stmt is to be placed.
5515 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5516 BUMP - optional. The offset by which to bump the pointer. If not given,
5517 the offset is assumed to be vector_size.
5519 Output: Return NEW_DATAREF_PTR as illustrated above.
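/* For example (illustrative values only): for a 16-byte vector type and no
   explicit BUMP the update amount is TYPE_SIZE_UNIT (vectype) == 16, so the
   statement emitted at GSI is simply

     NEW_DATAREF_PTR = DATAREF_PTR + 16;

   with the data reference's points-to information copied to the new name
   (and its alignment info dropped).  */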
5523 tree
5524 bump_vector_ptr (vec_info *vinfo,
5525 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5526 stmt_vec_info stmt_info, tree bump)
5528 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5529 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5530 tree update = TYPE_SIZE_UNIT (vectype);
5531 gimple *incr_stmt;
5532 ssa_op_iter iter;
5533 use_operand_p use_p;
5534 tree new_dataref_ptr;
5536 if (bump)
5537 update = bump;
5539 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5540 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5541 else if (is_gimple_min_invariant (dataref_ptr))
5542 /* When possible avoid emitting a separate increment stmt that would
5543 force the addressed object to become addressable. */
5544 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5545 fold_build2 (MEM_REF,
5546 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5547 dataref_ptr,
5548 fold_convert (ptr_type_node, update)));
5549 else
5550 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5551 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5552 dataref_ptr, update);
5553 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5554 /* Fold the increment, avoiding excessive use-def chains that would
5555 lead to compile-time issues for passes run before the next
5556 forwprop pass, which would otherwise do this folding as well. */
5557 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5558 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5560 incr_stmt = gsi_stmt (fold_gsi);
5561 update_stmt (incr_stmt);
5564 /* Copy the points-to information if it exists. */
5565 if (DR_PTR_INFO (dr))
5567 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5568 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5571 if (!ptr_incr)
5572 return new_dataref_ptr;
5574 /* Update the vector-pointer's cross-iteration increment. */
5575 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5577 tree use = USE_FROM_PTR (use_p);
5579 if (use == dataref_ptr)
5580 SET_USE (use_p, new_dataref_ptr);
5581 else
5582 gcc_assert (operand_equal_p (use, update, 0));
5585 return new_dataref_ptr;
5589 /* Copy memory reference info such as base/clique from the SRC reference
5590 to the DEST MEM_REF. */
5592 void
5593 vect_copy_ref_info (tree dest, tree src)
5595 if (TREE_CODE (dest) != MEM_REF)
5596 return;
5598 tree src_base = src;
5599 while (handled_component_p (src_base))
5600 src_base = TREE_OPERAND (src_base, 0);
5601 if (TREE_CODE (src_base) != MEM_REF
5602 && TREE_CODE (src_base) != TARGET_MEM_REF)
5603 return;
5605 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5606 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5610 /* Function vect_create_destination_var.
5612 Create a new temporary of type VECTYPE. */
5614 tree
5615 vect_create_destination_var (tree scalar_dest, tree vectype)
5617 tree vec_dest;
5618 const char *name;
5619 char *new_name;
5620 tree type;
5621 enum vect_var_kind kind;
5623 kind = vectype
5624 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5625 ? vect_mask_var
5626 : vect_simple_var
5627 : vect_scalar_var;
5628 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5630 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5632 name = get_name (scalar_dest);
5633 if (name)
5634 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5635 else
5636 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5637 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5638 free (new_name);
5640 return vec_dest;
5643 /* Function vect_grouped_store_supported.
5645 Returns TRUE if interleave high and interleave low permutations
5646 are supported, and FALSE otherwise. */
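/* For example, for the power-of-two case with an 8-element vector mode the
   two permutations checked below correspond to the masks
     interleave high: { 0 8 1 9 2 10 3 11 }
     interleave low:  { 4 12 5 13 6 14 7 15 }
   (shown fully expanded; the code builds them in the compressed two-pattern
   encoding so that variable-length vectors are handled as well).  */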
5648 bool
5649 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5651 machine_mode mode = TYPE_MODE (vectype);
5653 /* vect_permute_store_chain requires the group size to be equal to 3 or
5654 be a power of two. */
5655 if (count != 3 && exact_log2 (count) == -1)
5657 if (dump_enabled_p ())
5658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5659 "the size of the group of accesses"
5660 " is not a power of 2 or not eqaul to 3\n");
5661 return false;
5664 /* Check that the permutation is supported. */
5665 if (VECTOR_MODE_P (mode))
5667 unsigned int i;
5668 if (count == 3)
5670 unsigned int j0 = 0, j1 = 0, j2 = 0;
5671 unsigned int i, j;
5673 unsigned int nelt;
5674 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5676 if (dump_enabled_p ())
5677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5678 "cannot handle groups of 3 stores for"
5679 " variable-length vectors\n");
5680 return false;
5683 vec_perm_builder sel (nelt, nelt, 1);
5684 sel.quick_grow (nelt);
5685 vec_perm_indices indices;
5686 for (j = 0; j < 3; j++)
5688 int nelt0 = ((3 - j) * nelt) % 3;
5689 int nelt1 = ((3 - j) * nelt + 1) % 3;
5690 int nelt2 = ((3 - j) * nelt + 2) % 3;
5691 for (i = 0; i < nelt; i++)
5693 if (3 * i + nelt0 < nelt)
5694 sel[3 * i + nelt0] = j0++;
5695 if (3 * i + nelt1 < nelt)
5696 sel[3 * i + nelt1] = nelt + j1++;
5697 if (3 * i + nelt2 < nelt)
5698 sel[3 * i + nelt2] = 0;
5700 indices.new_vector (sel, 2, nelt);
5701 if (!can_vec_perm_const_p (mode, mode, indices))
5703 if (dump_enabled_p ())
5704 dump_printf (MSG_MISSED_OPTIMIZATION,
5705 "permutation op not supported by target.\n");
5706 return false;
5709 for (i = 0; i < nelt; i++)
5711 if (3 * i + nelt0 < nelt)
5712 sel[3 * i + nelt0] = 3 * i + nelt0;
5713 if (3 * i + nelt1 < nelt)
5714 sel[3 * i + nelt1] = 3 * i + nelt1;
5715 if (3 * i + nelt2 < nelt)
5716 sel[3 * i + nelt2] = nelt + j2++;
5718 indices.new_vector (sel, 2, nelt);
5719 if (!can_vec_perm_const_p (mode, mode, indices))
5721 if (dump_enabled_p ())
5722 dump_printf (MSG_MISSED_OPTIMIZATION,
5723 "permutation op not supported by target.\n");
5724 return false;
5727 return true;
5729 else
5731 /* If length is not equal to 3 then only power of 2 is supported. */
5732 gcc_assert (pow2p_hwi (count));
5733 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5735 /* The encoding has 2 interleaved stepped patterns. */
5736 if (!multiple_p (nelt, 2))
5737 return false;
5738 vec_perm_builder sel (nelt, 2, 3);
5739 sel.quick_grow (6);
5740 for (i = 0; i < 3; i++)
5742 sel[i * 2] = i;
5743 sel[i * 2 + 1] = i + nelt;
5745 vec_perm_indices indices (sel, 2, nelt);
5746 if (can_vec_perm_const_p (mode, mode, indices))
5748 for (i = 0; i < 6; i++)
5749 sel[i] += exact_div (nelt, 2);
5750 indices.new_vector (sel, 2, nelt);
5751 if (can_vec_perm_const_p (mode, mode, indices))
5752 return true;
5757 if (dump_enabled_p ())
5758 dump_printf (MSG_MISSED_OPTIMIZATION,
5759 "permutation op not supported by target.\n");
5760 return false;
5763 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5764 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5766 internal_fn
5767 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5768 bool masked_p)
5770 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5771 vec_mask_len_store_lanes_optab, vectype,
5772 count))
5773 return IFN_MASK_LEN_STORE_LANES;
5774 else if (masked_p)
5776 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5777 vec_mask_store_lanes_optab, vectype,
5778 count))
5779 return IFN_MASK_STORE_LANES;
5781 else
5783 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5784 vec_store_lanes_optab, vectype, count))
5785 return IFN_STORE_LANES;
5787 return IFN_LAST;
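/* A sketch of typical use (hypothetical caller code, for illustration):

     internal_fn lanes_ifn
       = vect_store_lanes_supported (vectype, group_size, masked_p);
     if (lanes_ifn != IFN_LAST)
       ... emit an IFN_{,MASK_,MASK_LEN_}STORE_LANES call ...
     else
       ... fall back to vect_permute_store_chain and contiguous stores ...  */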
5791 /* Function vect_permute_store_chain.
5793 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5794 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5795 the data correctly for the stores. Return the final references for stores
5796 in RESULT_CHAIN.
5798 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5799 The input is 4 vectors each containing 8 elements. We assign a number to
5800 each element, the input sequence is:
5802 1st vec: 0 1 2 3 4 5 6 7
5803 2nd vec: 8 9 10 11 12 13 14 15
5804 3rd vec: 16 17 18 19 20 21 22 23
5805 4th vec: 24 25 26 27 28 29 30 31
5807 The output sequence should be:
5809 1st vec: 0 8 16 24 1 9 17 25
5810 2nd vec: 2 10 18 26 3 11 19 27
5811 3rd vec: 4 12 20 28 5 13 21 29
5812 4th vec: 6 14 22 30 7 15 23 31
5814 i.e., we interleave the contents of the four vectors in their order.
5816 We use interleave_high/low instructions to create such output. The input of
5817 each interleave_high/low operation is two vectors:
5818 1st vec 2nd vec
5819 0 1 2 3 4 5 6 7
5820 the even elements of the result vector are obtained left-to-right from the
5821 high/low elements of the first vector. The odd elements of the result are
5822 obtained left-to-right from the high/low elements of the second vector.
5823 The output of interleave_high will be: 0 4 1 5
5824 and of interleave_low: 2 6 3 7
5827 The permutation is done in log LENGTH stages. In each stage interleave_high
5828 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5829 where the first argument is taken from the first half of DR_CHAIN and the
5830 second argument from its second half.
5831 In our example,
5833 I1: interleave_high (1st vec, 3rd vec)
5834 I2: interleave_low (1st vec, 3rd vec)
5835 I3: interleave_high (2nd vec, 4th vec)
5836 I4: interleave_low (2nd vec, 4th vec)
5838 The output for the first stage is:
5840 I1: 0 16 1 17 2 18 3 19
5841 I2: 4 20 5 21 6 22 7 23
5842 I3: 8 24 9 25 10 26 11 27
5843 I4: 12 28 13 29 14 30 15 31
5845 The output of the second stage, i.e. the final result is:
5847 I1: 0 8 16 24 1 9 17 25
5848 I2: 2 10 18 26 3 11 19 27
5849 I3: 4 12 20 28 5 13 21 29
5850 I4: 6 14 22 30 7 15 23 31. */
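/* Continuing the example above (for illustration only): with V8HI vectors
   the power-of-two path uses the masks
     perm_mask_high = { 0 8 1 9 2 10 3 11 }
     perm_mask_low  = { 4 12 5 13 6 14 7 15 }
   and runs log2 (LENGTH) == 2 stages of VEC_PERM_EXPRs, producing the
   I1..I4 vectors shown for each stage above.  */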
5852 void
5853 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5854 unsigned int length,
5855 stmt_vec_info stmt_info,
5856 gimple_stmt_iterator *gsi,
5857 vec<tree> *result_chain)
5859 tree vect1, vect2, high, low;
5860 gimple *perm_stmt;
5861 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5862 tree perm_mask_low, perm_mask_high;
5863 tree data_ref;
5864 tree perm3_mask_low, perm3_mask_high;
5865 unsigned int i, j, n, log_length = exact_log2 (length);
5867 result_chain->quick_grow (length);
5868 memcpy (result_chain->address (), dr_chain.address (),
5869 length * sizeof (tree));
5871 if (length == 3)
5873 /* vect_grouped_store_supported ensures that this is constant. */
5874 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5875 unsigned int j0 = 0, j1 = 0, j2 = 0;
5877 vec_perm_builder sel (nelt, nelt, 1);
5878 sel.quick_grow (nelt);
5879 vec_perm_indices indices;
5880 for (j = 0; j < 3; j++)
5882 int nelt0 = ((3 - j) * nelt) % 3;
5883 int nelt1 = ((3 - j) * nelt + 1) % 3;
5884 int nelt2 = ((3 - j) * nelt + 2) % 3;
5886 for (i = 0; i < nelt; i++)
5888 if (3 * i + nelt0 < nelt)
5889 sel[3 * i + nelt0] = j0++;
5890 if (3 * i + nelt1 < nelt)
5891 sel[3 * i + nelt1] = nelt + j1++;
5892 if (3 * i + nelt2 < nelt)
5893 sel[3 * i + nelt2] = 0;
5895 indices.new_vector (sel, 2, nelt);
5896 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5898 for (i = 0; i < nelt; i++)
5900 if (3 * i + nelt0 < nelt)
5901 sel[3 * i + nelt0] = 3 * i + nelt0;
5902 if (3 * i + nelt1 < nelt)
5903 sel[3 * i + nelt1] = 3 * i + nelt1;
5904 if (3 * i + nelt2 < nelt)
5905 sel[3 * i + nelt2] = nelt + j2++;
5907 indices.new_vector (sel, 2, nelt);
5908 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5910 vect1 = dr_chain[0];
5911 vect2 = dr_chain[1];
5913 /* Create interleaving stmt:
5914 low = VEC_PERM_EXPR <vect1, vect2,
5915 {j, nelt, *, j + 1, nelt + j + 1, *,
5916 j + 2, nelt + j + 2, *, ...}> */
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5919 vect2, perm3_mask_low);
5920 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5922 vect1 = data_ref;
5923 vect2 = dr_chain[2];
5924 /* Create interleaving stmt:
5925 low = VEC_PERM_EXPR <vect1, vect2,
5926 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5927 6, 7, nelt + j + 2, ...}> */
5928 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5929 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5930 vect2, perm3_mask_high);
5931 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5932 (*result_chain)[j] = data_ref;
5935 else
5937 /* If length is not equal to 3 then only power of 2 is supported. */
5938 gcc_assert (pow2p_hwi (length));
5940 /* The encoding has 2 interleaved stepped patterns. */
5941 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5942 vec_perm_builder sel (nelt, 2, 3);
5943 sel.quick_grow (6);
5944 for (i = 0; i < 3; i++)
5946 sel[i * 2] = i;
5947 sel[i * 2 + 1] = i + nelt;
5949 vec_perm_indices indices (sel, 2, nelt);
5950 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5952 for (i = 0; i < 6; i++)
5953 sel[i] += exact_div (nelt, 2);
5954 indices.new_vector (sel, 2, nelt);
5955 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5957 for (i = 0, n = log_length; i < n; i++)
5959 for (j = 0; j < length/2; j++)
5961 vect1 = dr_chain[j];
5962 vect2 = dr_chain[j+length/2];
5964 /* Create interleaving stmt:
5965 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5966 ...}> */
5967 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5968 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5969 vect2, perm_mask_high);
5970 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5971 (*result_chain)[2*j] = high;
5973 /* Create interleaving stmt:
5974 low = VEC_PERM_EXPR <vect1, vect2,
5975 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5976 ...}> */
5977 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5978 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5979 vect2, perm_mask_low);
5980 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5981 (*result_chain)[2*j+1] = low;
5983 memcpy (dr_chain.address (), result_chain->address (),
5984 length * sizeof (tree));
5989 /* Function vect_setup_realignment
5991 This function is called when vectorizing an unaligned load using
5992 the dr_explicit_realign[_optimized] scheme.
5993 This function generates the following code at the loop prolog:
5995 p = initial_addr;
5996 x msq_init = *(floor(p)); # prolog load
5997 realignment_token = call target_builtin;
5998 loop:
5999 x msq = phi (msq_init, ---)
6001 The stmts marked with x are generated only for the case of
6002 dr_explicit_realign_optimized.
6004 The code above sets up a new (vector) pointer, pointing to the first
6005 location accessed by STMT_INFO, and a "floor-aligned" load using that
6006 pointer. It also generates code to compute the "realignment-token"
6007 (if the relevant target hook was defined), and creates a phi-node at the
6008 loop-header bb whose arguments are the result of the prolog-load (created
6009 by this function) and the result of a load that takes place in the loop
6010 (to be created by the caller to this function).
6012 For the case of dr_explicit_realign_optimized:
6013 The caller to this function uses the phi-result (msq) to create the
6014 realignment code inside the loop, and sets up the missing phi argument,
6015 as follows:
6016 loop:
6017 msq = phi (msq_init, lsq)
6018 lsq = *(floor(p')); # load in loop
6019 result = realign_load (msq, lsq, realignment_token);
6021 For the case of dr_explicit_realign:
6022 loop:
6023 msq = *(floor(p)); # load in loop
6024 p' = p + (VS-1);
6025 lsq = *(floor(p')); # load in loop
6026 result = realign_load (msq, lsq, realignment_token);
6028 Input:
6029 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6030 a memory location that may be unaligned.
6031 BSI - place where new code is to be inserted.
6032 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6033 is used.
6035 Output:
6036 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6037 target hook, if defined.
6038 Return value - the result of the loop-header phi node. */
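/* For illustration only: with a target vector alignment of 16 bytes the
   "floor (p)" loads above are implemented by masking the low address bits,
   i.e. the prolog emits roughly

     new_temp = ptr & (0 - 16);   # clear the low four address bits
     msq_init = *new_temp;        # prolog load from the aligned address

   and the phi created in step 5 below merges msq_init with the in-loop
   load lsq.  */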
6040 tree
6041 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6042 gimple_stmt_iterator *gsi, tree *realignment_token,
6043 enum dr_alignment_support alignment_support_scheme,
6044 tree init_addr,
6045 class loop **at_loop)
6047 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6048 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6049 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6050 struct data_reference *dr = dr_info->dr;
6051 class loop *loop = NULL;
6052 edge pe = NULL;
6053 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6054 tree vec_dest;
6055 gimple *inc;
6056 tree ptr;
6057 tree data_ref;
6058 basic_block new_bb;
6059 tree msq_init = NULL_TREE;
6060 tree new_temp;
6061 gphi *phi_stmt;
6062 tree msq = NULL_TREE;
6063 gimple_seq stmts = NULL;
6064 bool compute_in_loop = false;
6065 bool nested_in_vect_loop = false;
6066 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6067 class loop *loop_for_initial_load = NULL;
6069 if (loop_vinfo)
6071 loop = LOOP_VINFO_LOOP (loop_vinfo);
6072 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6075 gcc_assert (alignment_support_scheme == dr_explicit_realign
6076 || alignment_support_scheme == dr_explicit_realign_optimized);
6078 /* We need to generate three things:
6079 1. the misalignment computation
6080 2. the extra vector load (for the optimized realignment scheme).
6081 3. the phi node for the two vectors from which the realignment is
6082 done (for the optimized realignment scheme). */
6084 /* 1. Determine where to generate the misalignment computation.
6086 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6087 calculation will be generated by this function, outside the loop (in the
6088 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6089 caller, inside the loop.
6091 Background: If the misalignment remains fixed throughout the iterations of
6092 the loop, then both realignment schemes are applicable, and also the
6093 misalignment computation can be done outside LOOP. This is because we are
6094 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6095 are a multiple of VS (the Vector Size), and therefore the misalignment in
6096 different vectorized LOOP iterations is always the same.
6097 The problem arises only if the memory access is in an inner-loop nested
6098 inside LOOP, which is now being vectorized using outer-loop vectorization.
6099 This is the only case when the misalignment of the memory access may not
6100 remain fixed throughout the iterations of the inner-loop (as explained in
6101 detail in vect_supportable_dr_alignment). In this case, not only is the
6102 optimized realignment scheme not applicable, but also the misalignment
6103 computation (and generation of the realignment token that is passed to
6104 REALIGN_LOAD) have to be done inside the loop.
6106 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6107 or not, which in turn determines if the misalignment is computed inside
6108 the inner-loop, or outside LOOP. */
6110 if (init_addr != NULL_TREE || !loop_vinfo)
6112 compute_in_loop = true;
6113 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6117 /* 2. Determine where to generate the extra vector load.
6119 For the optimized realignment scheme, instead of generating two vector
6120 loads in each iteration, we generate a single extra vector load in the
6121 preheader of the loop, and in each iteration reuse the result of the
6122 vector load from the previous iteration. In case the memory access is in
6123 an inner-loop nested inside LOOP, which is now being vectorized using
6124 outer-loop vectorization, we need to determine whether this initial vector
6125 load should be generated at the preheader of the inner-loop, or can be
6126 generated at the preheader of LOOP. If the memory access has no evolution
6127 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6128 to be generated inside LOOP (in the preheader of the inner-loop). */
6130 if (nested_in_vect_loop)
6132 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6133 bool invariant_in_outerloop =
6134 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6135 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6137 else
6138 loop_for_initial_load = loop;
6139 if (at_loop)
6140 *at_loop = loop_for_initial_load;
6142 tree vuse = NULL_TREE;
6143 if (loop_for_initial_load)
6145 pe = loop_preheader_edge (loop_for_initial_load);
6146 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6147 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6149 if (!vuse)
6150 vuse = gimple_vuse (gsi_stmt (*gsi));
6152 /* 3. For the case of the optimized realignment, create the first vector
6153 load at the loop preheader. */
6155 if (alignment_support_scheme == dr_explicit_realign_optimized)
6157 /* Create msq_init = *(floor(p1)) in the loop preheader */
6158 gassign *new_stmt;
6160 gcc_assert (!compute_in_loop);
6161 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6162 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6163 loop_for_initial_load, NULL_TREE,
6164 &init_addr, NULL, &inc, true);
6165 if (TREE_CODE (ptr) == SSA_NAME)
6166 new_temp = copy_ssa_name (ptr);
6167 else
6168 new_temp = make_ssa_name (TREE_TYPE (ptr));
6169 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6170 tree type = TREE_TYPE (ptr);
6171 new_stmt = gimple_build_assign
6172 (new_temp, BIT_AND_EXPR, ptr,
6173 fold_build2 (MINUS_EXPR, type,
6174 build_int_cst (type, 0),
6175 build_int_cst (type, align)));
6176 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6177 gcc_assert (!new_bb);
6178 data_ref
6179 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6180 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6181 vect_copy_ref_info (data_ref, DR_REF (dr));
6182 new_stmt = gimple_build_assign (vec_dest, data_ref);
6183 new_temp = make_ssa_name (vec_dest, new_stmt);
6184 gimple_assign_set_lhs (new_stmt, new_temp);
6185 gimple_set_vuse (new_stmt, vuse);
6186 if (pe)
6188 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6189 gcc_assert (!new_bb);
6191 else
6192 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6194 msq_init = gimple_assign_lhs (new_stmt);
6197 /* 4. Create realignment token using a target builtin, if available.
6198 It is done either inside the containing loop, or before LOOP (as
6199 determined above). */
6201 if (targetm.vectorize.builtin_mask_for_load)
6203 gcall *new_stmt;
6204 tree builtin_decl;
6206 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
6207 if (!init_addr)
6209 /* Generate the INIT_ADDR computation outside LOOP. */
6210 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6211 stmt_info, &stmts,
6212 NULL_TREE);
6213 if (loop)
6215 pe = loop_preheader_edge (loop);
6216 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6217 gcc_assert (!new_bb);
6219 else
6220 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6223 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6224 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6225 vec_dest =
6226 vect_create_destination_var (scalar_dest,
6227 gimple_call_return_type (new_stmt));
6228 new_temp = make_ssa_name (vec_dest, new_stmt);
6229 gimple_call_set_lhs (new_stmt, new_temp);
6231 if (compute_in_loop)
6232 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6233 else
6235 /* Generate the misalignment computation outside LOOP. */
6236 pe = loop_preheader_edge (loop);
6237 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6238 gcc_assert (!new_bb);
6241 *realignment_token = gimple_call_lhs (new_stmt);
6243 /* The result of the CALL_EXPR to this builtin is determined from
6244 the value of the parameter and no global variables are touched
6245 which makes the builtin a "const" function. Requiring the
6246 builtin to have the "const" attribute makes it unnecessary
6247 to call mark_call_clobbered. */
6248 gcc_assert (TREE_READONLY (builtin_decl));
6251 if (alignment_support_scheme == dr_explicit_realign)
6252 return msq;
6254 gcc_assert (!compute_in_loop);
6255 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6258 /* 5. Create msq = phi <msq_init, lsq> in loop */
6260 pe = loop_preheader_edge (containing_loop);
6261 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6262 msq = make_ssa_name (vec_dest);
6263 phi_stmt = create_phi_node (msq, containing_loop->header);
6264 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6266 return msq;
6270 /* Function vect_grouped_load_supported.
6272 COUNT is the size of the load group (the number of statements plus the
6273 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6274 only one statement, with a gap of COUNT - 1.
6276 Returns true if a suitable permute exists. */
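/* A sketch of typical use (hypothetical caller code, for illustration):

     if (!vect_grouped_load_supported (vectype, single_element_p,
                                       DR_GROUP_SIZE (first_stmt_info)))
       ... fall back to elementwise or strided accesses ...  */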
6278 bool
6279 vect_grouped_load_supported (tree vectype, bool single_element_p,
6280 unsigned HOST_WIDE_INT count)
6282 machine_mode mode = TYPE_MODE (vectype);
6284 /* If this is single-element interleaving with an element distance
6285 that leaves unused vector loads around, punt - we would at least create
6286 very sub-optimal code in that case (and blow up memory,
6287 see PR65518). */
6288 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292 "single-element interleaving not supported "
6293 "for not adjacent vector loads\n");
6294 return false;
6297 /* vect_permute_load_chain requires the group size to be equal to 3 or
6298 be a power of two. */
6299 if (count != 3 && exact_log2 (count) == -1)
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "the size of the group of accesses"
6304 " is not a power of 2 or not equal to 3\n");
6305 return false;
6308 /* Check that the permutation is supported. */
6309 if (VECTOR_MODE_P (mode))
6311 unsigned int i, j;
6312 if (count == 3)
6314 unsigned int nelt;
6315 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6317 if (dump_enabled_p ())
6318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6319 "cannot handle groups of 3 loads for"
6320 " variable-length vectors\n");
6321 return false;
6324 vec_perm_builder sel (nelt, nelt, 1);
6325 sel.quick_grow (nelt);
6326 vec_perm_indices indices;
6327 unsigned int k;
6328 for (k = 0; k < 3; k++)
6330 for (i = 0; i < nelt; i++)
6331 if (3 * i + k < 2 * nelt)
6332 sel[i] = 3 * i + k;
6333 else
6334 sel[i] = 0;
6335 indices.new_vector (sel, 2, nelt);
6336 if (!can_vec_perm_const_p (mode, mode, indices))
6338 if (dump_enabled_p ())
6339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6340 "shuffle of 3 loads is not supported by"
6341 " target\n");
6342 return false;
6344 for (i = 0, j = 0; i < nelt; i++)
6345 if (3 * i + k < 2 * nelt)
6346 sel[i] = i;
6347 else
6348 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6349 indices.new_vector (sel, 2, nelt);
6350 if (!can_vec_perm_const_p (mode, mode, indices))
6352 if (dump_enabled_p ())
6353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6354 "shuffle of 3 loads is not supported by"
6355 " target\n");
6356 return false;
6359 return true;
6361 else
6363 /* If length is not equal to 3 then only power of 2 is supported. */
6364 gcc_assert (pow2p_hwi (count));
6365 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6367 /* The encoding has a single stepped pattern. */
6368 vec_perm_builder sel (nelt, 1, 3);
6369 sel.quick_grow (3);
6370 for (i = 0; i < 3; i++)
6371 sel[i] = i * 2;
6372 vec_perm_indices indices (sel, 2, nelt);
6373 if (can_vec_perm_const_p (mode, mode, indices))
6375 for (i = 0; i < 3; i++)
6376 sel[i] = i * 2 + 1;
6377 indices.new_vector (sel, 2, nelt);
6378 if (can_vec_perm_const_p (mode, mode, indices))
6379 return true;
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6386 "extract even/odd not supported by target\n");
6387 return false;
6390 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6391 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6393 internal_fn
6394 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6395 bool masked_p)
6397 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6398 vec_mask_len_load_lanes_optab, vectype,
6399 count))
6400 return IFN_MASK_LEN_LOAD_LANES;
6401 else if (masked_p)
6403 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6404 vec_mask_load_lanes_optab, vectype,
6405 count))
6406 return IFN_MASK_LOAD_LANES;
6408 else
6410 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6411 vectype, count))
6412 return IFN_LOAD_LANES;
6414 return IFN_LAST;
6417 /* Function vect_permute_load_chain.
6419 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6420 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6421 the input data correctly. Return the final references for loads in
6422 RESULT_CHAIN.
6424 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6425 The input is 4 vectors each containing 8 elements. We assign a number to each
6426 element, the input sequence is:
6428 1st vec: 0 1 2 3 4 5 6 7
6429 2nd vec: 8 9 10 11 12 13 14 15
6430 3rd vec: 16 17 18 19 20 21 22 23
6431 4th vec: 24 25 26 27 28 29 30 31
6433 The output sequence should be:
6435 1st vec: 0 4 8 12 16 20 24 28
6436 2nd vec: 1 5 9 13 17 21 25 29
6437 3rd vec: 2 6 10 14 18 22 26 30
6438 4th vec: 3 7 11 15 19 23 27 31
6440 i.e., the first output vector should contain the first elements of each
6441 interleaving group, etc.
6443 We use extract_even/odd instructions to create such output. The input of
6444 each extract_even/odd operation is two vectors
6445 1st vec 2nd vec
6446 0 1 2 3 4 5 6 7
6448 and the output is the vector of extracted even/odd elements. The output of
6449 extract_even will be: 0 2 4 6
6450 and of extract_odd: 1 3 5 7
6453 The permutation is done in log LENGTH stages. In each stage extract_even
6454 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6455 their order. In our example,
6457 E1: extract_even (1st vec, 2nd vec)
6458 E2: extract_odd (1st vec, 2nd vec)
6459 E3: extract_even (3rd vec, 4th vec)
6460 E4: extract_odd (3rd vec, 4th vec)
6462 The output for the first stage will be:
6464 E1: 0 2 4 6 8 10 12 14
6465 E2: 1 3 5 7 9 11 13 15
6466 E3: 16 18 20 22 24 26 28 30
6467 E4: 17 19 21 23 25 27 29 31
6469 In order to proceed and create the correct sequence for the next stage (or
6470 for the correct output, if the second stage is the last one, as in our
6471 example), we first put the output of extract_even operation and then the
6472 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6473 The input for the second stage is:
6475 1st vec (E1): 0 2 4 6 8 10 12 14
6476 2nd vec (E3): 16 18 20 22 24 26 28 30
6477 3rd vec (E2): 1 3 5 7 9 11 13 15
6478 4th vec (E4): 17 19 21 23 25 27 29 31
6480 The output of the second stage:
6482 E1: 0 4 8 12 16 20 24 28
6483 E2: 2 6 10 14 18 22 26 30
6484 E3: 1 5 9 13 17 21 25 29
6485 E4: 3 7 11 15 19 23 27 31
6487 And RESULT_CHAIN after reordering:
6489 1st vec (E1): 0 4 8 12 16 20 24 28
6490 2nd vec (E3): 1 5 9 13 17 21 25 29
6491 3rd vec (E2): 2 6 10 14 18 22 26 30
6492 4th vec (E4): 3 7 11 15 19 23 27 31. */
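/* Continuing the example above (for illustration only): with V8HI vectors
   the power-of-two path uses the masks
     perm_mask_even = { 0 2 4 6 8 10 12 14 }
     perm_mask_odd  = { 1 3 5 7 9 11 13 15 }
   and runs log2 (LENGTH) == 2 stages of extract even/odd VEC_PERM_EXPRs,
   producing the E1..E4 vectors shown above.  */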
6494 static void
6495 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6496 unsigned int length,
6497 stmt_vec_info stmt_info,
6498 gimple_stmt_iterator *gsi,
6499 vec<tree> *result_chain)
6501 tree data_ref, first_vect, second_vect;
6502 tree perm_mask_even, perm_mask_odd;
6503 tree perm3_mask_low, perm3_mask_high;
6504 gimple *perm_stmt;
6505 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6506 unsigned int i, j, log_length = exact_log2 (length);
6508 result_chain->quick_grow (length);
6509 memcpy (result_chain->address (), dr_chain.address (),
6510 length * sizeof (tree));
6512 if (length == 3)
6514 /* vect_grouped_load_supported ensures that this is constant. */
6515 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6516 unsigned int k;
6518 vec_perm_builder sel (nelt, nelt, 1);
6519 sel.quick_grow (nelt);
6520 vec_perm_indices indices;
6521 for (k = 0; k < 3; k++)
6523 for (i = 0; i < nelt; i++)
6524 if (3 * i + k < 2 * nelt)
6525 sel[i] = 3 * i + k;
6526 else
6527 sel[i] = 0;
6528 indices.new_vector (sel, 2, nelt);
6529 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6531 for (i = 0, j = 0; i < nelt; i++)
6532 if (3 * i + k < 2 * nelt)
6533 sel[i] = i;
6534 else
6535 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6536 indices.new_vector (sel, 2, nelt);
6537 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6539 first_vect = dr_chain[0];
6540 second_vect = dr_chain[1];
6542 /* Create interleaving stmt (low part of):
6543 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6544 ...}> */
6545 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6546 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6547 second_vect, perm3_mask_low);
6548 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6550 /* Create interleaving stmt (high part of):
6551 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6552 ...}> */
6553 first_vect = data_ref;
6554 second_vect = dr_chain[2];
6555 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6556 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6557 second_vect, perm3_mask_high);
6558 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6559 (*result_chain)[k] = data_ref;
6562 else
6564 /* If length is not equal to 3 then only power of 2 is supported. */
6565 gcc_assert (pow2p_hwi (length));
6567 /* The encoding has a single stepped pattern. */
6568 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6569 vec_perm_builder sel (nelt, 1, 3);
6570 sel.quick_grow (3);
6571 for (i = 0; i < 3; ++i)
6572 sel[i] = i * 2;
6573 vec_perm_indices indices (sel, 2, nelt);
6574 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6576 for (i = 0; i < 3; ++i)
6577 sel[i] = i * 2 + 1;
6578 indices.new_vector (sel, 2, nelt);
6579 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6581 for (i = 0; i < log_length; i++)
6583 for (j = 0; j < length; j += 2)
6585 first_vect = dr_chain[j];
6586 second_vect = dr_chain[j+1];
6588 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6589 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6590 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6591 first_vect, second_vect,
6592 perm_mask_even);
6593 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6594 (*result_chain)[j/2] = data_ref;
6596 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6597 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6598 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6599 first_vect, second_vect,
6600 perm_mask_odd);
6601 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6602 (*result_chain)[j/2+length/2] = data_ref;
6604 memcpy (dr_chain.address (), result_chain->address (),
6605 length * sizeof (tree));
6610 /* Function vect_shift_permute_load_chain.
6612 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6613 sequence of stmts to reorder the input data accordingly.
6614 Return the final references for loads in RESULT_CHAIN.
6615 Return true if successful, false otherwise.
6617 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6618 The input is 3 vectors each containing 8 elements. We assign a
6619 number to each element, the input sequence is:
6621 1st vec: 0 1 2 3 4 5 6 7
6622 2nd vec: 8 9 10 11 12 13 14 15
6623 3rd vec: 16 17 18 19 20 21 22 23
6625 The output sequence should be:
6627 1st vec: 0 3 6 9 12 15 18 21
6628 2nd vec: 1 4 7 10 13 16 19 22
6629 3rd vec: 2 5 8 11 14 17 20 23
6631 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6633 First we shuffle all 3 vectors to get the correct element order:
6635 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6636 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6637 3rd vec: (16 19 22) (17 20 23) (18 21)
6639 Next we unite and shift vector 3 times:
6641 1st step:
6642 shift right by 6 the concatenation of:
6643 "1st vec" and "2nd vec"
6644 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6645 "2nd vec" and "3rd vec"
6646 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6647 "3rd vec" and "1st vec"
6648 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6649 | New vectors |
6651 So that now new vectors are:
6653 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6654 2nd vec: (10 13) (16 19 22) (17 20 23)
6655 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6657 2nd step:
6658 shift right by 5 the concatenation of:
6659 "1st vec" and "3rd vec"
6660 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6661 "2nd vec" and "1st vec"
6662 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6663 "3rd vec" and "2nd vec"
6664 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6665 | New vectors |
6667 So that now new vectors are:
6669 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6670 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6671 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6673 3rd step:
6674 shift right by 5 the concatenation of:
6675 "1st vec" and "1st vec"
6676 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6677 shift right by 3 the concatenation of:
6678 "2nd vec" and "2nd vec"
6679 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6680 | New vectors |
6682 So that now all vectors are READY:
6683 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6684 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6685 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6687 This algorithm is faster than one in vect_permute_load_chain if:
6688 1. "shift of a concatination" is faster than general permutation.
6689 This is usually so.
6690 2. The TARGET machine can't execute vector instructions in parallel.
6691 This is because each step of the algorithm depends on the previous one.
6692 The algorithm in vect_permute_load_chain is much more parallel.
6694 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
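/* For illustration of the power-of-two branch below, with 8-element vectors
   and LENGTH == 2 (a case not covered by the example above):
     perm2_mask1 = { 0 2 4 6 1 3 5 7 }     # even lanes first, then odd
     perm2_mask2 = { 1 3 5 7 0 2 4 6 }     # odd lanes first, then even
     shift1_mask = { 4 5 6 7 8 9 10 11 }   # shift of a concatenation
     select_mask = { 0 1 2 3 12 13 14 15 } # select halves of two vectors
   so the first result vector collects the even-indexed scalar elements of
   the group and the second collects the odd-indexed ones.  */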
6697 static bool
6698 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6699 unsigned int length,
6700 stmt_vec_info stmt_info,
6701 gimple_stmt_iterator *gsi,
6702 vec<tree> *result_chain)
6704 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6705 tree perm2_mask1, perm2_mask2, perm3_mask;
6706 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6707 gimple *perm_stmt;
6709 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6710 machine_mode vmode = TYPE_MODE (vectype);
6711 unsigned int i;
6712 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6714 unsigned HOST_WIDE_INT nelt, vf;
6715 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6716 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6717 /* Not supported for variable-length vectors. */
6718 return false;
6720 vec_perm_builder sel (nelt, nelt, 1);
6721 sel.quick_grow (nelt);
6723 result_chain->quick_grow (length);
6724 memcpy (result_chain->address (), dr_chain.address (),
6725 length * sizeof (tree));
6727 if (pow2p_hwi (length) && vf > 4)
6729 unsigned int j, log_length = exact_log2 (length);
6730 for (i = 0; i < nelt / 2; ++i)
6731 sel[i] = i * 2;
6732 for (i = 0; i < nelt / 2; ++i)
6733 sel[nelt / 2 + i] = i * 2 + 1;
6734 vec_perm_indices indices (sel, 2, nelt);
6735 if (!can_vec_perm_const_p (vmode, vmode, indices))
6737 if (dump_enabled_p ())
6738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6739 "shuffle of 2 fields structure is not \
6740 supported by target\n");
6741 return false;
6743 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6745 for (i = 0; i < nelt / 2; ++i)
6746 sel[i] = i * 2 + 1;
6747 for (i = 0; i < nelt / 2; ++i)
6748 sel[nelt / 2 + i] = i * 2;
6749 indices.new_vector (sel, 2, nelt);
6750 if (!can_vec_perm_const_p (vmode, vmode, indices))
6752 if (dump_enabled_p ())
6753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6754 "shuffle of 2 fields structure is not \
6755 supported by target\n");
6756 return false;
6758 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6760 /* Generating permutation constant to shift all elements.
6761 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6762 for (i = 0; i < nelt; i++)
6763 sel[i] = nelt / 2 + i;
6764 indices.new_vector (sel, 2, nelt);
6765 if (!can_vec_perm_const_p (vmode, vmode, indices))
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "shift permutation is not supported by target\n");
6770 return false;
6772 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6774 /* Generating permutation constant to select vector from 2.
6775 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6776 for (i = 0; i < nelt / 2; i++)
6777 sel[i] = i;
6778 for (i = nelt / 2; i < nelt; i++)
6779 sel[i] = nelt + i;
6780 indices.new_vector (sel, 2, nelt);
6781 if (!can_vec_perm_const_p (vmode, vmode, indices))
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 "select is not supported by target\n");
6786 return false;
6788 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6790 for (i = 0; i < log_length; i++)
6792 for (j = 0; j < length; j += 2)
6794 first_vect = dr_chain[j];
6795 second_vect = dr_chain[j + 1];
6797 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6798 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6799 first_vect, first_vect,
6800 perm2_mask1);
6801 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6802 vect[0] = data_ref;
6804 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6805 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6806 second_vect, second_vect,
6807 perm2_mask2);
6808 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6809 vect[1] = data_ref;
6811 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6812 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6813 vect[0], vect[1], shift1_mask);
6814 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6815 (*result_chain)[j/2 + length/2] = data_ref;
6817 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6818 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6819 vect[0], vect[1], select_mask);
6820 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6821 (*result_chain)[j/2] = data_ref;
6823 memcpy (dr_chain.address (), result_chain->address (),
6824 length * sizeof (tree));
6826 return true;
6828 if (length == 3 && vf > 2)
6830 unsigned int k = 0, l = 0;
6832 /* Generating permutation constant to get all elements in right order.
6833 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6834 for (i = 0; i < nelt; i++)
6836 if (3 * k + (l % 3) >= nelt)
6838 k = 0;
6839 l += (3 - (nelt % 3));
6841 sel[i] = 3 * k + (l % 3);
6842 k++;
6844 vec_perm_indices indices (sel, 2, nelt);
6845 if (!can_vec_perm_const_p (vmode, vmode, indices))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "shuffle of 3 fields structure is not \
6850 supported by target\n");
6851 return false;
6853 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6855 /* Generating permutation constant to shift all elements.
6856 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6857 for (i = 0; i < nelt; i++)
6858 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6859 indices.new_vector (sel, 2, nelt);
6860 if (!can_vec_perm_const_p (vmode, vmode, indices))
6862 if (dump_enabled_p ())
6863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6864 "shift permutation is not supported by target\n");
6865 return false;
6867 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6869 /* Generating permutation constant to shift all elements.
6870 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6871 for (i = 0; i < nelt; i++)
6872 sel[i] = 2 * (nelt / 3) + 1 + i;
6873 indices.new_vector (sel, 2, nelt);
6874 if (!can_vec_perm_const_p (vmode, vmode, indices))
6876 if (dump_enabled_p ())
6877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6878 "shift permutation is not supported by target\n");
6879 return false;
6881 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6883 /* Generating permutation constant to shift all elements.
6884 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6885 for (i = 0; i < nelt; i++)
6886 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6887 indices.new_vector (sel, 2, nelt);
6888 if (!can_vec_perm_const_p (vmode, vmode, indices))
6890 if (dump_enabled_p ())
6891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6892 "shift permutation is not supported by target\n");
6893 return false;
6895 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6897 /* Generating permutation constant to shift all elements.
6898 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6899 for (i = 0; i < nelt; i++)
6900 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6901 indices.new_vector (sel, 2, nelt);
6902 if (!can_vec_perm_const_p (vmode, vmode, indices))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "shift permutation is not supported by target\n");
6907 return false;
6909 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6911 for (k = 0; k < 3; k++)
6913 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6914 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6915 dr_chain[k], dr_chain[k],
6916 perm3_mask);
6917 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6918 vect[k] = data_ref;
6921 for (k = 0; k < 3; k++)
6923 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6924 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6925 vect[k % 3], vect[(k + 1) % 3],
6926 shift1_mask);
6927 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6928 vect_shift[k] = data_ref;
6931 for (k = 0; k < 3; k++)
6933 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6934 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6935 vect_shift[(4 - k) % 3],
6936 vect_shift[(3 - k) % 3],
6937 shift2_mask);
6938 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6939 vect[k] = data_ref;
6942 (*result_chain)[3 - (nelt % 3)] = vect[2];
6944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6946 vect[0], shift3_mask);
6947 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6948 (*result_chain)[nelt % 3] = data_ref;
6950 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6951 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6952 vect[1], shift4_mask);
6953 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6954 (*result_chain)[0] = data_ref;
6955 return true;
6957 return false;
6960 /* Function vect_transform_grouped_load.
6962 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6963 to perform their permutation and ascribe the resulting vectorized statements to
6964 the scalar statements.
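/* Roughly: DR_CHAIN holds the SIZE consecutively loaded vectors covering one
   vector iteration of the interleaved group; after permutation RESULT_CHAIN
   holds one vector per scalar load of the group, in group order, and
   vect_record_grouped_load_vectors attaches each of them to its scalar
   statement.  */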
6967 void
6968 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6969 vec<tree> dr_chain,
6970 int size, gimple_stmt_iterator *gsi)
6972 machine_mode mode;
6973 vec<tree> result_chain = vNULL;
6975 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6976 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6977 vectors that are ready for vector computation. */
6978 result_chain.create (size);
6980 /* If the reassociation width for the vector type is 2 or greater, the target
6981 machine can execute 2 or more vector instructions in parallel. Otherwise try
6982 to get the chain for the load group using vect_shift_permute_load_chain. */
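/* For example, a group of three loads (SIZE == 3, not a power of two) on a
   target whose reassociation width for VEC_PERM_EXPR is 1 is first offered to
   vect_shift_permute_load_chain; in every other case, or if that call fails,
   the generic vect_permute_load_chain is used.  */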
6983 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6984 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6985 || pow2p_hwi (size)
6986 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6987 gsi, &result_chain))
6988 vect_permute_load_chain (vinfo, dr_chain,
6989 size, stmt_info, gsi, &result_chain);
6990 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6991 result_chain.release ();
6994 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6995 generated as part of the vectorization of STMT_INFO. Assign the statement
6996 for each vector to the associated scalar statement. */
6998 void
6999 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
7000 vec<tree> result_chain)
7002 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7003 unsigned int i, gap_count;
7004 tree tmp_data_ref;
7006 /* Put a permuted data-ref in the VECTORIZED_STMT field.
7007 Since we scan the chain starting from its first node, their order
7008 corresponds to the order of data-refs in RESULT_CHAIN. */
7009 stmt_vec_info next_stmt_info = first_stmt_info;
7010 gap_count = 1;
7011 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7013 if (!next_stmt_info)
7014 break;
7016 /* Skip the gaps. Loads created for the gaps will be removed later by the
7017 dead code elimination pass. No need to check for the first stmt in
7018 the group, since it always exists.
7019 DR_GROUP_GAP is the number of steps in elements from the previous
7020 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7021 correspond to the gaps. */
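/* A hypothetical illustration: if the scalar group reads fields 0 and 2 of a
   three-field structure, the second statement has DR_GROUP_GAP == 2, so one
   entry of RESULT_CHAIN (the vector for the unused field 1) is skipped here
   and left to dead code elimination.  */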
7022 if (next_stmt_info != first_stmt_info
7023 && gap_count < DR_GROUP_GAP (next_stmt_info))
7025 gap_count++;
7026 continue;
7029 /* ??? The following needs cleanup after the removal of
7030 DR_GROUP_SAME_DR_STMT. */
7031 if (next_stmt_info)
7033 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7034 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7035 copies, and we put the new vector statement last. */
7036 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7038 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7039 gap_count = 1;
7044 /* Function vect_can_force_dr_alignment_p.
7046 Returns whether the alignment of a DECL can be forced to be aligned
7047 on an ALIGNMENT-bit boundary. */
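/* For instance, a TREE_STATIC variable can typically be forced up to
   MAX_OFILE_ALIGNMENT, while an automatic variable is limited by
   MAX_STACK_ALIGNMENT, as the checks below show.  */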
7049 bool
7050 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7052 if (!VAR_P (decl))
7053 return false;
7055 if (decl_in_symtab_p (decl)
7056 && !symtab_node::get (decl)->can_increase_alignment_p ())
7057 return false;
7059 if (TREE_STATIC (decl))
7060 return (known_le (alignment,
7061 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7062 else
7063 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7066 /* Return whether the data reference DR_INFO is supported with respect to its
7067 alignment.
7068 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7069 if it is aligned, i.e., check if it is possible to vectorize it with different
7070 alignment. */
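/* The possible results below are dr_aligned, dr_unaligned_supported (the
   target handles the given misalignment directly, including masked
   loads/stores), dr_explicit_realign and dr_explicit_realign_optimized for
   the REALIGN_LOAD schemes described further down, and
   dr_unaligned_unsupported otherwise.  */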
7072 enum dr_alignment_support
7073 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7074 tree vectype, int misalignment)
7076 data_reference *dr = dr_info->dr;
7077 stmt_vec_info stmt_info = dr_info->stmt;
7078 machine_mode mode = TYPE_MODE (vectype);
7079 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7080 class loop *vect_loop = NULL;
7081 bool nested_in_vect_loop = false;
7083 if (misalignment == 0)
7084 return dr_aligned;
7086 /* For now assume all conditional loads/stores support unaligned
7087 access without any special code. */
7088 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7089 if (gimple_call_internal_p (stmt)
7090 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7091 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7092 return dr_unaligned_supported;
7094 if (loop_vinfo)
7096 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7097 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7100 /* Possibly unaligned access. */
7102 /* We can choose between using the implicit realignment scheme (generating
7103 a misaligned_move stmt) and the explicit realignment scheme (generating
7104 aligned loads with a REALIGN_LOAD). There are two variants to the
7105 explicit realignment scheme: optimized, and unoptimized.
7106 We can optimize the realignment only if the step between consecutive
7107 vector loads is equal to the vector size. Since the vector memory
7108 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7109 is guaranteed that the misalignment amount remains the same throughout the
7110 execution of the vectorized loop. Therefore, we can create the
7111 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7112 at the loop preheader.
7114 However, in the case of outer-loop vectorization, when vectorizing a
7115 memory access in the inner-loop nested within the LOOP that is now being
7116 vectorized, while it is guaranteed that the misalignment of the
7117 vectorized memory access will remain the same in different outer-loop
7118 iterations, it is *not* guaranteed that it will remain the same throughout
7119 the execution of the inner-loop. This is because the inner-loop advances
7120 with the original scalar step (and not in steps of VS). If the inner-loop
7121 step happens to be a multiple of VS, then the misalignment remains fixed
7122 and we can use the optimized realignment scheme. For example:
7124 for (i=0; i<N; i++)
7125 for (j=0; j<M; j++)
7126 s += a[i+j];
7128 When vectorizing the i-loop in the above example, the step between
7129 consecutive vector loads is 1, and so the misalignment does not remain
7130 fixed across the execution of the inner-loop, and the realignment cannot
7131 be optimized (as illustrated in the following pseudo vectorized loop):
7133 for (i=0; i<N; i+=4)
7134 for (j=0; j<M; j++){
7135 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7136 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7137 // (assuming that we start from an aligned address).
7140 We therefore have to use the unoptimized realignment scheme:
7142 for (i=0; i<N; i+=4)
7143 for (j=k; j<M; j+=4)
7144 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7145 // that the misalignment of the initial address is
7146 // 0).
7148 The loop can then be vectorized as follows:
7150 for (k=0; k<4; k++){
7151 rt = get_realignment_token (&vp[k]);
7152 for (i=0; i<N; i+=4){
7153 v1 = vp[i+k];
7154 for (j=k; j<M; j+=4){
7155 v2 = vp[i+j+VS-1];
7156 va = REALIGN_LOAD <v1,v2,rt>;
7157 vs += va;
7158 v1 = v2;
7161 } */
7163 if (DR_IS_READ (dr))
7165 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7166 && (!targetm.vectorize.builtin_mask_for_load
7167 || targetm.vectorize.builtin_mask_for_load ()))
7169 /* If we are doing SLP then the accesses need not have the
7170 same alignment; instead it depends on the SLP group size. */
7171 if (loop_vinfo
7172 && STMT_SLP_TYPE (stmt_info)
7173 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7174 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7175 * (DR_GROUP_SIZE
7176 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7177 TYPE_VECTOR_SUBPARTS (vectype))))
7179 else if (!loop_vinfo
7180 || (nested_in_vect_loop
7181 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7182 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7183 return dr_explicit_realign;
7184 else
7185 return dr_explicit_realign_optimized;
7189 bool is_packed = false;
7190 tree type = TREE_TYPE (DR_REF (dr));
7191 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7192 is_packed = not_size_aligned (DR_REF (dr));
7193 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7194 is_packed))
7195 return dr_unaligned_supported;
7197 /* Unsupported. */
7198 return dr_unaligned_unsupported;