gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
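/* For example, a caller such as vect_load_lanes_supported might ask
     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                   vectype, 3)
   for a V4SI vectype on a 128-bit vector target; this checks whether the
   target can load three interleaved V4SI vectors from one 48-byte array
   in a single operation (AArch64's LD3 is one such instruction).  The
   mode and count here are only illustrative.  */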
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn))
155 /* For loads the LHS type does the trick. */
156 i = ~0U;
157 else if (internal_store_fn_p (ifn))
159 /* For stores use the type of the stored value. */
160 i = internal_fn_stored_value_index (ifn);
161 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
162 i = ~0U;
164 else if (internal_fn_mask_index (ifn) == 0)
165 i = 1;
167 if (i < gimple_call_num_args (call))
169 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
172 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
173 if (rhs < lhs)
174 scalar_type = rhs_type;
179 return scalar_type;
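/* As an illustration of the call handling above: for an internal store
   such as
     .MASK_STORE (ptr, align, mask, short_val);
   the type of the stored value short_val is what matters, whereas for
   .MASK_LOAD the lhs type alone is used; in either case the narrowest
   scalar type accessed is what ends up determining the VF.  */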
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
187 static opt_result
188 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
192 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
193 return opt_result::failure_at (vect_location,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
196 " == 0\n");
198 opt_result res
199 = runtime_alias_check_p (ddr, loop,
200 optimize_loop_nest_for_speed_p (loop));
201 if (!res)
202 return res;
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
205 return opt_result::success ();
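/* For example, compiling with
     --param vect-max-version-for-alias-checks=0
   makes this function fail for every candidate DDR, so no alias
   versioning is ever created and the callers have to give up on any
   dependence they cannot prove safe at compile time.  */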
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
210 static void
211 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
213 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
214 for (unsigned int i = 0; i < checks.length(); ++i)
215 if (checks[i] == value)
216 return;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location,
220 "need run-time check that %T is nonzero\n",
221 value);
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
229 static bool
230 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
232 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
233 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
238 return true;
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
244 the current one. */
245 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (il_a)
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 il_a = get_later_stmt (il_a, s);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = stmtinfo_a;
260 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
261 if (il_b)
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
264 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
265 s = DR_GROUP_NEXT_ELEMENT (s))
266 il_b = get_later_stmt (il_b, s);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 if (get_later_stmt (il_b, s) == il_b)
271 il_b = s;
273 else
274 il_b = stmtinfo_b;
275 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
276 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
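/* For example, for the scalar sequence
     ... = a[i];
     ... = a[i+1];
     a[i] = ...;
     a[i+1] = ...;
   the vector load is emitted at the first scalar load and the vector
   store at the last scalar store, so every load still precedes every
   store just as in the scalar code and the function returns true for
   each load/store pair.  */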
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
290 static bool
291 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 int loop_depth, unsigned int *max_vf)
295 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
296 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
298 int dist = dist_v[loop_depth];
299 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
308 would be a win. */
309 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
311 if ((unsigned int) loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 continue;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
325 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
328 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 return true;
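/* For example, if the user wrote
     #pragma omp simd safelen(8)
   and the conservative dependence distance here is 4, the distance is
   ignored: up to 8 consecutive iterations may run concurrently, so
   *max_vf is at most clamped to 8 instead of being limited to 4.  */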
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
339 Return FALSE if there (might) exist a dependence between a memory-reference
340 DRA and a memory-reference DRB. When versioning for alias may check a
341 dependence at run-time, return TRUE. Adjust *MAX_VF according to
342 the data dependence. */
344 static opt_result
345 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
346 loop_vec_info loop_vinfo,
347 unsigned int *max_vf)
349 unsigned int i;
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 struct data_reference *dra = DDR_A (ddr);
352 struct data_reference *drb = DDR_B (ddr);
353 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
354 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
355 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
356 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
357 lambda_vector dist_v;
358 unsigned int loop_depth;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen = [&]()
364 if (loop->safelen >= 2)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 return true;
371 return false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
377 gcc_unreachable ();
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
381 return opt_result::success ();
383 if (dra == drb
384 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
389 group size. */
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
402 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
403 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
405 get_alias_set (DR_REF (drb))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
415 (stmtinfo_a->stmt,
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra), DR_REF (drb));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra), DR_REF (drb));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
454 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
455 loop_depth, max_vf))
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
460 int dist = dist_v[loop_depth];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "dependence distance = %d.\n", dist);
466 if (dist == 0)
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra), DR_REF (drb));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
476 .. = a[i];
477 .. = a[i+1];
478 a[i] = ..;
479 a[i+1] = ..;
480 *p = ..;
481 .. = a[i];
482 .. = a[i+1];
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
487 a[i] = ...;
488 ... = a[i];
489 a[i+1] = ...;
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
492 return opt_result::failure_at (stmtinfo_a->stmt,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop->safelen < 2)
498 tree indicator = dr_zero_step_indicator (dra);
499 if (!indicator || integer_zerop (indicator))
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator) != INTEGER_CST)
503 vect_check_nonzero_value (loop_vinfo, indicator);
505 continue;
508 if (dist > 0 && DDR_REVERSED_P (ddr))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop, stmtinfo_a)
520 || nested_in_vect_loop_p (loop, stmtinfo_b))
522 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
523 DDR_LOOP_NEST (ddr));
524 if (dist_v[inner_depth] < 0)
525 return opt_result::failure_at (stmtinfo_a->stmt,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra), DR_REF (drb));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
533 if (DR_IS_READ (drb)
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
537 continue;
540 unsigned int abs_dist = abs (dist);
541 if (abs_dist >= 2 && abs_dist < *max_vf)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
545 *max_vf = abs_dist;
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location,
548 "adjusting maximal vectorization factor to %i\n",
549 *max_vf);
552 if (abs_dist >= *max_vf)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "dependence distance >= VF.\n");
559 continue;
562 return opt_result::failure_at (stmtinfo_a->stmt,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra), DR_REF (drb));
568 return opt_result::success ();
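/* To illustrate the distance handling above: for
     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;
   the read-after-write distance is 3, so *max_vf is reduced to 3 when it
   would otherwise be larger and vectorization proceeds with at most three
   iterations per vector, while a distance of 0 (the same element read and
   written in one iteration) only needs the scalar-order check above.  */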
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
577 opt_result
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
579 unsigned int *max_vf)
581 unsigned int i;
582 struct data_dependence_relation *ddr;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
591 /* We do not need read-read dependences. */
592 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
593 &LOOP_VINFO_DDRS (loop_vinfo),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo),
595 false);
596 gcc_assert (res);
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
605 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
606 else
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
609 opt_result res
610 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
611 if (!res)
612 return res;
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
621 Return TRUE if there (might) exist a dependence between a memory-reference
622 DRA and a memory-reference DRB for VINFO. When versioning for alias
623 may check a dependence at run-time, return FALSE. */
626 static bool
627 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
628 struct data_dependence_relation *ddr)
630 struct data_reference *dra = DDR_A (ddr);
631 struct data_reference *drb = DDR_B (ddr);
632 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
633 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
640 return false;
642 if (dra == drb)
643 return false;
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra) && DR_IS_READ (drb))
647 return false;
649 /* If dra and drb are part of the same interleaving chain consider
650 them independent. */
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
654 return false;
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra), DR_REF (drb));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra), DR_REF (drb));
669 return true;
673 /* Analyze dependences involved in the transform of SLP NODE. STORES
674 contain the vector of scalar stores of this instance if we are
675 disambiguating the loads. */
677 static bool
678 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
679 vec<stmt_vec_info> stores,
680 stmt_vec_info last_store_info)
682 /* This walks over all stmts involved in the SLP load/store done
683 in NODE verifying we can sink them up to the last stmt in the
684 group. */
685 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
687 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
688 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
690 stmt_vec_info access_info
691 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
692 if (access_info == last_access_info)
693 continue;
694 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
695 ao_ref ref;
696 bool ref_initialized_p = false;
697 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
698 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
700 gimple *stmt = gsi_stmt (gsi);
701 if (! gimple_vuse (stmt))
702 continue;
704 /* If we couldn't record a (single) data reference for this
705 stmt we have to resort to the alias oracle. */
706 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
707 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
708 if (!dr_b)
710 /* We are moving a store - this means
711 we cannot use TBAA for disambiguation. */
712 if (!ref_initialized_p)
713 ao_ref_init (&ref, DR_REF (dr_a));
714 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
715 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
716 return false;
717 continue;
720 bool dependent = false;
721 /* If we run into a store of this same instance (we've just
722 marked those) then delay dependence checking until we run
723 into the last store because this is where it will have
724 been sunk to (and we verify if we can do that as well). */
725 if (gimple_visited_p (stmt))
727 if (stmt_info != last_store_info)
728 continue;
730 for (stmt_vec_info &store_info : stores)
732 data_reference *store_dr
733 = STMT_VINFO_DATA_REF (store_info);
734 ddr_p ddr = initialize_data_dependence_relation
735 (dr_a, store_dr, vNULL);
736 dependent
737 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
738 free_dependence_relation (ddr);
739 if (dependent)
740 break;
743 else
745 ddr_p ddr = initialize_data_dependence_relation (dr_a,
746 dr_b, vNULL);
747 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
748 free_dependence_relation (ddr);
750 if (dependent)
751 return false;
755 else /* DR_IS_READ */
757 stmt_vec_info first_access_info
758 = vect_find_first_scalar_stmt_in_slp (node);
759 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
761 stmt_vec_info access_info
762 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
763 if (access_info == first_access_info)
764 continue;
765 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
766 ao_ref ref;
767 bool ref_initialized_p = false;
768 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
769 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
771 gimple *stmt = gsi_stmt (gsi);
772 if (! gimple_vdef (stmt))
773 continue;
775 /* If we couldn't record a (single) data reference for this
776 stmt we have to resort to the alias oracle. */
777 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
778 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
780 /* We are hoisting a load - this means we can use
781 TBAA for disambiguation. */
782 if (!ref_initialized_p)
783 ao_ref_init (&ref, DR_REF (dr_a));
784 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
786 if (!dr_b)
787 return false;
788 /* Resort to dependence checking below. */
790 else
791 /* No dependence. */
792 continue;
794 bool dependent = false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt))
801 if (stmt_info != last_store_info)
802 continue;
804 for (stmt_vec_info &store_info : stores)
806 data_reference *store_dr
807 = STMT_VINFO_DATA_REF (store_info);
808 ddr_p ddr = initialize_data_dependence_relation
809 (dr_a, store_dr, vNULL);
810 dependent
811 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
812 free_dependence_relation (ddr);
813 if (dependent)
814 break;
817 else
819 ddr_p ddr = initialize_data_dependence_relation (dr_a,
820 dr_b, vNULL);
821 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
822 free_dependence_relation (ddr);
824 if (dependent)
825 return false;
829 return true;
833 /* Function vect_slp_analyze_instance_dependence.
835 Examine all the data references in the SLP instance, and make sure there
836 do not exist any data dependences between them that would prevent the
837 instance from being vectorized. */
839 bool
840 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
842 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
844 /* The stores of this instance are at the root of the SLP tree. */
845 slp_tree store = NULL;
846 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
847 store = SLP_INSTANCE_TREE (instance);
849 /* Verify we can sink stores to the vectorized stmt insert location. */
850 stmt_vec_info last_store_info = NULL;
851 if (store)
853 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
854 return false;
856 /* Mark stores in this instance and remember the last one. */
857 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
858 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
859 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
862 bool res = true;
864 /* Verify we can sink loads to the vectorized stmt insert location,
865 special-casing stores of this instance. */
866 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
867 if (! vect_slp_analyze_node_dependences (vinfo, load,
868 store
869 ? SLP_TREE_SCALAR_STMTS (store)
870 : vNULL, last_store_info))
872 res = false;
873 break;
876 /* Unset the visited flag. */
877 if (store)
878 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
879 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
881 return res;
884 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
885 applied. */
888 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
890 HOST_WIDE_INT diff = 0;
891 /* Alignment is only analyzed for the first element of a DR group,
892 use that but adjust misalignment by the offset of the access. */
893 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
895 dr_vec_info *first_dr
896 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
897 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
898 INTEGER_CSTs and the first element in the group has the lowest
899 address. */
900 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
901 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
902 gcc_assert (diff >= 0);
903 dr_info = first_dr;
906 int misalign = dr_info->misalignment;
907 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
908 if (misalign == DR_MISALIGNMENT_UNKNOWN)
909 return misalign;
911 /* If the access is only aligned for a vector type with smaller alignment
912 requirement the access has unknown misalignment. */
913 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
914 targetm.vectorize.preferred_vector_alignment (vectype)))
915 return DR_MISALIGNMENT_UNKNOWN;
917 /* Apply the offset from the DR group start and the externally supplied
918 offset which can for example result from a negative stride access. */
919 poly_int64 misalignment = misalign + diff + offset;
921 /* vect_compute_data_ref_alignment will have ensured that target_alignment
922 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
923 unsigned HOST_WIDE_INT target_alignment_c
924 = dr_info->target_alignment.to_constant ();
925 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
926 return DR_MISALIGNMENT_UNKNOWN;
927 return misalign;
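/* For example, with a 16-byte target alignment, if the first element of
   the group has misalignment 4 and DR_INFO accesses the element 8 bytes
   further on, the value returned is (4 + 8) % 16 == 12; an OFFSET of
   -12, as supplied for a negative-step access, would bring this back to
   (4 + 8 - 12) % 16 == 0.  */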
930 /* Record the base alignment guarantee given by DRB, which occurs
931 in STMT_INFO. */
933 static void
934 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
935 innermost_loop_behavior *drb)
937 bool existed;
938 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
939 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
940 if (!existed || entry.second->base_alignment < drb->base_alignment)
942 entry = std::make_pair (stmt_info, drb);
943 if (dump_enabled_p ())
944 dump_printf_loc (MSG_NOTE, vect_location,
945 "recording new base alignment for %T\n"
946 " alignment: %d\n"
947 " misalignment: %d\n"
948 " based on: %G",
949 drb->base_address,
950 drb->base_alignment,
951 drb->base_misalignment,
952 stmt_info->stmt);
956 /* If the region we're going to vectorize is reached, all unconditional
957 data references occur at least once. We can therefore pool the base
958 alignment guarantees from each unconditional reference. Do this by
959 going through all the data references in VINFO and checking whether
960 the containing statement makes the reference unconditionally. If so,
961 record the alignment of the base address in VINFO so that it can be
962 used for all other references with the same base. */
964 void
965 vect_record_base_alignments (vec_info *vinfo)
967 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
968 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
969 for (data_reference *dr : vinfo->shared->datarefs)
971 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
972 stmt_vec_info stmt_info = dr_info->stmt;
973 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
974 && STMT_VINFO_VECTORIZABLE (stmt_info)
975 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
977 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
979 /* If DR is nested in the loop that is being vectorized, we can also
980 record the alignment of the base wrt the outer loop. */
981 if (loop && nested_in_vect_loop_p (loop, stmt_info))
982 vect_record_base_alignment
983 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
988 /* Function vect_compute_data_ref_alignment
990 Compute the misalignment of the data reference DR_INFO when vectorizing
991 with VECTYPE.
993 Output:
994 1. initialized misalignment info for DR_INFO
996 FOR NOW: No analysis is actually performed. Misalignment is calculated
997 only for trivial cases. TODO. */
999 static void
1000 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1001 tree vectype)
1003 stmt_vec_info stmt_info = dr_info->stmt;
1004 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1005 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1006 class loop *loop = NULL;
1007 tree ref = DR_REF (dr_info->dr);
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_NOTE, vect_location,
1011 "vect_compute_data_ref_alignment:\n");
1013 if (loop_vinfo)
1014 loop = LOOP_VINFO_LOOP (loop_vinfo);
1016 /* Initialize misalignment to unknown. */
1017 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1019 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1020 return;
1022 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1023 bool step_preserves_misalignment_p;
1025 poly_uint64 vector_alignment
1026 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1027 BITS_PER_UNIT);
1028 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1030 /* If the main loop has peeled for alignment we have no way of knowing
1031 whether the data accesses in the epilogues are aligned. We can't at
1032 compile time answer the question whether we have entered the main loop or
1033 not. Fixes PR 92351. */
1034 if (loop_vinfo)
1036 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1037 if (orig_loop_vinfo
1038 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1039 return;
1042 unsigned HOST_WIDE_INT vect_align_c;
1043 if (!vector_alignment.is_constant (&vect_align_c))
1044 return;
1046 /* No step for BB vectorization. */
1047 if (!loop)
1049 gcc_assert (integer_zerop (drb->step));
1050 step_preserves_misalignment_p = true;
1053 /* In case the dataref is in an inner-loop of the loop that is being
1054 vectorized (LOOP), we use the base and misalignment information
1055 relative to the outer-loop (LOOP). This is ok only if the misalignment
1056 stays the same throughout the execution of the inner-loop, which is why
1057 we have to check that the stride of the dataref in the inner-loop evenly
1058 divides by the vector alignment. */
1059 else if (nested_in_vect_loop_p (loop, stmt_info))
1061 step_preserves_misalignment_p
1062 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1064 if (dump_enabled_p ())
1066 if (step_preserves_misalignment_p)
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "inner step divides the vector alignment.\n");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "inner step doesn't divide the vector"
1072 " alignment.\n");
1076 /* Similarly we can only use base and misalignment information relative to
1077 an innermost loop if the misalignment stays the same throughout the
1078 execution of the loop. As above, this is the case if the stride of
1079 the dataref evenly divides by the alignment. */
1080 else
1082 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1083 step_preserves_misalignment_p
1084 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1086 if (!step_preserves_misalignment_p && dump_enabled_p ())
1087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1088 "step doesn't divide the vector alignment.\n");
1091 unsigned int base_alignment = drb->base_alignment;
1092 unsigned int base_misalignment = drb->base_misalignment;
1094 /* Calculate the maximum of the pooled base address alignment and the
1095 alignment that we can compute for DR itself. */
1096 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1097 = base_alignments->get (drb->base_address);
1098 if (entry
1099 && base_alignment < (*entry).second->base_alignment
1100 && (loop_vinfo
1101 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1102 gimple_bb (entry->first->stmt))
1103 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1104 || (entry->first->dr_aux.group <= dr_info->group)))))
1106 base_alignment = entry->second->base_alignment;
1107 base_misalignment = entry->second->base_misalignment;
1110 if (drb->offset_alignment < vect_align_c
1111 || !step_preserves_misalignment_p
1112 /* We need to know whether the step wrt the vectorized loop is
1113 negative when computing the starting misalignment below. */
1114 || TREE_CODE (drb->step) != INTEGER_CST)
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Unknown alignment for access: %T\n", ref);
1119 return;
1122 if (base_alignment < vect_align_c)
1124 unsigned int max_alignment;
1125 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1126 if (max_alignment < vect_align_c
1127 || !vect_can_force_dr_alignment_p (base,
1128 vect_align_c * BITS_PER_UNIT))
1130 if (dump_enabled_p ())
1131 dump_printf_loc (MSG_NOTE, vect_location,
1132 "can't force alignment of ref: %T\n", ref);
1133 return;
1136 /* Force the alignment of the decl.
1137 NOTE: This is the only change to the code we make during
1138 the analysis phase, before deciding to vectorize the loop. */
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_NOTE, vect_location,
1141 "force alignment of %T\n", ref);
1143 dr_info->base_decl = base;
1144 dr_info->base_misaligned = true;
1145 base_misalignment = 0;
1147 poly_int64 misalignment
1148 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1150 unsigned int const_misalignment;
1151 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "Non-constant misalignment for access: %T\n", ref);
1156 return;
1159 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "misalign = %d bytes of ref %T\n",
1164 const_misalignment, ref);
1166 return;
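/* For example, if the base address is known to be 16-byte aligned with
   base_misalignment 0 and DR_INIT is the constant 12, the recorded
   misalignment against a 16-byte vector alignment is (0 + 12) % 16 == 12.
   If the base were only 8-byte aligned and its alignment could not be
   forced, the misalignment would simply stay unknown.  */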
1169 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1170 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1171 is made aligned via peeling. */
1173 static bool
1174 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1175 dr_vec_info *dr_peel_info)
1177 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1178 DR_TARGET_ALIGNMENT (dr_info)))
1180 poly_offset_int diff
1181 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1182 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1183 if (known_eq (diff, 0)
1184 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1185 return true;
1187 return false;
1190 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1191 aligned via peeling. */
1193 static bool
1194 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1195 dr_vec_info *dr_peel_info)
1197 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1198 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1199 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1200 DR_OFFSET (dr_peel_info->dr), 0)
1201 || !operand_equal_p (DR_STEP (dr_info->dr),
1202 DR_STEP (dr_peel_info->dr), 0))
1203 return false;
1205 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 /* Compute the value for dr_info->misalign so that the access appears
1209 aligned. This is used by peeling to compensate for dr_misalignment
1210 applying the offset for negative step. */
1213 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1215 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1216 return 0;
1218 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1219 poly_int64 misalignment
1220 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1221 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1223 unsigned HOST_WIDE_INT target_alignment_c;
1224 int misalign;
1225 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1226 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1227 return DR_MISALIGNMENT_UNKNOWN;
1228 return misalign;
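/* For example, for a negative-step access with a V4SI vector type and a
   16-byte target alignment this returns (4 - 1) * 4 == 12; combined with
   the -12 offset that dr_misalignment applies for such accesses, the
   peeled reference then reports a misalignment of 0.  */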
1231 /* Function vect_update_misalignment_for_peel.
1232 Sets DR_INFO's misalignment
1233 - to 0 if it has the same alignment as DR_PEEL_INFO,
1234 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1235 - to -1 (unknown) otherwise.
1237 DR_INFO - the data reference whose misalignment is to be adjusted.
1238 DR_PEEL_INFO - the data reference whose misalignment is being made
1239 zero in the vector loop by the peel.
1240 NPEEL - the number of iterations in the peel loop if the misalignment
1241 of DR_PEEL_INFO is known at compile time. */
1243 static void
1244 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1245 dr_vec_info *dr_peel_info, int npeel)
1247 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1248 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1250 SET_DR_MISALIGNMENT (dr_info,
1251 vect_dr_misalign_for_aligned_access (dr_peel_info));
1252 return;
1255 unsigned HOST_WIDE_INT alignment;
1256 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1257 && known_alignment_for_access_p (dr_info,
1258 STMT_VINFO_VECTYPE (dr_info->stmt))
1259 && known_alignment_for_access_p (dr_peel_info,
1260 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1262 int misal = dr_info->misalignment;
1263 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1264 misal &= alignment - 1;
1265 set_dr_misalignment (dr_info, misal);
1266 return;
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1271 "to unknown (-1).\n");
1272 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
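/* For example, a DR with known misalignment 8, a 4-byte DR_STEP and a
   constant 16-byte target alignment that is peeled by NPEEL == 2
   iterations ends up with misalignment (8 + 2 * 4) & 15 == 0; when any
   of those quantities is unknown the misalignment is reset to -1 as
   above.  */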
1275 /* Return true if alignment is relevant for DR_INFO. */
1277 static bool
1278 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1280 stmt_vec_info stmt_info = dr_info->stmt;
1282 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1283 return false;
1285 /* For interleaving, only the alignment of the first access matters. */
1286 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1287 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1288 return false;
1290 /* Scatter-gather and invariant accesses continue to address individual
1291 scalars, so vector-level alignment is irrelevant. */
1292 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1293 || integer_zerop (DR_STEP (dr_info->dr)))
1294 return false;
1296 /* Strided accesses perform only component accesses, alignment is
1297 irrelevant for them. */
1298 if (STMT_VINFO_STRIDED_P (stmt_info)
1299 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1300 return false;
1302 return true;
1305 /* Given a memory reference EXP return whether its alignment is less
1306 than its size. */
1308 static bool
1309 not_size_aligned (tree exp)
1311 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1312 return true;
1314 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1315 > get_object_alignment (exp));
1318 /* Function vector_alignment_reachable_p
1320 Return true if vector alignment for DR_INFO is reachable by peeling
1321 a few loop iterations. Return false otherwise. */
1323 static bool
1324 vector_alignment_reachable_p (dr_vec_info *dr_info)
1326 stmt_vec_info stmt_info = dr_info->stmt;
1327 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1329 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1331 /* For interleaved access we peel only if number of iterations in
1332 the prolog loop ({VF - misalignment}), is a multiple of the
1333 number of the interleaved accesses. */
1334 int elem_size, mis_in_elements;
1336 /* FORNOW: handle only known alignment. */
1337 if (!known_alignment_for_access_p (dr_info, vectype))
1338 return false;
1340 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1341 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1342 elem_size = vector_element_size (vector_size, nelements);
1343 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1345 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1346 return false;
1349 /* If misalignment is known at the compile time then allow peeling
1350 only if natural alignment is reachable through peeling. */
1351 if (known_alignment_for_access_p (dr_info, vectype)
1352 && !aligned_access_p (dr_info, vectype))
1354 HOST_WIDE_INT elmsize =
1355 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1356 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_NOTE, vect_location,
1359 "data size = %wd. misalignment = %d.\n", elmsize,
1360 dr_misalignment (dr_info, vectype));
1362 if (dr_misalignment (dr_info, vectype) % elmsize)
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "data size does not divide the misalignment.\n");
1367 return false;
1371 if (!known_alignment_for_access_p (dr_info, vectype))
1373 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1374 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "Unknown misalignment, %snaturally aligned\n",
1378 is_packed ? "not " : "");
1379 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382 return true;
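/* For example, for an interleaved group of size 2 accessed through V8HI
   vectors (8 elements) with a known misalignment of 3 elements, the
   prolog would have to peel 8 - 3 == 5 iterations, which is not a
   multiple of the group size, so alignment is not considered reachable
   by peeling in that case.  */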
1386 /* Calculate the cost of the memory access represented by DR_INFO. */
1388 static void
1389 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1390 dr_alignment_support alignment_support_scheme,
1391 int misalignment,
1392 unsigned int *inside_cost,
1393 unsigned int *outside_cost,
1394 stmt_vector_for_cost *body_cost_vec,
1395 stmt_vector_for_cost *prologue_cost_vec)
1397 stmt_vec_info stmt_info = dr_info->stmt;
1398 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1399 int ncopies;
1401 if (PURE_SLP_STMT (stmt_info))
1402 ncopies = 1;
1403 else
1404 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1406 if (DR_IS_READ (dr_info->dr))
1407 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1408 misalignment, true, inside_cost,
1409 outside_cost, prologue_cost_vec, body_cost_vec, false);
1410 else
1411 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1412 misalignment, inside_cost, body_cost_vec);
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "vect_get_data_access_cost: inside_cost = %d, "
1417 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1421 typedef struct _vect_peel_info
1423 dr_vec_info *dr_info;
1424 int npeel;
1425 unsigned int count;
1426 } *vect_peel_info;
1428 typedef struct _vect_peel_extended_info
1430 vec_info *vinfo;
1431 struct _vect_peel_info peel_info;
1432 unsigned int inside_cost;
1433 unsigned int outside_cost;
1434 } *vect_peel_extended_info;
1437 /* Peeling hashtable helpers. */
1439 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1441 static inline hashval_t hash (const _vect_peel_info *);
1442 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 inline hashval_t
1446 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1448 return (hashval_t) peel_info->npeel;
1451 inline bool
1452 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1454 return (a->npeel == b->npeel);
1458 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1460 static void
1461 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1462 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1463 int npeel, bool supportable_if_not_aligned)
1465 struct _vect_peel_info elem, *slot;
1466 _vect_peel_info **new_slot;
1468 elem.npeel = npeel;
1469 slot = peeling_htab->find (&elem);
1470 if (slot)
1471 slot->count++;
1472 else
1474 slot = XNEW (struct _vect_peel_info);
1475 slot->npeel = npeel;
1476 slot->dr_info = dr_info;
1477 slot->count = 1;
1478 new_slot = peeling_htab->find_slot (slot, INSERT);
1479 *new_slot = slot;
1482 /* If this DR is not supported with unknown misalignment then bias
1483 this slot when the cost model is disabled. */
1484 if (!supportable_if_not_aligned
1485 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1486 slot->count += VECT_MAX_COST;
1490 /* Traverse peeling hash table to find peeling option that aligns maximum
1491 number of data accesses. */
1494 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1495 _vect_peel_extended_info *max)
1497 vect_peel_info elem = *slot;
1499 if (elem->count > max->peel_info.count
1500 || (elem->count == max->peel_info.count
1501 && max->peel_info.npeel > elem->npeel))
1503 max->peel_info.npeel = elem->npeel;
1504 max->peel_info.count = elem->count;
1505 max->peel_info.dr_info = elem->dr_info;
1508 return 1;
1511 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1512 data access costs for all data refs. DR0_INFO, if non-null, is the
1513 data reference that is assumed to have zero misalignment after
1514 peeling NPEEL iterations.
1516 static void
1517 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1518 dr_vec_info *dr0_info,
1519 unsigned int *inside_cost,
1520 unsigned int *outside_cost,
1521 stmt_vector_for_cost *body_cost_vec,
1522 stmt_vector_for_cost *prologue_cost_vec,
1523 unsigned int npeel)
1525 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1527 bool dr0_alignment_known_p
1528 = (dr0_info
1529 && known_alignment_for_access_p (dr0_info,
1530 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1532 for (data_reference *dr : datarefs)
1534 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1535 if (!vect_relevant_for_alignment_p (dr_info))
1536 continue;
1538 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1539 dr_alignment_support alignment_support_scheme;
1540 int misalignment;
1541 unsigned HOST_WIDE_INT alignment;
1543 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1544 size_zero_node) < 0;
1545 poly_int64 off = 0;
1546 if (negative)
1547 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1548 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1550 if (npeel == 0)
1551 misalignment = dr_misalignment (dr_info, vectype, off);
1552 else if (dr_info == dr0_info
1553 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1554 misalignment = 0;
1555 else if (!dr0_alignment_known_p
1556 || !known_alignment_for_access_p (dr_info, vectype)
1557 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1558 misalignment = DR_MISALIGNMENT_UNKNOWN;
1559 else
1561 misalignment = dr_misalignment (dr_info, vectype, off);
1562 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1563 misalignment &= alignment - 1;
1565 alignment_support_scheme
1566 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1567 misalignment);
1569 vect_get_data_access_cost (loop_vinfo, dr_info,
1570 alignment_support_scheme, misalignment,
1571 inside_cost, outside_cost,
1572 body_cost_vec, prologue_cost_vec);
1576 /* Traverse peeling hash table and calculate cost for each peeling option.
1577 Find the one with the lowest cost. */
1580 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1581 _vect_peel_extended_info *min)
1583 vect_peel_info elem = *slot;
1584 int dummy;
1585 unsigned int inside_cost = 0, outside_cost = 0;
1586 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1587 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1588 epilogue_cost_vec;
1590 prologue_cost_vec.create (2);
1591 body_cost_vec.create (2);
1592 epilogue_cost_vec.create (2);
1594 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1595 &outside_cost, &body_cost_vec,
1596 &prologue_cost_vec, elem->npeel);
1598 body_cost_vec.release ();
1600 outside_cost += vect_get_known_peeling_cost
1601 (loop_vinfo, elem->npeel, &dummy,
1602 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1603 &prologue_cost_vec, &epilogue_cost_vec);
1605 /* Prologue and epilogue costs are added to the target model later.
1606 These costs depend only on the scalar iteration cost, the
1607 number of peeling iterations finally chosen, and the number of
1608 misaligned statements. So discard the information found here. */
1609 prologue_cost_vec.release ();
1610 epilogue_cost_vec.release ();
1612 if (inside_cost < min->inside_cost
1613 || (inside_cost == min->inside_cost
1614 && outside_cost < min->outside_cost))
1616 min->inside_cost = inside_cost;
1617 min->outside_cost = outside_cost;
1618 min->peel_info.dr_info = elem->dr_info;
1619 min->peel_info.npeel = elem->npeel;
1620 min->peel_info.count = elem->count;
1623 return 1;
1627 /* Choose best peeling option by traversing peeling hash table and either
1628 choosing an option with the lowest cost (if cost model is enabled) or the
1629 option that aligns as many accesses as possible. */
1631 static struct _vect_peel_extended_info
1632 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1633 loop_vec_info loop_vinfo)
1635 struct _vect_peel_extended_info res;
1637 res.peel_info.dr_info = NULL;
1638 res.vinfo = loop_vinfo;
1640 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1642 res.inside_cost = INT_MAX;
1643 res.outside_cost = INT_MAX;
1644 peeling_htab->traverse <_vect_peel_extended_info *,
1645 vect_peeling_hash_get_lowest_cost> (&res);
1647 else
1649 res.peel_info.count = 0;
1650 peeling_htab->traverse <_vect_peel_extended_info *,
1651 vect_peeling_hash_get_most_frequent> (&res);
1652 res.inside_cost = 0;
1653 res.outside_cost = 0;
1656 return res;
1659 /* Return true if the new peeling NPEEL is supported. */
1661 static bool
1662 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1663 unsigned npeel)
1665 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1666 enum dr_alignment_support supportable_dr_alignment;
1668 bool dr0_alignment_known_p
1669 = known_alignment_for_access_p (dr0_info,
1670 STMT_VINFO_VECTYPE (dr0_info->stmt));
1672 /* Ensure that all data refs can be vectorized after the peel. */
1673 for (data_reference *dr : datarefs)
1675 if (dr == dr0_info->dr)
1676 continue;
1678 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1679 if (!vect_relevant_for_alignment_p (dr_info)
1680 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1681 continue;
1683 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1684 int misalignment;
1685 unsigned HOST_WIDE_INT alignment;
1686 if (!dr0_alignment_known_p
1687 || !known_alignment_for_access_p (dr_info, vectype)
1688 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1689 misalignment = DR_MISALIGNMENT_UNKNOWN;
1690 else
1692 misalignment = dr_misalignment (dr_info, vectype);
1693 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1694 misalignment &= alignment - 1;
1696 supportable_dr_alignment
1697 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1698 misalignment);
1699 if (supportable_dr_alignment == dr_unaligned_unsupported)
1700 return false;
1703 return true;
1706 /* Compare two data-references DRA and DRB to group them into chunks
1707 with related alignment. */
1709 static int
1710 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1712 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1713 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1714 int cmp;
1716 /* Stabilize sort. */
1717 if (dra == drb)
1718 return 0;
1720 /* Ordering of DRs according to base. */
1721 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1722 DR_BASE_ADDRESS (drb));
1723 if (cmp != 0)
1724 return cmp;
1726 /* And according to DR_OFFSET. */
1727 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1728 if (cmp != 0)
1729 return cmp;
1731 /* And after step. */
1732 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1733 if (cmp != 0)
1734 return cmp;
1736 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1737 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1738 if (cmp == 0)
1739 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1740 return cmp;
1743 /* Function vect_enhance_data_refs_alignment
1745 This pass will use loop versioning and loop peeling in order to enhance
1746 the alignment of data references in the loop.
1748 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1749 original loop is to be vectorized. Any other loops that are created by
1750 the transformations performed in this pass are not supposed to be
1751 vectorized. This restriction will be relaxed.
1753 This pass will require a cost model to guide it whether to apply peeling
1754 or versioning or a combination of the two. For example, the scheme that
1755 Intel uses when given a loop with several memory accesses is as follows:
1756 choose one memory access ('p') whose alignment you want to force by doing
1757 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1758 other accesses are not necessarily aligned, or (2) use loop versioning to
1759 generate one loop in which all accesses are aligned, and another loop in
1760 which only 'p' is necessarily aligned.
1762 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1763 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1764 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1766 Devising a cost model is the most critical aspect of this work. It will
1767 guide us on which access to peel for, whether to use loop versioning, how
1768 many versions to create, etc. The cost model will probably consist of
1769 generic considerations as well as target specific considerations (on
1770 powerpc for example, misaligned stores are more painful than misaligned
1771 loads).
1773 Here are the general steps involved in alignment enhancements:
1775 -- original loop, before alignment analysis:
1776 for (i=0; i<N; i++){
1777 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1778 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1781 -- After vect_compute_data_refs_alignment:
1782 for (i=0; i<N; i++){
1783 x = q[i]; # DR_MISALIGNMENT(q) = 3
1784 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1787 -- Possibility 1: we do loop versioning:
1788 if (p is aligned) {
1789 for (i=0; i<N; i++){ # loop 1A
1790 x = q[i]; # DR_MISALIGNMENT(q) = 3
1791 p[i] = y; # DR_MISALIGNMENT(p) = 0
1794 else {
1795 for (i=0; i<N; i++){ # loop 1B
1796 x = q[i]; # DR_MISALIGNMENT(q) = 3
1797 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1801 -- Possibility 2: we do loop peeling:
1802 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1803 x = q[i];
1804 p[i] = y;
1806 for (i = 3; i < N; i++){ # loop 2A
1807 x = q[i]; # DR_MISALIGNMENT(q) = 0
1808 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1811 -- Possibility 3: combination of loop peeling and versioning:
1812 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1813 x = q[i];
1814 p[i] = y;
1816 if (p is aligned) {
1817 for (i = 3; i<N; i++){ # loop 3A
1818 x = q[i]; # DR_MISALIGNMENT(q) = 0
1819 p[i] = y; # DR_MISALIGNMENT(p) = 0
1822 else {
1823 for (i = 3; i<N; i++){ # loop 3B
1824 x = q[i]; # DR_MISALIGNMENT(q) = 0
1825 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1829 These loops are later passed to loop_transform to be vectorized. The
1830 vectorizer will use the alignment information to guide the transformation
1831 (whether to generate regular loads/stores, or with special handling for
1832 misalignment). */
1834 opt_result
1835 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838 dr_vec_info *first_store = NULL;
1839 dr_vec_info *dr0_info = NULL;
1840 struct data_reference *dr;
1841 unsigned int i;
1842 bool do_peeling = false;
1843 bool do_versioning = false;
1844 unsigned int npeel = 0;
1845 bool one_misalignment_known = false;
1846 bool one_misalignment_unknown = false;
1847 bool one_dr_unsupportable = false;
1848 dr_vec_info *unsupportable_dr_info = NULL;
1849 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1850 hash_table<peel_info_hasher> peeling_htab (1);
1852 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1854 /* Reset data so we can safely be called multiple times. */
1855 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1858 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1859 return opt_result::success ();
1861 /* Sort the vector of datarefs so DRs that have the same or dependent
1862 alignment are next to each other. */
1863 auto_vec<data_reference_p> datarefs
1864 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1865 datarefs.qsort (dr_align_group_sort_cmp);
1867 /* For each dataref, compute the number of other datarefs that become
1868 aligned when we peel so that this dataref becomes aligned.  */
1869 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1870 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1871 unsigned i0;
1872 for (i0 = 0; i0 < datarefs.length (); ++i0)
1873 if (DR_BASE_ADDRESS (datarefs[i0]))
1874 break;
1875 for (i = i0 + 1; i <= datarefs.length (); ++i)
1877 if (i == datarefs.length ()
1878 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1879 DR_BASE_ADDRESS (datarefs[i]), 0)
1880 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1881 DR_OFFSET (datarefs[i]), 0)
1882 || !operand_equal_p (DR_STEP (datarefs[i0]),
1883 DR_STEP (datarefs[i]), 0))
1885 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1886 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1887 will get known misalignment if we align one of the refs
1888 with the largest DR_TARGET_ALIGNMENT. */
1889 for (unsigned j = i0; j < i; ++j)
1891 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1892 for (unsigned k = i0; k < i; ++k)
1894 if (k == j)
1895 continue;
1896 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1897 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1898 dr_infoj))
1899 n_same_align_refs[j]++;
1902 i0 = i;
1906 /* While cost model enhancements are expected in the future, the high level
1907 view of the code at this time is as follows:
1909 A) If there is a misaligned access then see if peeling to align
1910 this access can make all data references satisfy
1911 vect_supportable_dr_alignment. If so, update data structures
1912 as needed and return true.
1914 B) If peeling wasn't possible and there is a data reference with an
1915 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1916 then see if loop versioning checks can be used to make all data
1917 references satisfy vect_supportable_dr_alignment. If so, update
1918 data structures as needed and return true.
1920 C) If neither peeling nor versioning were successful then return false if
1921 any data reference does not satisfy vect_supportable_dr_alignment.
1923 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1925 Note, Possibility 3 above (which is peeling and versioning together) is not
1926 being done at this time. */
1928 /* (1) Peeling to force alignment. */
1930 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1931 Considerations:
1932 + How many accesses will become aligned due to the peeling
1933 - How many accesses will become unaligned due to the peeling,
1934 and the cost of misaligned accesses.
1935 - The cost of peeling (the extra runtime checks, the increase
1936 in code size). */
1938 FOR_EACH_VEC_ELT (datarefs, i, dr)
1940 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1941 if (!vect_relevant_for_alignment_p (dr_info))
1942 continue;
1944 stmt_vec_info stmt_info = dr_info->stmt;
1945 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1946 do_peeling = vector_alignment_reachable_p (dr_info);
1947 if (do_peeling)
1949 if (known_alignment_for_access_p (dr_info, vectype))
1951 unsigned int npeel_tmp = 0;
1952 bool negative = tree_int_cst_compare (DR_STEP (dr),
1953 size_zero_node) < 0;
1955 /* If known_alignment_for_access_p then we have set
1956 DR_MISALIGNMENT which is only done if we know it at compile
1957 time, so it is safe to assume target alignment is constant.  */
1959 unsigned int target_align =
1960 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1961 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1962 poly_int64 off = 0;
1963 if (negative)
1964 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
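/* For example (illustration only): for a V4SI-like vector of four
   4-byte elements accessed with a negative step, OFF becomes
   3 * -4 = -12, because the vector access covers the 12 bytes
   below the scalar address.  */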
1965 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1966 mis = negative ? mis : -mis;
1967 if (mis != 0)
1968 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1970 /* For multiple types, it is possible that the bigger type access
1971 will have more than one peeling option. E.g., a loop with two
1972 types: one of size (vector size / 4), and the other one of
1973 size (vector size / 8). Vectorization factor will be 8. If both
1974 accesses are misaligned by 3, the first one needs one scalar
1975 iteration to be aligned, and the second one needs 5. But the
1976 first one will also be aligned by peeling 5 scalar
1977 iterations, and in that case both accesses will be aligned.
1978 Hence, except for the immediate peeling amount, we also want
1979 to try to add full vector size, while we don't exceed
1980 vectorization factor.
1981 We do this automatically for cost model, since we calculate
1982 cost for every peeling option. */
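/* A hypothetical illustration of the insertion loop below: with a byte
   misalignment of 4, a 16-byte target alignment and 4-byte elements,
   NPEEL_TMP starts at (-4 & 15) / 4 = 3 and is then stepped by
   16 / 4 = 4, so with NSCALARS == 8 the peeling amounts 3 and 7 are
   recorded in the hash table.  */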
1983 poly_uint64 nscalars = npeel_tmp;
1984 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1986 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1987 nscalars = (STMT_SLP_TYPE (stmt_info)
1988 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991 /* Save info about DR in the hash table. Also include peeling
1992 amounts according to the explanation above. Indicate
1993 the alignment status when the ref is not aligned.
1994 ??? Rather than using unknown alignment here we should
1995 prune all entries from the peeling hashtable which cause
1996 DRs to be not supported. */
1997 bool supportable_if_not_aligned
1998 = vect_supportable_dr_alignment
1999 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2000 while (known_le (npeel_tmp, nscalars))
2002 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2003 dr_info, npeel_tmp,
2004 supportable_if_not_aligned);
2005 npeel_tmp += MAX (1, target_align / dr_size);
2008 one_misalignment_known = true;
2010 else
2012 /* If we don't know any misalignment values, we prefer
2013 peeling for the data-ref that has the maximum number of data-refs
2014 with the same alignment, unless the target prefers to align
2015 stores over loads.  */
2016 unsigned same_align_drs = n_same_align_refs[i];
2017 if (!dr0_info
2018 || dr0_same_align_drs < same_align_drs)
2020 dr0_same_align_drs = same_align_drs;
2021 dr0_info = dr_info;
2023 /* For data-refs with the same number of related
2024 accesses prefer the one where the misalign
2025 computation will be invariant in the outermost loop. */
2026 else if (dr0_same_align_drs == same_align_drs)
2028 class loop *ivloop0, *ivloop;
2029 ivloop0 = outermost_invariant_loop_for_expr
2030 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2031 ivloop = outermost_invariant_loop_for_expr
2032 (loop, DR_BASE_ADDRESS (dr));
2033 if ((ivloop && !ivloop0)
2034 || (ivloop && ivloop0
2035 && flow_loop_nested_p (ivloop, ivloop0)))
2036 dr0_info = dr_info;
2039 one_misalignment_unknown = true;
2041 /* Check for data refs with unsupportable alignment that
2042 can be peeled. */
2043 enum dr_alignment_support supportable_dr_alignment
2044 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2045 DR_MISALIGNMENT_UNKNOWN);
2046 if (supportable_dr_alignment == dr_unaligned_unsupported)
2048 one_dr_unsupportable = true;
2049 unsupportable_dr_info = dr_info;
2052 if (!first_store && DR_IS_WRITE (dr))
2054 first_store = dr_info;
2055 first_store_same_align_drs = same_align_drs;
2059 else
2061 if (!aligned_access_p (dr_info, vectype))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2065 "vector alignment may not be reachable\n");
2066 break;
2071 /* Check if we can possibly peel the loop. */
2072 if (!vect_can_advance_ivs_p (loop_vinfo)
2073 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2074 || loop->inner)
2075 do_peeling = false;
2077 struct _vect_peel_extended_info peel_for_known_alignment;
2078 struct _vect_peel_extended_info peel_for_unknown_alignment;
2079 struct _vect_peel_extended_info best_peel;
2081 peel_for_unknown_alignment.inside_cost = INT_MAX;
2082 peel_for_unknown_alignment.outside_cost = INT_MAX;
2083 peel_for_unknown_alignment.peel_info.count = 0;
2085 if (do_peeling
2086 && one_misalignment_unknown)
2088 /* Check if the target requires us to prefer stores over loads, i.e., if
2089 misaligned stores are more expensive than misaligned loads (taking
2090 drs with same alignment into account). */
2091 unsigned int load_inside_cost = 0;
2092 unsigned int load_outside_cost = 0;
2093 unsigned int store_inside_cost = 0;
2094 unsigned int store_outside_cost = 0;
2095 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2097 stmt_vector_for_cost dummy;
2098 dummy.create (2);
2099 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2100 &load_inside_cost,
2101 &load_outside_cost,
2102 &dummy, &dummy, estimated_npeels);
2103 dummy.release ();
2105 if (first_store)
2107 dummy.create (2);
2108 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2109 &store_inside_cost,
2110 &store_outside_cost,
2111 &dummy, &dummy,
2112 estimated_npeels);
2113 dummy.release ();
2115 else
2117 store_inside_cost = INT_MAX;
2118 store_outside_cost = INT_MAX;
2121 if (load_inside_cost > store_inside_cost
2122 || (load_inside_cost == store_inside_cost
2123 && load_outside_cost > store_outside_cost))
2125 dr0_info = first_store;
2126 dr0_same_align_drs = first_store_same_align_drs;
2127 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2128 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2130 else
2132 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2133 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2137 prologue_cost_vec.create (2);
2138 epilogue_cost_vec.create (2);
2140 int dummy2;
2141 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2142 (loop_vinfo, estimated_npeels, &dummy2,
2143 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2144 &prologue_cost_vec, &epilogue_cost_vec);
2146 prologue_cost_vec.release ();
2147 epilogue_cost_vec.release ();
2149 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152 peel_for_unknown_alignment.peel_info.npeel = 0;
2153 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2155 best_peel = peel_for_unknown_alignment;
2157 peel_for_known_alignment.inside_cost = INT_MAX;
2158 peel_for_known_alignment.outside_cost = INT_MAX;
2159 peel_for_known_alignment.peel_info.count = 0;
2160 peel_for_known_alignment.peel_info.dr_info = NULL;
2162 if (do_peeling && one_misalignment_known)
2164 /* Peeling is possible, and no data access requires alignment in order
2165 to be supported.  So we try to choose the best possible peeling from
2166 the hash table.  */
2167 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2168 (&peeling_htab, loop_vinfo);
2171 /* Compare costs of peeling for known and unknown alignment. */
2172 if (peel_for_known_alignment.peel_info.dr_info != NULL
2173 && peel_for_unknown_alignment.inside_cost
2174 >= peel_for_known_alignment.inside_cost)
2176 best_peel = peel_for_known_alignment;
2178 /* If the best peeling for known alignment has NPEEL == 0, perform no
2179 peeling at all except if there is an unsupportable dr that we can
2180 align. */
2181 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2182 do_peeling = false;
2185 /* If there is an unsupportable data ref, prefer it over all choices so far
2186 since we'd have to discard any chosen peeling except when it accidentally
2187 aligned the unsupportable data ref.  */
2188 if (one_dr_unsupportable)
2189 dr0_info = unsupportable_dr_info;
2190 else if (do_peeling)
2192 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2193 TODO: Use nopeel_outside_cost or get rid of it? */
2194 unsigned nopeel_inside_cost = 0;
2195 unsigned nopeel_outside_cost = 0;
2197 stmt_vector_for_cost dummy;
2198 dummy.create (2);
2199 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2200 &nopeel_outside_cost, &dummy, &dummy, 0);
2201 dummy.release ();
2203 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2204 costs will be recorded. */
2205 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2206 prologue_cost_vec.create (2);
2207 epilogue_cost_vec.create (2);
2209 int dummy2;
2210 nopeel_outside_cost += vect_get_known_peeling_cost
2211 (loop_vinfo, 0, &dummy2,
2212 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2213 &prologue_cost_vec, &epilogue_cost_vec);
2215 prologue_cost_vec.release ();
2216 epilogue_cost_vec.release ();
2218 npeel = best_peel.peel_info.npeel;
2219 dr0_info = best_peel.peel_info.dr_info;
2221 /* If not peeling is no more expensive than the best peeling we
2222 have so far, don't perform any peeling.  */
2223 if (nopeel_inside_cost <= best_peel.inside_cost)
2224 do_peeling = false;
2227 if (do_peeling)
2229 stmt_vec_info stmt_info = dr0_info->stmt;
2230 if (known_alignment_for_access_p (dr0_info,
2231 STMT_VINFO_VECTYPE (stmt_info)))
2233 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2234 size_zero_node) < 0;
2235 if (!npeel)
2237 /* Since it's known at compile time, compute the number of
2238 iterations in the peeled loop (the peeling factor) for use in
2239 updating DR_MISALIGNMENT values. The peeling factor is the
2240 vectorization factor minus the misalignment as an element
2241 count. */
2242 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2243 poly_int64 off = 0;
2244 if (negative)
2245 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2246 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2247 unsigned int mis
2248 = dr_misalignment (dr0_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 /* If known_alignment_for_access_p then we have set
2251 DR_MISALIGNMENT which is only done if we know it at compile
2252 time, so it is safe to assume target alignment is constant.  */
2254 unsigned int target_align =
2255 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2256 npeel = ((mis & (target_align - 1))
2257 / vect_get_scalar_dr_size (dr0_info));
2260 /* For interleaved data access every iteration accesses all the
2261 members of the group, therefore we divide the number of iterations
2262 by the group size. */
2263 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2264 npeel /= DR_GROUP_SIZE (stmt_info);
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Try peeling by %d\n", npeel);
2271 /* Ensure that all datarefs can be vectorized after the peel. */
2272 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2273 do_peeling = false;
2275 /* Check if all datarefs are supportable and log. */
2276 if (do_peeling
2277 && npeel == 0
2278 && known_alignment_for_access_p (dr0_info,
2279 STMT_VINFO_VECTYPE (stmt_info)))
2280 return opt_result::success ();
2282 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2283 if (do_peeling)
2285 unsigned max_allowed_peel
2286 = param_vect_max_peeling_for_alignment;
2287 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2288 max_allowed_peel = 0;
2289 if (max_allowed_peel != (unsigned)-1)
2291 unsigned max_peel = npeel;
2292 if (max_peel == 0)
2294 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2295 unsigned HOST_WIDE_INT target_align_c;
2296 if (target_align.is_constant (&target_align_c))
2297 max_peel =
2298 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2299 else
2301 do_peeling = false;
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "Disable peeling, max peels set and vector"
2305 " alignment unknown\n");
2308 if (max_peel > max_allowed_peel)
2310 do_peeling = false;
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "Disable peeling, max peels reached: %d\n", max_peel);
2318 /* Cost model #2 - if peeling may result in a remaining loop not
2319 iterating enough to be vectorized then do not peel. Since this
2320 is a cost heuristic rather than a correctness decision, use the
2321 most likely runtime value for variable vectorization factors. */
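/* For instance (numbers purely illustrative): with an assumed VF of 4
   and NPEEL == 0, MAX_PEEL is 3, so a loop known to run only 6
   iterations (6 < 4 + 3) is not peeled.  */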
2322 if (do_peeling
2323 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2325 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2326 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2327 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2328 < assumed_vf + max_peel)
2329 do_peeling = false;
2332 if (do_peeling)
2334 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2335 If the misalignment of DR_i is identical to that of dr0 then set
2336 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2337 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2338 by the peeling factor times the element size of DR_i (MOD the
2339 vectorization factor times the size). Otherwise, the
2340 misalignment of DR_i must be set to unknown. */
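/* A sketch of the update with hypothetical numbers: peeling NPEEL == 3
   iterations for a 4-byte DR_i with known misalignment 4 and a 16-byte
   target alignment gives (4 + 3 * 4) & 15 = 0, while a DR_i whose
   misalignment is unknown simply stays unknown.  */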
2341 FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 if (dr != dr0_info->dr)
2344 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2345 if (!vect_relevant_for_alignment_p (dr_info))
2346 continue;
2348 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2352 if (npeel)
2353 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2354 else
2355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2356 SET_DR_MISALIGNMENT (dr0_info,
2357 vect_dr_misalign_for_aligned_access (dr0_info));
2358 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location,
2361 "Alignment of access forced using peeling.\n");
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "Peeling for alignment will be applied.\n");
2366 /* The inside-loop cost will be accounted for in vectorizable_load
2367 and vectorizable_store correctly with adjusted alignments.
2368 Drop the body_cst_vec on the floor here. */
2369 return opt_result::success ();
2373 /* (2) Versioning to force alignment. */
2375 /* Try versioning if:
2376 1) optimize loop for speed and the cost-model is not cheap
2377 2) there is at least one unsupported misaligned data ref with an unknown
2378 misalignment, and
2379 3) all misaligned data refs with a known misalignment are supported, and
2380 4) the number of runtime alignment checks is within reason. */
2382 do_versioning
2383 = (optimize_loop_nest_for_speed_p (loop)
2384 && !loop->inner /* FORNOW */
2385 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2387 if (do_versioning)
2389 FOR_EACH_VEC_ELT (datarefs, i, dr)
2391 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2392 if (!vect_relevant_for_alignment_p (dr_info))
2393 continue;
2395 stmt_vec_info stmt_info = dr_info->stmt;
2396 if (STMT_VINFO_STRIDED_P (stmt_info))
2398 do_versioning = false;
2399 break;
2402 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2403 bool negative = tree_int_cst_compare (DR_STEP (dr),
2404 size_zero_node) < 0;
2405 poly_int64 off = 0;
2406 if (negative)
2407 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2408 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2409 int misalignment;
2410 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2411 continue;
2413 enum dr_alignment_support supportable_dr_alignment
2414 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2415 misalignment);
2416 if (supportable_dr_alignment == dr_unaligned_unsupported)
2418 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2419 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2420 >= (unsigned) param_vect_max_version_for_alignment_checks))
2422 do_versioning = false;
2423 break;
2426 /* At present we don't support versioning for alignment
2427 with variable VF, since there's no guarantee that the
2428 VF is a power of two. We could relax this if we added
2429 a way of enforcing a power-of-two size. */
2430 unsigned HOST_WIDE_INT size;
2431 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2433 do_versioning = false;
2434 break;
2437 /* Forcing alignment in the first iteration is no good if
2438 we don't keep it across iterations. For now, just disable
2439 versioning in this case.
2440 ??? We could actually unroll the loop to achieve the required
2441 overall step alignment, and forcing the alignment could be
2442 done by doing some iterations of the non-vectorized loop. */
2443 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2444 * DR_STEP_ALIGNMENT (dr),
2445 DR_TARGET_ALIGNMENT (dr_info)))
2447 do_versioning = false;
2448 break;
2451 /* The rightmost bits of an aligned address must be zeros.
2452 Construct the mask needed for this test. For example,
2453 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2454 mask must be 15 = 0xf. */
2455 int mask = size - 1;
2457 /* FORNOW: use the same mask to test all potentially unaligned
2458 references in the loop. */
2459 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2460 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2462 do_versioning = false;
2463 break;
2466 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2467 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2471 /* Versioning requires at least one misaligned data reference. */
2472 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2473 do_versioning = false;
2474 else if (!do_versioning)
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478 if (do_versioning)
2480 const vec<stmt_vec_info> &may_misalign_stmts
2481 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2482 stmt_vec_info stmt_info;
2484 /* It can now be assumed that the data references in the statements
2485 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2486 of the loop being vectorized. */
2487 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2489 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2490 SET_DR_MISALIGNMENT (dr_info,
2491 vect_dr_misalign_for_aligned_access (dr_info));
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE, vect_location,
2494 "Alignment of access forced using versioning.\n");
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Versioning for alignment will be applied.\n");
2501 /* Peeling and versioning can't be done together at this time. */
2502 gcc_assert (! (do_peeling && do_versioning));
2504 return opt_result::success ();
2507 /* This point is reached if neither peeling nor versioning is being done. */
2508 gcc_assert (! (do_peeling || do_versioning));
2510 return opt_result::success ();
2514 /* Function vect_analyze_data_refs_alignment
2516 Analyze the alignment of the data-references in the loop.
2517 Return FALSE if a data reference is found that cannot be vectorized. */
2519 opt_result
2520 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2522 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2524 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2525 struct data_reference *dr;
2526 unsigned int i;
2528 vect_record_base_alignments (loop_vinfo);
2529 FOR_EACH_VEC_ELT (datarefs, i, dr)
2531 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2532 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2534 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2535 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2536 continue;
2537 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2538 STMT_VINFO_VECTYPE (dr_info->stmt));
2542 return opt_result::success ();
2546 /* Analyze alignment of DRs of stmts in NODE. */
2548 static bool
2549 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2551 /* Alignment is maintained in the first element of the group. */
2552 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2553 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2554 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2555 tree vectype = SLP_TREE_VECTYPE (node);
2556 poly_uint64 vector_alignment
2557 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2558 BITS_PER_UNIT);
2559 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2560 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2561 /* Re-analyze alignment when we're facing a vectorization with a bigger
2562 alignment requirement. */
2563 else if (known_lt (dr_info->target_alignment, vector_alignment))
2565 poly_uint64 old_target_alignment = dr_info->target_alignment;
2566 int old_misalignment = dr_info->misalignment;
2567 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2568 /* But keep knowledge about a smaller alignment. */
2569 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2570 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2572 dr_info->target_alignment = old_target_alignment;
2573 dr_info->misalignment = old_misalignment;
2576 /* If we ever face unordered target alignments, the first one analyzed wins
2577 and the other will become unknown in dr_misalignment.  */
2578 return true;
2581 /* Function vect_slp_analyze_instance_alignment
2583 Analyze the alignment of the data-references in the SLP instance.
2584 Return FALSE if a data reference is found that cannot be vectorized. */
2586 bool
2587 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2588 slp_instance instance)
2590 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2592 slp_tree node;
2593 unsigned i;
2594 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2595 if (! vect_slp_analyze_node_alignment (vinfo, node))
2596 return false;
2598 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2599 && ! vect_slp_analyze_node_alignment
2600 (vinfo, SLP_INSTANCE_TREE (instance)))
2601 return false;
2603 return true;
2607 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2608 accesses of legal size, step, etc. Detect gaps, single element
2609 interleaving, and other special cases. Set grouped access info.
2610 Collect groups of strided stores for further use in SLP analysis.
2611 Worker for vect_analyze_group_access. */
2613 static bool
2614 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2616 data_reference *dr = dr_info->dr;
2617 tree step = DR_STEP (dr);
2618 tree scalar_type = TREE_TYPE (DR_REF (dr));
2619 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2620 stmt_vec_info stmt_info = dr_info->stmt;
2621 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2622 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2623 HOST_WIDE_INT dr_step = -1;
2624 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2625 bool slp_impossible = false;
2627 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2628 size of the interleaving group (including gaps). */
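/* A small worked example (not from any particular testcase): a DR with a
   DR_STEP of 32 bytes over 4-byte elements gives GROUPSIZE == 8; if only
   six members of that group are actually accessed, the remaining two
   element slots are gaps.  */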
2629 if (tree_fits_shwi_p (step))
2631 dr_step = tree_to_shwi (step);
2632 /* Check that STEP is a multiple of type size. Otherwise there is
2633 a non-element-sized gap at the end of the group which we
2634 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2635 ??? As we can handle non-constant step fine here we should
2636 simply remove uses of DR_GROUP_GAP between the last and first
2637 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2638 simply not include that gap. */
2639 if ((dr_step % type_size) != 0)
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_NOTE, vect_location,
2643 "Step %T is not a multiple of the element size"
2644 " for %T\n",
2645 step, DR_REF (dr));
2646 return false;
2648 groupsize = absu_hwi (dr_step) / type_size;
2650 else
2651 groupsize = 0;
2653 /* A non-consecutive access is possible only if it is part of interleaving.  */
2654 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2656 /* Check if this DR is a part of interleaving, and is a single
2657 element of the group that is accessed in the loop. */
2659 /* Gaps are supported only for loads. STEP must be a multiple of the type
2660 size. */
2661 if (DR_IS_READ (dr)
2662 && (dr_step % type_size) == 0
2663 && groupsize > 0
2664 /* This could be UINT_MAX but as we are generating code in a very
2665 inefficient way we have to cap earlier.
2666 See PR91403 for example. */
2667 && groupsize <= 4096)
2669 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2670 DR_GROUP_SIZE (stmt_info) = groupsize;
2671 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_NOTE, vect_location,
2674 "Detected single element interleaving %T"
2675 " step %T\n",
2676 DR_REF (dr), step);
2678 return true;
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2683 "not consecutive access %G", stmt_info->stmt);
2685 if (bb_vinfo)
2687 /* Mark the statement as unvectorizable. */
2688 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2689 return true;
2692 if (dump_enabled_p ())
2693 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2694 STMT_VINFO_STRIDED_P (stmt_info) = true;
2695 return true;
2698 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2700 /* First stmt in the interleaving chain. Check the chain. */
2701 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2702 struct data_reference *data_ref = dr;
2703 unsigned int count = 1;
2704 tree prev_init = DR_INIT (data_ref);
2705 HOST_WIDE_INT diff, gaps = 0;
2707 /* By construction, all group members have INTEGER_CST DR_INITs. */
2708 while (next)
2710 /* We never have the same DR multiple times. */
2711 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2712 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2714 data_ref = STMT_VINFO_DATA_REF (next);
2716 /* All group members have the same STEP by construction. */
2717 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2719 /* Check that the distance between two accesses is equal to the type
2720 size. Otherwise, we have gaps. */
2721 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2722 - TREE_INT_CST_LOW (prev_init)) / type_size;
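/* Hypothetical illustration: with 4-byte elements, PREV_INIT == 0 and
   DR_INIT == 8 give DIFF == 2, i.e. a gap of one element between the
   two group members, recorded as DR_GROUP_GAP == 2 below.  */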
2723 if (diff < 1 || diff > UINT_MAX)
2725 /* For artificial testcases with array accesses with large
2726 constant indices we can run into overflow issues which
2727 can end up fooling the groupsize constraint below so
2728 check the individual gaps (which are represented as
2729 unsigned int) as well. */
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2732 "interleaved access with gap larger "
2733 "than representable\n");
2734 return false;
2736 if (diff != 1)
2738 /* FORNOW: SLP of accesses with gaps is not supported. */
2739 slp_impossible = true;
2740 if (DR_IS_WRITE (data_ref))
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "interleaved store with gaps\n");
2745 return false;
2748 gaps += diff - 1;
2751 last_accessed_element += diff;
2753 /* Store the gap from the previous member of the group. If there is no
2754 gap in the access, DR_GROUP_GAP is always 1. */
2755 DR_GROUP_GAP (next) = diff;
2757 prev_init = DR_INIT (data_ref);
2758 next = DR_GROUP_NEXT_ELEMENT (next);
2759 /* Count the number of data-refs in the chain. */
2760 count++;
2763 if (groupsize == 0)
2764 groupsize = count + gaps;
2766 /* This could be UINT_MAX but as we are generating code in a very
2767 inefficient way we have to cap earlier. See PR78699 for example. */
2768 if (groupsize > 4096)
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2772 "group is too large\n");
2773 return false;
2776 /* Check that the size of the interleaving is equal to count for stores,
2777 i.e., that there are no gaps. */
2778 if (groupsize != count
2779 && !DR_IS_READ (dr))
2781 groupsize = count;
2782 STMT_VINFO_STRIDED_P (stmt_info) = true;
2785 /* If there is a gap after the last load in the group it is the
2786 difference between the groupsize and the last accessed
2787 element.
2788 When there is no gap, this difference should be 0. */
2789 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2791 DR_GROUP_SIZE (stmt_info) = groupsize;
2792 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE, vect_location,
2795 "Detected interleaving ");
2796 if (DR_IS_READ (dr))
2797 dump_printf (MSG_NOTE, "load ");
2798 else if (STMT_VINFO_STRIDED_P (stmt_info))
2799 dump_printf (MSG_NOTE, "strided store ");
2800 else
2801 dump_printf (MSG_NOTE, "store ");
2802 dump_printf (MSG_NOTE, "of size %u\n",
2803 (unsigned)groupsize);
2804 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2805 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2806 while (next)
2808 if (DR_GROUP_GAP (next) != 1)
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "\t<gap of %d elements>\n",
2811 DR_GROUP_GAP (next) - 1);
2812 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2813 next = DR_GROUP_NEXT_ELEMENT (next);
2815 if (DR_GROUP_GAP (stmt_info) != 0)
2816 dump_printf_loc (MSG_NOTE, vect_location,
2817 "\t<gap of %d elements>\n",
2818 DR_GROUP_GAP (stmt_info));
2821 /* SLP: create an SLP data structure for every interleaving group of
2822 stores for further analysis in vect_analyze_slp.  */
2823 if (DR_IS_WRITE (dr) && !slp_impossible)
2825 if (loop_vinfo)
2826 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2827 if (bb_vinfo)
2828 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2832 return true;
2835 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2836 accesses of legal size, step, etc. Detect gaps, single element
2837 interleaving, and other special cases. Set grouped access info.
2838 Collect groups of strided stores for further use in SLP analysis. */
2840 static bool
2841 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2843 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2845 /* Dissolve the group if present. */
2846 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2847 while (stmt_info)
2849 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2850 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2851 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2852 stmt_info = next;
2854 return false;
2856 return true;
2859 /* Analyze the access pattern of the data-reference DR_INFO.
2860 In case of non-consecutive accesses call vect_analyze_group_access() to
2861 analyze groups of accesses. */
2863 static bool
2864 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2866 data_reference *dr = dr_info->dr;
2867 tree step = DR_STEP (dr);
2868 tree scalar_type = TREE_TYPE (DR_REF (dr));
2869 stmt_vec_info stmt_info = dr_info->stmt;
2870 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2871 class loop *loop = NULL;
2873 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2874 return true;
2876 if (loop_vinfo)
2877 loop = LOOP_VINFO_LOOP (loop_vinfo);
2879 if (loop_vinfo && !step)
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2883 "bad data-ref access in loop\n");
2884 return false;
2887 /* Allow loads with zero step in inner-loop vectorization. */
2888 if (loop_vinfo && integer_zerop (step))
2890 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2891 if (!nested_in_vect_loop_p (loop, stmt_info))
2892 return DR_IS_READ (dr);
2893 /* Allow references with zero step for outer loops marked
2894 with pragma omp simd only - it guarantees absence of
2895 loop-carried dependencies between inner loop iterations. */
2896 if (loop->safelen < 2)
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "zero step in inner loop of nest\n");
2901 return false;
2905 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2907 /* Interleaved accesses are not yet supported within outer-loop
2908 vectorization for references in the inner-loop. */
2909 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2911 /* For the rest of the analysis we use the outer-loop step. */
2912 step = STMT_VINFO_DR_STEP (stmt_info);
2913 if (integer_zerop (step))
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "zero step in outer loop.\n");
2918 return DR_IS_READ (dr);
2922 /* Consecutive? */
2923 if (TREE_CODE (step) == INTEGER_CST)
2925 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2926 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2927 || (dr_step < 0
2928 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2930 /* Mark that it is not interleaving. */
2931 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2932 return true;
2936 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 "grouped access in outer loop.\n");
2941 return false;
2945 /* Assume this is a DR handled by non-constant strided load case. */
2946 if (TREE_CODE (step) != INTEGER_CST)
2947 return (STMT_VINFO_STRIDED_P (stmt_info)
2948 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2949 || vect_analyze_group_access (vinfo, dr_info)));
2951 /* Not consecutive access - check if it's a part of interleaving group. */
2952 return vect_analyze_group_access (vinfo, dr_info);
2955 /* Compare two data-references DRA and DRB so that data-references that
2956 can form a group end up next to each other.  */
2958 static int
2959 dr_group_sort_cmp (const void *dra_, const void *drb_)
2961 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2962 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2963 data_reference_p dra = dra_info->dr;
2964 data_reference_p drb = drb_info->dr;
2965 int cmp;
2967 /* Stabilize sort. */
2968 if (dra == drb)
2969 return 0;
2971 /* DRs with different group IDs never belong to the same group.  */
2972 if (dra_info->group != drb_info->group)
2973 return dra_info->group < drb_info->group ? -1 : 1;
2975 /* Ordering of DRs according to base. */
2976 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb));
2978 if (cmp != 0)
2979 return cmp;
2981 /* And according to DR_OFFSET. */
2982 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2983 if (cmp != 0)
2984 return cmp;
2986 /* Put reads before writes. */
2987 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2988 return DR_IS_READ (dra) ? -1 : 1;
2990 /* Then sort after access size. */
2991 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2992 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2993 if (cmp != 0)
2994 return cmp;
2996 /* And after step. */
2997 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2998 if (cmp != 0)
2999 return cmp;
3001 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3002 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3003 if (cmp == 0)
3004 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3005 return cmp;
3008 /* If OP is the result of a conversion, return the unconverted value,
3009 otherwise return null. */
3011 static tree
3012 strip_conversion (tree op)
3014 if (TREE_CODE (op) != SSA_NAME)
3015 return NULL_TREE;
3016 gimple *stmt = SSA_NAME_DEF_STMT (op);
3017 if (!is_gimple_assign (stmt)
3018 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3019 return NULL_TREE;
3020 return gimple_assign_rhs1 (stmt);
3023 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3024 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3025 be grouped in SLP mode. */
3027 static bool
3028 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3029 bool allow_slp_p)
3031 if (gimple_assign_single_p (stmt1_info->stmt))
3032 return gimple_assign_single_p (stmt2_info->stmt);
3034 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3035 if (call1 && gimple_call_internal_p (call1))
3037 /* Check for two masked loads or two masked stores. */
3038 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3039 if (!call2 || !gimple_call_internal_p (call2))
3040 return false;
3041 internal_fn ifn = gimple_call_internal_fn (call1);
3042 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3043 return false;
3044 if (ifn != gimple_call_internal_fn (call2))
3045 return false;
3047 /* Check that the masks are the same. Cope with casts of masks,
3048 like those created by build_mask_conversion. */
3049 tree mask1 = gimple_call_arg (call1, 2);
3050 tree mask2 = gimple_call_arg (call2, 2);
3051 if (!operand_equal_p (mask1, mask2, 0)
3052 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3054 mask1 = strip_conversion (mask1);
3055 if (!mask1)
3056 return false;
3057 mask2 = strip_conversion (mask2);
3058 if (!mask2)
3059 return false;
3060 if (!operand_equal_p (mask1, mask2, 0))
3061 return false;
3063 return true;
3066 return false;
3069 /* Function vect_analyze_data_ref_accesses.
3071 Analyze the access pattern of all the data references in the loop.
3073 FORNOW: the only access pattern that is considered vectorizable is a
3074 simple step 1 (consecutive) access.
3076 FORNOW: handle only arrays and pointer accesses. */
3078 opt_result
3079 vect_analyze_data_ref_accesses (vec_info *vinfo,
3080 vec<int> *dataref_groups)
3082 unsigned int i;
3083 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3085 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3087 if (datarefs.is_empty ())
3088 return opt_result::success ();
3090 /* Sort the array of datarefs to make building the interleaving chains
3091 linear.  Don't modify the original vector's order; it is needed for
3092 determining what dependencies are reversed. */
3093 vec<dr_vec_info *> datarefs_copy;
3094 datarefs_copy.create (datarefs.length ());
3095 for (unsigned i = 0; i < datarefs.length (); i++)
3097 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3098 /* If the caller computed DR grouping use that, otherwise group by
3099 basic blocks. */
3100 if (dataref_groups)
3101 dr_info->group = (*dataref_groups)[i];
3102 else
3103 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3104 datarefs_copy.quick_push (dr_info);
3106 datarefs_copy.qsort (dr_group_sort_cmp);
3107 hash_set<stmt_vec_info> to_fixup;
3109 /* Build the interleaving chains. */
3110 for (i = 0; i < datarefs_copy.length () - 1;)
3112 dr_vec_info *dr_info_a = datarefs_copy[i];
3113 data_reference_p dra = dr_info_a->dr;
3114 int dra_group_id = dr_info_a->group;
3115 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3116 stmt_vec_info lastinfo = NULL;
3117 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3118 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3120 ++i;
3121 continue;
3123 for (i = i + 1; i < datarefs_copy.length (); ++i)
3125 dr_vec_info *dr_info_b = datarefs_copy[i];
3126 data_reference_p drb = dr_info_b->dr;
3127 int drb_group_id = dr_info_b->group;
3128 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3129 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3130 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3131 break;
3133 /* ??? Imperfect sorting (non-compatible types, non-modulo
3134 accesses, same accesses) can lead to a group being artificially
3135 split here as we don't just skip over those. If it really
3136 matters we can push those to a worklist and re-iterate
3137 over them.  Then we can just skip ahead to the next DR here.  */
3139 /* DRs in a different DR group should not be put into the same
3140 interleaving group. */
3141 if (dra_group_id != drb_group_id)
3142 break;
3144 /* Check that the data-refs have the same first location (except init)
3145 and they are both either store or load (not load and store,
3146 not masked loads or stores). */
3147 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3148 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3149 DR_BASE_ADDRESS (drb)) != 0
3150 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3151 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3152 break;
3154 /* Check that the data-refs have the same constant size. */
3155 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3156 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3157 if (!tree_fits_uhwi_p (sza)
3158 || !tree_fits_uhwi_p (szb)
3159 || !tree_int_cst_equal (sza, szb))
3160 break;
3162 /* Check that the data-refs have the same step. */
3163 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3164 break;
3166 /* Check the types are compatible.
3167 ??? We don't distinguish this during sorting. */
3168 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3169 TREE_TYPE (DR_REF (drb))))
3170 break;
3172 /* Check that the DR_INITs are compile-time constants. */
3173 if (!tree_fits_shwi_p (DR_INIT (dra))
3174 || !tree_fits_shwi_p (DR_INIT (drb)))
3175 break;
3177 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3178 just hold extra information. */
3179 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3180 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3181 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3182 break;
3184 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3185 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3186 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3187 HOST_WIDE_INT init_prev
3188 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3189 gcc_assert (init_a <= init_b
3190 && init_a <= init_prev
3191 && init_prev <= init_b);
3193 /* Do not place the same access in the interleaving chain twice. */
3194 if (init_b == init_prev)
3196 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3197 < gimple_uid (DR_STMT (drb)));
3198 /* Simply link in duplicates and fix up the chain below. */
3200 else
3202 /* If init_b == init_a + the size of the type * k, we have an
3203 interleaving, and DRA is accessed before DRB. */
3204 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3205 if (type_size_a == 0
3206 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3207 % type_size_a != 0))
3208 break;
3210 /* If we have a store, the accesses are adjacent. This splits
3211 groups into chunks we support (we don't support vectorization
3212 of stores with gaps). */
3213 if (!DR_IS_READ (dra)
3214 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3215 != type_size_a))
3216 break;
3218 /* If the step (if constant and nonzero) is smaller than the
3219 difference between data-refs' inits this splits groups into
3220 suitable sizes. */
3221 if (tree_fits_shwi_p (DR_STEP (dra)))
3223 unsigned HOST_WIDE_INT step
3224 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3225 if (step != 0
3226 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3227 break;
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_NOTE, vect_location,
3233 DR_IS_READ (dra)
3234 ? "Detected interleaving load %T and %T\n"
3235 : "Detected interleaving store %T and %T\n",
3236 DR_REF (dra), DR_REF (drb));
3238 /* Link the found element into the group list. */
3239 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3241 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3242 lastinfo = stmtinfo_a;
3244 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3245 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3246 lastinfo = stmtinfo_b;
3248 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3249 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3251 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "Load suitable for SLP vectorization only.\n");
3255 if (init_b == init_prev
3256 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3257 && dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "Queuing group with duplicate access for fixup\n");
3263 /* Fix up groups with duplicate entries by splitting them.  */
3264 while (1)
3266 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3267 if (!(it != to_fixup.end ()))
3268 break;
3269 stmt_vec_info grp = *it;
3270 to_fixup.remove (grp);
3272 /* Find the earliest duplicate group member. */
3273 unsigned first_duplicate = -1u;
3274 stmt_vec_info next, g = grp;
3275 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3277 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3278 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3279 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3280 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3281 g = next;
3283 if (first_duplicate == -1U)
3284 continue;
3286 /* Then move all stmts after the first duplicate to a new group.
3287 Note this is a heuristic but one with the property that *it
3288 is fixed up completely. */
3289 g = grp;
3290 stmt_vec_info newgroup = NULL, ng = grp;
3291 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3293 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3295 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3296 if (!newgroup)
3297 newgroup = next;
3298 else
3299 DR_GROUP_NEXT_ELEMENT (ng) = next;
3300 ng = next;
3301 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3303 else
3304 g = DR_GROUP_NEXT_ELEMENT (g);
3306 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3308 /* Fixup the new group which still may contain duplicates. */
3309 to_fixup.add (newgroup);
3312 dr_vec_info *dr_info;
3313 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3315 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3316 && !vect_analyze_data_ref_access (vinfo, dr_info))
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "not vectorized: complicated access pattern.\n");
3322 if (is_a <bb_vec_info> (vinfo))
3324 /* Mark the statement as not vectorizable. */
3325 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3326 continue;
3328 else
3330 datarefs_copy.release ();
3331 return opt_result::failure_at (dr_info->stmt->stmt,
3332 "not vectorized:"
3333 " complicated access pattern.\n");
3338 datarefs_copy.release ();
3339 return opt_result::success ();
3342 /* Function vect_vfa_segment_size.
3344 Input:
3345 DR_INFO: The data reference.
3346 LENGTH_FACTOR: segment length to consider.
3348 Return a value suitable for the dr_with_seg_len::seg_len field.
3349 This is the "distance travelled" by the pointer from the first
3350 iteration in the segment to the last. Note that it does not include
3351 the size of the access; in effect it only describes the first byte. */
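/* An illustrative sketch (assumed numbers): for a 4-byte step and a
   LENGTH_FACTOR of 8 iterations the returned segment length is
   (8 - 1) * 4 = 28 bytes, to which vect_vfa_access_size later adds
   the size of the access itself.  */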
3353 static tree
3354 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3356 length_factor = size_binop (MINUS_EXPR,
3357 fold_convert (sizetype, length_factor),
3358 size_one_node);
3359 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3360 length_factor);
3363 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3364 gives the worst-case number of bytes covered by the segment. */
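/* For illustration only: a group of four 4-byte loads with a
   DR_GROUP_GAP of 1 yields an access size of 4 * (4 - 1) = 12 bytes,
   and for dr_explicit_realign_optimized a further vector's worth
   (minus the scalar size) is added on top of that.  */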
3366 static unsigned HOST_WIDE_INT
3367 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3369 stmt_vec_info stmt_vinfo = dr_info->stmt;
3370 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3371 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3372 unsigned HOST_WIDE_INT access_size = ref_size;
3373 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3375 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3376 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3378 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3379 int misalignment;
3380 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3381 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3382 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3383 == dr_explicit_realign_optimized))
3385 /* We might access a full vector's worth. */
3386 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3388 return access_size;
3391 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3392 describes. */
3394 static unsigned int
3395 vect_vfa_align (dr_vec_info *dr_info)
3397 return dr_alignment (dr_info->dr);
3400 /* Function vect_no_alias_p.
3402 Given data references A and B with equal base and offset, see whether
3403 the alias relation can be decided at compilation time. Return 1 if
3404 it can and the references alias, 0 if it can and the references do
3405 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3406 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3407 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3409 static int
3410 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3411 tree segment_length_a, tree segment_length_b,
3412 unsigned HOST_WIDE_INT access_size_a,
3413 unsigned HOST_WIDE_INT access_size_b)
3415 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3416 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3417 poly_uint64 const_length_a;
3418 poly_uint64 const_length_b;
3420 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3421 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3422 [a, a+12) */
3423 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3425 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3426 offset_a -= const_length_a;
3428 else
3429 const_length_a = tree_to_poly_uint64 (segment_length_a);
3430 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3432 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3433 offset_b -= const_length_b;
3435 else
3436 const_length_b = tree_to_poly_uint64 (segment_length_b);
3438 const_length_a += access_size_a;
3439 const_length_b += access_size_b;
3441 if (ranges_known_overlap_p (offset_a, const_length_a,
3442 offset_b, const_length_b))
3443 return 1;
3445 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3446 offset_b, const_length_b))
3447 return 0;
3449 return -1;
3452 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3453 in DDR is >= VF. */
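/* A worked sketch: if the only distance vector entry at LOOP_DEPTH is 8
   and the (possibly variable) VF is known to be at most 8, then every
   nonzero distance is >= VF, so the iterations combined into one vector
   iteration cannot conflict and the runtime alias check for this DDR can
   be dropped.  */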
3455 static bool
3456 dependence_distance_ge_vf (data_dependence_relation *ddr,
3457 unsigned int loop_depth, poly_uint64 vf)
3459 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3460 || DDR_NUM_DIST_VECTS (ddr) == 0)
3461 return false;
3463 /* If the dependence is exact, we should have limited the VF instead. */
3464 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3466 unsigned int i;
3467 lambda_vector dist_v;
3468 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3470 HOST_WIDE_INT dist = dist_v[loop_depth];
3471 if (dist != 0
3472 && !(dist > 0 && DDR_REVERSED_P (ddr))
3473 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3474 return false;
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location,
3479 "dependence distance between %T and %T is >= VF\n",
3480 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3482 return true;
3485 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3487 static void
3488 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3490 dump_printf (dump_kind, "%s (%T) >= ",
3491 lower_bound.unsigned_p ? "unsigned" : "abs",
3492 lower_bound.expr);
3493 dump_dec (dump_kind, lower_bound.min_value);
3496 /* Record that the vectorized loop requires the vec_lower_bound described
3497 by EXPR, UNSIGNED_P and MIN_VALUE. */
3499 static void
3500 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3501 poly_uint64 min_value)
3503 vec<vec_lower_bound> &lower_bounds
3504 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3505 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3506 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3508 unsigned_p &= lower_bounds[i].unsigned_p;
3509 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3510 if (lower_bounds[i].unsigned_p != unsigned_p
3511 || maybe_lt (lower_bounds[i].min_value, min_value))
3513 lower_bounds[i].unsigned_p = unsigned_p;
3514 lower_bounds[i].min_value = min_value;
3515 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE, vect_location,
3518 "updating run-time check to ");
3519 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3520 dump_printf (MSG_NOTE, "\n");
3523 return;
3526 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3527 if (dump_enabled_p ())
3529 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3530 dump_lower_bound (MSG_NOTE, lower_bound);
3531 dump_printf (MSG_NOTE, "\n");
3533 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3536 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3537 will span fewer than GAP bytes. */
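/* A worked sketch with illustrative numbers: with an estimated VF of 4, a
   group size of 2 and 4-byte scalar accesses, one vector iteration is
   expected to advance the access by about 4 * 2 * 4 == 32 bytes, so a GAP
   of 16 bytes counts as small and a runtime lower bound on the step is
   likely to be a good way of ruling out aliasing.  */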
3539 static bool
3540 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3541 poly_int64 gap)
3543 stmt_vec_info stmt_info = dr_info->stmt;
3544 HOST_WIDE_INT count
3545 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3546 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3547 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3548 return (estimated_poly_value (gap)
3549 <= count * vect_get_scalar_dr_size (dr_info));
3552 /* Return true if we know that there is no alias between DR_INFO_A and
3553 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3554 When returning true, set *LOWER_BOUND_OUT to this N. */
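/* A worked sketch: for two 4-byte accesses to the same base and offset
   with DR_INITs 0 and 4, the pair spans bytes [0, 8), so *LOWER_BOUND_OUT
   is set to 4 + 4 - 0 == 8 and the accesses cannot alias across
   iterations whenever abs (DR_STEP) >= 8.  */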
3556 static bool
3557 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3558 poly_uint64 *lower_bound_out)
3560 /* Check that there is a constant gap of known sign between DR_A
3561 and DR_B. */
3562 data_reference *dr_a = dr_info_a->dr;
3563 data_reference *dr_b = dr_info_b->dr;
3564 poly_int64 init_a, init_b;
3565 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3566 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3567 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3568 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3569 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3570 || !ordered_p (init_a, init_b))
3571 return false;
3573 /* Sort DR_A and DR_B by the address they access. */
3574 if (maybe_lt (init_b, init_a))
3576 std::swap (init_a, init_b);
3577 std::swap (dr_info_a, dr_info_b);
3578 std::swap (dr_a, dr_b);
3581 /* If the two accesses could be dependent within a scalar iteration,
3582 make sure that we'd retain their order. */
3583 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3584 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3585 return false;
3587 /* There is no alias if abs (DR_STEP) is greater than or equal to
3588 the bytes spanned by the combination of the two accesses. */
3589 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3590 return true;
3593 /* Function vect_prune_runtime_alias_test_list.
3595 Prune a list of ddrs to be tested at run-time by versioning for alias.
3596 Merge several alias checks into one if possible.
3597    Return FALSE if the resulting list of ddrs is longer than allowed by
3598 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
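/* An illustrative sketch of what the surviving checks guard (pointer names
   are hypothetical): for

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   with possibly overlapping A and B, loop versioning emits roughly

     if (a + segment_a <= b || b + segment_b <= a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   If the number of such checks exceeds the limit derived from
   --param vect-max-version-for-alias-checks, vectorization is abandoned
   instead.  */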
3600 opt_result
3601 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3603 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3604 hash_set <tree_pair_hash> compared_objects;
3606 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3607 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3608 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3609 const vec<vec_object_pair> &check_unequal_addrs
3610 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3611 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3612 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3614 ddr_p ddr;
3615 unsigned int i;
3616 tree length_factor;
3618 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3620 /* Step values are irrelevant for aliasing if the number of vector
3621 iterations is equal to the number of scalar iterations (which can
3622 happen for fully-SLP loops). */
3623 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3625 if (!vf_one_p)
3627 /* Convert the checks for nonzero steps into bound tests. */
3628 tree value;
3629 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3630 vect_check_lower_bound (loop_vinfo, value, true, 1);
3633 if (may_alias_ddrs.is_empty ())
3634 return opt_result::success ();
3636 comp_alias_ddrs.create (may_alias_ddrs.length ());
3638 unsigned int loop_depth
3639 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3640 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3642 /* First, we collect all data ref pairs for aliasing checks. */
3643 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3645 poly_uint64 lower_bound;
3646 tree segment_length_a, segment_length_b;
3647 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3648 unsigned int align_a, align_b;
3650 /* Ignore the alias if the VF we chose ended up being no greater
3651 than the dependence distance. */
3652 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3653 continue;
3655 if (DDR_OBJECT_A (ddr))
3657 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3658 if (!compared_objects.add (new_pair))
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "checking that %T and %T"
3663 " have different addresses\n",
3664 new_pair.first, new_pair.second);
3665 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3667 continue;
3670 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3671 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3673 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3674 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3676 bool preserves_scalar_order_p
3677 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3678 bool ignore_step_p
3679 = (vf_one_p
3680 && (preserves_scalar_order_p
3681 || operand_equal_p (DR_STEP (dr_info_a->dr),
3682 DR_STEP (dr_info_b->dr))));
3684 /* Skip the pair if inter-iteration dependencies are irrelevant
3685 and intra-iteration dependencies are guaranteed to be honored. */
3686 if (ignore_step_p
3687 && (preserves_scalar_order_p
3688 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3689 &lower_bound)))
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_NOTE, vect_location,
3693 "no need for alias check between "
3694 "%T and %T when VF is 1\n",
3695 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3696 continue;
3699 /* See whether we can handle the alias using a bounds check on
3700 the step, and whether that's likely to be the best approach.
3701 (It might not be, for example, if the minimum step is much larger
3702 than the number of bytes handled by one vector iteration.) */
3703 if (!ignore_step_p
3704 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3705 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3706 &lower_bound)
3707 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3708 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3710 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3714 "%T and %T when the step %T is outside ",
3715 DR_REF (dr_info_a->dr),
3716 DR_REF (dr_info_b->dr),
3717 DR_STEP (dr_info_a->dr));
3718 if (unsigned_p)
3719 dump_printf (MSG_NOTE, "[0");
3720 else
3722 dump_printf (MSG_NOTE, "(");
3723 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3725 dump_printf (MSG_NOTE, ", ");
3726 dump_dec (MSG_NOTE, lower_bound);
3727 dump_printf (MSG_NOTE, ")\n");
3729 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3730 unsigned_p, lower_bound);
3731 continue;
3734 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3735 if (dr_group_first_a)
3737 stmt_info_a = dr_group_first_a;
3738 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3741 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3742 if (dr_group_first_b)
3744 stmt_info_b = dr_group_first_b;
3745 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3748 if (ignore_step_p)
3750 segment_length_a = size_zero_node;
3751 segment_length_b = size_zero_node;
3753 else
3755 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3756 DR_STEP (dr_info_b->dr), 0))
3757 length_factor = scalar_loop_iters;
3758 else
3759 length_factor = size_int (vect_factor);
3760 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3761 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3763 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3764 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3765 align_a = vect_vfa_align (dr_info_a);
3766 align_b = vect_vfa_align (dr_info_b);
3768 /* See whether the alias is known at compilation time. */
3769 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3770 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3771 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3772 DR_OFFSET (dr_info_b->dr), 0)
3773 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3774 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3775 && poly_int_tree_p (segment_length_a)
3776 && poly_int_tree_p (segment_length_b))
3778 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3779 segment_length_a,
3780 segment_length_b,
3781 access_size_a,
3782 access_size_b);
3783 if (res >= 0 && dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "can tell at compile time that %T and %T",
3787 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3788 if (res == 0)
3789 dump_printf (MSG_NOTE, " do not alias\n");
3790 else
3791 dump_printf (MSG_NOTE, " alias\n");
3794 if (res == 0)
3795 continue;
3797 if (res == 1)
3798 return opt_result::failure_at (stmt_info_b->stmt,
3799 "not vectorized:"
3800 " compilation time alias: %G%G",
3801 stmt_info_a->stmt,
3802 stmt_info_b->stmt);
3805 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3806 access_size_a, align_a);
3807 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3808 access_size_b, align_b);
3809 /* Canonicalize the order to be the one that's needed for accurate
3810 RAW, WAR and WAW flags, in cases where the data references are
3811 well-ordered. The order doesn't really matter otherwise,
3812 but we might as well be consistent. */
3813 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3814 std::swap (dr_a, dr_b);
3816 dr_with_seg_len_pair_t dr_with_seg_len_pair
3817 (dr_a, dr_b, (preserves_scalar_order_p
3818 ? dr_with_seg_len_pair_t::WELL_ORDERED
3819 : dr_with_seg_len_pair_t::REORDERED));
3821 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3824 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3826 unsigned int count = (comp_alias_ddrs.length ()
3827 + check_unequal_addrs.length ());
3829 if (count
3830 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3831 == VECT_COST_MODEL_VERY_CHEAP))
3832 return opt_result::failure_at
3833 (vect_location, "would need a runtime alias check\n");
3835 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_NOTE, vect_location,
3837 "improved number of alias checks from %d to %d\n",
3838 may_alias_ddrs.length (), count);
3839 unsigned limit = param_vect_max_version_for_alias_checks;
3840 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3841 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3842 if (count > limit)
3843 return opt_result::failure_at
3844 (vect_location,
3845 "number of versioning for alias run-time tests exceeds %d "
3846 "(--param vect-max-version-for-alias-checks)\n", limit);
3848 return opt_result::success ();
3851 /* Check whether we can use an internal function for a gather load
3852 or scatter store. READ_P is true for loads and false for stores.
3853 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3854 the type of the memory elements being loaded or stored. OFFSET_TYPE
3855 is the type of the offset that is being applied to the invariant
3856 base address. SCALE is the amount by which the offset should
3857 be multiplied *after* it has been converted to address width.
3859 Return true if the function is supported, storing the function id in
3860 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
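/* A sketch of the retry behaviour below (target support is hypothetical):
   asked for a gather of 32-bit elements with a 16-bit OFFSET_TYPE on a
   target that only supports 32-bit offset vectors, the first
   internal_gather_scatter_fn_supported_p query fails, the offset type is
   widened to 32 bits and the second query succeeds, so *OFFSET_VECTYPE_OUT
   becomes a vector of 32-bit offsets.  Widening stops once the offset is
   at least as wide as both a pointer and the vector elements.  */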
3862 bool
3863 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3864 tree vectype, tree memory_type, tree offset_type,
3865 int scale, internal_fn *ifn_out,
3866 tree *offset_vectype_out)
3868 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3869 unsigned int element_bits = vector_element_bits (vectype);
3870 if (element_bits != memory_bits)
3871 /* For now the vector elements must be the same width as the
3872 memory elements. */
3873 return false;
3875 /* Work out which function we need. */
3876 internal_fn ifn, alt_ifn;
3877 if (read_p)
3879 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3880 alt_ifn = IFN_MASK_GATHER_LOAD;
3882 else
3884 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3885 alt_ifn = IFN_MASK_SCATTER_STORE;
3888 for (;;)
3890 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3891 if (!offset_vectype)
3892 return false;
3894 /* Test whether the target supports this combination. */
3895 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3896 offset_vectype, scale))
3898 *ifn_out = ifn;
3899 *offset_vectype_out = offset_vectype;
3900 return true;
3902 else if (!masked_p
3903 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3904 memory_type,
3905 offset_vectype,
3906 scale))
3908 *ifn_out = alt_ifn;
3909 *offset_vectype_out = offset_vectype;
3910 return true;
3913 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3914 && TYPE_PRECISION (offset_type) >= element_bits)
3915 return false;
3917 offset_type = build_nonstandard_integer_type
3918 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3922 /* STMT_INFO is a call to an internal gather load or scatter store function.
3923 Describe the operation in INFO. */
3925 static void
3926 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3927 gather_scatter_info *info)
3929 gcall *call = as_a <gcall *> (stmt_info->stmt);
3930 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3931 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3933 info->ifn = gimple_call_internal_fn (call);
3934 info->decl = NULL_TREE;
3935 info->base = gimple_call_arg (call, 0);
3936 info->offset = gimple_call_arg (call, 1);
3937 info->offset_dt = vect_unknown_def_type;
3938 info->offset_vectype = NULL_TREE;
3939 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3940 info->element_type = TREE_TYPE (vectype);
3941 info->memory_type = TREE_TYPE (DR_REF (dr));
3944 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3945 gather load or scatter store. Describe the operation in *INFO if so. */
3947 bool
3948 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3949 gather_scatter_info *info)
3951 HOST_WIDE_INT scale = 1;
3952 poly_int64 pbitpos, pbitsize;
3953 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3954 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3955 tree offtype = NULL_TREE;
3956 tree decl = NULL_TREE, base, off;
3957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3958 tree memory_type = TREE_TYPE (DR_REF (dr));
3959 machine_mode pmode;
3960 int punsignedp, reversep, pvolatilep = 0;
3961 internal_fn ifn;
3962 tree offset_vectype;
3963 bool masked_p = false;
3965 /* See whether this is already a call to a gather/scatter internal function.
3966 If not, see whether it's a masked load or store. */
3967 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3968 if (call && gimple_call_internal_p (call))
3970 ifn = gimple_call_internal_fn (call);
3971 if (internal_gather_scatter_fn_p (ifn))
3973 vect_describe_gather_scatter_call (stmt_info, info);
3974 return true;
3976 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3979 /* True if we should aim to use internal functions rather than
3980 built-in functions. */
3981 bool use_ifn_p = (DR_IS_READ (dr)
3982 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3983 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3985 base = DR_REF (dr);
3986 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3987 see if we can use the def stmt of the address. */
3988 if (masked_p
3989 && TREE_CODE (base) == MEM_REF
3990 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3991 && integer_zerop (TREE_OPERAND (base, 1))
3992 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3994 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3995 if (is_gimple_assign (def_stmt)
3996 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3997 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4000 /* The gather and scatter builtins need address of the form
4001 loop_invariant + vector * {1, 2, 4, 8}
4003 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4004 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4005 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4006 multiplications and additions in it. To get a vector, we need
4007 a single SSA_NAME that will be defined in the loop and will
4008 contain everything that is not loop invariant and that can be
4009    vectorized. The following code attempts to find such a preexisting
4010 SSA_NAME OFF and put the loop invariants into a tree BASE
4011 that can be gimplified before the loop. */
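/* A minimal sketch of the kind of access being decomposed (a[] and idx[]
   are purely illustrative):

     for (i = 0; i < n; i++)
       sum += a[idx[i]];

   Here DR_REF is a[idx[i]]; the code below peels the loop-invariant
   address of a into BASE and leaves an SSA_NAME computing idx[i]
   (suitably extended) as OFF, with SCALE ending up as sizeof (a[0]).  */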
4012 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4013 &punsignedp, &reversep, &pvolatilep);
4014 if (reversep)
4015 return false;
4017 /* PR 107346. Packed structs can have fields at offsets that are not
4018 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4019 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4020 return false;
4022 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4024 if (TREE_CODE (base) == MEM_REF)
4026 if (!integer_zerop (TREE_OPERAND (base, 1)))
4028 if (off == NULL_TREE)
4029 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4030 else
4031 off = size_binop (PLUS_EXPR, off,
4032 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4034 base = TREE_OPERAND (base, 0);
4036 else
4037 base = build_fold_addr_expr (base);
4039 if (off == NULL_TREE)
4040 off = size_zero_node;
4042    /* If base is not loop invariant, then either off is 0 and we start with
4043       just the constant offset in the loop-invariant BASE, continuing with
4044       base as OFF; otherwise give up.
4045 We could handle that case by gimplifying the addition of base + off
4046 into some SSA_NAME and use that as off, but for now punt. */
4047 if (!expr_invariant_in_loop_p (loop, base))
4049 if (!integer_zerop (off))
4050 return false;
4051 off = base;
4052 base = size_int (pbytepos);
4054 /* Otherwise put base + constant offset into the loop invariant BASE
4055 and continue with OFF. */
4056 else
4058 base = fold_convert (sizetype, base);
4059 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4062 /* OFF at this point may be either a SSA_NAME or some tree expression
4063 from get_inner_reference. Try to peel off loop invariants from it
4064 into BASE as long as possible. */
4065 STRIP_NOPS (off);
4066 while (offtype == NULL_TREE)
4068 enum tree_code code;
4069 tree op0, op1, add = NULL_TREE;
4071 if (TREE_CODE (off) == SSA_NAME)
4073 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4075 if (expr_invariant_in_loop_p (loop, off))
4076 return false;
4078 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4079 break;
4081 op0 = gimple_assign_rhs1 (def_stmt);
4082 code = gimple_assign_rhs_code (def_stmt);
4083 op1 = gimple_assign_rhs2 (def_stmt);
4085 else
4087 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4088 return false;
4089 code = TREE_CODE (off);
4090 extract_ops_from_tree (off, &code, &op0, &op1);
4092 switch (code)
4094 case POINTER_PLUS_EXPR:
4095 case PLUS_EXPR:
4096 if (expr_invariant_in_loop_p (loop, op0))
4098 add = op0;
4099 off = op1;
4100 do_add:
4101 add = fold_convert (sizetype, add);
4102 if (scale != 1)
4103 add = size_binop (MULT_EXPR, add, size_int (scale));
4104 base = size_binop (PLUS_EXPR, base, add);
4105 continue;
4107 if (expr_invariant_in_loop_p (loop, op1))
4109 add = op1;
4110 off = op0;
4111 goto do_add;
4113 break;
4114 case MINUS_EXPR:
4115 if (expr_invariant_in_loop_p (loop, op1))
4117 add = fold_convert (sizetype, op1);
4118 add = size_binop (MINUS_EXPR, size_zero_node, add);
4119 off = op0;
4120 goto do_add;
4122 break;
4123 case MULT_EXPR:
4124 if (scale == 1 && tree_fits_shwi_p (op1))
4126 int new_scale = tree_to_shwi (op1);
4127 /* Only treat this as a scaling operation if the target
4128 supports it for at least some offset type. */
4129 if (use_ifn_p
4130 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4131 masked_p, vectype, memory_type,
4132 signed_char_type_node,
4133 new_scale, &ifn,
4134 &offset_vectype)
4135 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4136 masked_p, vectype, memory_type,
4137 unsigned_char_type_node,
4138 new_scale, &ifn,
4139 &offset_vectype))
4140 break;
4141 scale = new_scale;
4142 off = op0;
4143 continue;
4145 break;
4146 case SSA_NAME:
4147 off = op0;
4148 continue;
4149 CASE_CONVERT:
4150 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4151 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4152 break;
4154 /* Don't include the conversion if the target is happy with
4155 the current offset type. */
4156 if (use_ifn_p
4157 && TREE_CODE (off) == SSA_NAME
4158 && !POINTER_TYPE_P (TREE_TYPE (off))
4159 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4160 masked_p, vectype, memory_type,
4161 TREE_TYPE (off), scale, &ifn,
4162 &offset_vectype))
4163 break;
4165 if (TYPE_PRECISION (TREE_TYPE (op0))
4166 == TYPE_PRECISION (TREE_TYPE (off)))
4168 off = op0;
4169 continue;
4172    /* Include the conversion if it is widening and either we're using
4173       the IFN path, or the target can handle the converted-from
4174       offset, or the current offset size is not already the same as
4175       the data vector element size. */
4176 if ((TYPE_PRECISION (TREE_TYPE (op0))
4177 < TYPE_PRECISION (TREE_TYPE (off)))
4178 && (use_ifn_p
4179 || (DR_IS_READ (dr)
4180 ? (targetm.vectorize.builtin_gather
4181 && targetm.vectorize.builtin_gather (vectype,
4182 TREE_TYPE (op0),
4183 scale))
4184 : (targetm.vectorize.builtin_scatter
4185 && targetm.vectorize.builtin_scatter (vectype,
4186 TREE_TYPE (op0),
4187 scale)))
4188 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4189 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4191 off = op0;
4192 offtype = TREE_TYPE (off);
4193 STRIP_NOPS (off);
4194 continue;
4196 break;
4197 default:
4198 break;
4200 break;
4203 /* If at the end OFF still isn't a SSA_NAME or isn't
4204 defined in the loop, punt. */
4205 if (TREE_CODE (off) != SSA_NAME
4206 || expr_invariant_in_loop_p (loop, off))
4207 return false;
4209 if (offtype == NULL_TREE)
4210 offtype = TREE_TYPE (off);
4212 if (use_ifn_p)
4214 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4215 vectype, memory_type, offtype, scale,
4216 &ifn, &offset_vectype))
4217 ifn = IFN_LAST;
4218 decl = NULL_TREE;
4220 else
4222 if (DR_IS_READ (dr))
4224 if (targetm.vectorize.builtin_gather)
4225 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4227 else
4229 if (targetm.vectorize.builtin_scatter)
4230 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4232 ifn = IFN_LAST;
4233 /* The offset vector type will be read from DECL when needed. */
4234 offset_vectype = NULL_TREE;
4237 info->ifn = ifn;
4238 info->decl = decl;
4239 info->base = base;
4240 info->offset = off;
4241 info->offset_dt = vect_unknown_def_type;
4242 info->offset_vectype = offset_vectype;
4243 info->scale = scale;
4244 info->element_type = TREE_TYPE (vectype);
4245 info->memory_type = memory_type;
4246 return true;
4249 /* Find the data references in STMT, analyze them with respect to LOOP and
4250 append them to DATAREFS. Return false if datarefs in this stmt cannot
4251 be handled. */
4253 opt_result
4254 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4255 vec<data_reference_p> *datarefs,
4256 vec<int> *dataref_groups, int group_id)
4258 /* We can ignore clobbers for dataref analysis - they are removed during
4259 loop vectorization and BB vectorization checks dependences with a
4260 stmt walk. */
4261 if (gimple_clobber_p (stmt))
4262 return opt_result::success ();
4264 if (gimple_has_volatile_ops (stmt))
4265 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4266 stmt);
4268 if (stmt_can_throw_internal (cfun, stmt))
4269 return opt_result::failure_at (stmt,
4270 "not vectorized:"
4271 " statement can throw an exception: %G",
4272 stmt);
4274 auto_vec<data_reference_p, 2> refs;
4275 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4276 if (!res)
4277 return res;
4279 if (refs.is_empty ())
4280 return opt_result::success ();
4282 if (refs.length () > 1)
4284 while (!refs.is_empty ())
4285 free_data_ref (refs.pop ());
4286 return opt_result::failure_at (stmt,
4287 "not vectorized: more than one "
4288 "data ref in stmt: %G", stmt);
4291 data_reference_p dr = refs.pop ();
4292 if (gcall *call = dyn_cast <gcall *> (stmt))
4293 if (!gimple_call_internal_p (call)
4294 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4295 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4297 free_data_ref (dr);
4298 return opt_result::failure_at (stmt,
4299 "not vectorized: dr in a call %G", stmt);
4302 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4303 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4305 free_data_ref (dr);
4306 return opt_result::failure_at (stmt,
4307 "not vectorized:"
4308 " statement is an unsupported"
4309 " bitfield access %G", stmt);
4312 if (DR_BASE_ADDRESS (dr)
4313 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4315 free_data_ref (dr);
4316 return opt_result::failure_at (stmt,
4317 "not vectorized:"
4318 " base addr of dr is a constant\n");
4321 /* Check whether this may be a SIMD lane access and adjust the
4322 DR to make it easier for us to handle it. */
4323 if (loop
4324 && loop->simduid
4325 && (!DR_BASE_ADDRESS (dr)
4326 || !DR_OFFSET (dr)
4327 || !DR_INIT (dr)
4328 || !DR_STEP (dr)))
4330 struct data_reference *newdr
4331 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4332 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4333 if (DR_BASE_ADDRESS (newdr)
4334 && DR_OFFSET (newdr)
4335 && DR_INIT (newdr)
4336 && DR_STEP (newdr)
4337 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4338 && integer_zerop (DR_STEP (newdr)))
4340 tree base_address = DR_BASE_ADDRESS (newdr);
4341 tree off = DR_OFFSET (newdr);
4342 tree step = ssize_int (1);
4343 if (integer_zerop (off)
4344 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4346 off = TREE_OPERAND (base_address, 1);
4347 base_address = TREE_OPERAND (base_address, 0);
4349 STRIP_NOPS (off);
4350 if (TREE_CODE (off) == MULT_EXPR
4351 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4353 step = TREE_OPERAND (off, 1);
4354 off = TREE_OPERAND (off, 0);
4355 STRIP_NOPS (off);
4357 if (CONVERT_EXPR_P (off)
4358 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4359 < TYPE_PRECISION (TREE_TYPE (off))))
4360 off = TREE_OPERAND (off, 0);
4361 if (TREE_CODE (off) == SSA_NAME)
4363 gimple *def = SSA_NAME_DEF_STMT (off);
4364 /* Look through widening conversion. */
4365 if (is_gimple_assign (def)
4366 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4368 tree rhs1 = gimple_assign_rhs1 (def);
4369 if (TREE_CODE (rhs1) == SSA_NAME
4370 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4371 && (TYPE_PRECISION (TREE_TYPE (off))
4372 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4373 def = SSA_NAME_DEF_STMT (rhs1);
4375 if (is_gimple_call (def)
4376 && gimple_call_internal_p (def)
4377 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4379 tree arg = gimple_call_arg (def, 0);
4380 tree reft = TREE_TYPE (DR_REF (newdr));
4381 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4382 arg = SSA_NAME_VAR (arg);
4383 if (arg == loop->simduid
4384 /* For now. */
4385 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4387 DR_BASE_ADDRESS (newdr) = base_address;
4388 DR_OFFSET (newdr) = ssize_int (0);
4389 DR_STEP (newdr) = step;
4390 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4391 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4392 /* Mark as simd-lane access. */
4393 tree arg2 = gimple_call_arg (def, 1);
4394 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4395 free_data_ref (dr);
4396 datarefs->safe_push (newdr);
4397 if (dataref_groups)
4398 dataref_groups->safe_push (group_id);
4399 return opt_result::success ();
4404 free_data_ref (newdr);
4407 datarefs->safe_push (dr);
4408 if (dataref_groups)
4409 dataref_groups->safe_push (group_id);
4410 return opt_result::success ();
4413 /* Function vect_analyze_data_refs.
4415 Find all the data references in the loop or basic block.
4417 The general structure of the analysis of data refs in the vectorizer is as
4418 follows:
4419 1- vect_analyze_data_refs(loop/bb): call
4420 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4421 in the loop/bb and their dependences.
4422 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4423 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4424 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4428 opt_result
4429 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4431 class loop *loop = NULL;
4432 unsigned int i;
4433 struct data_reference *dr;
4434 tree scalar_type;
4436 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4438 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4439 loop = LOOP_VINFO_LOOP (loop_vinfo);
4441 /* Go through the data-refs, check that the analysis succeeded. Update
4442 pointer from stmt_vec_info struct to DR and vectype. */
4444 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4445 FOR_EACH_VEC_ELT (datarefs, i, dr)
4447 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4448 poly_uint64 vf;
4450 gcc_assert (DR_REF (dr));
4451 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4452 gcc_assert (!stmt_info->dr_aux.dr);
4453 stmt_info->dr_aux.dr = dr;
4454 stmt_info->dr_aux.stmt = stmt_info;
4456 /* Check that analysis of the data-ref succeeded. */
4457 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4458 || !DR_STEP (dr))
4460 bool maybe_gather
4461 = DR_IS_READ (dr)
4462 && !TREE_THIS_VOLATILE (DR_REF (dr));
4463 bool maybe_scatter
4464 = DR_IS_WRITE (dr)
4465 && !TREE_THIS_VOLATILE (DR_REF (dr));
4467 /* If target supports vector gather loads or scatter stores,
4468    see if they can be used. */
4469 if (is_a <loop_vec_info> (vinfo)
4470 && !nested_in_vect_loop_p (loop, stmt_info))
4472 if (maybe_gather || maybe_scatter)
4474 if (maybe_gather)
4475 gatherscatter = GATHER;
4476 else
4477 gatherscatter = SCATTER;
4481 if (gatherscatter == SG_NONE)
4483 if (dump_enabled_p ())
4484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4485 "not vectorized: data ref analysis "
4486 "failed %G", stmt_info->stmt);
4487 if (is_a <bb_vec_info> (vinfo))
4489 /* In BB vectorization the ref can still participate
4490 in dependence analysis, we just can't vectorize it. */
4491 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4492 continue;
4494 return opt_result::failure_at (stmt_info->stmt,
4495 "not vectorized:"
4496 " data ref analysis failed: %G",
4497 stmt_info->stmt);
4501 /* See if this was detected as SIMD lane access. */
4502 if (dr->aux == (void *)-1
4503 || dr->aux == (void *)-2
4504 || dr->aux == (void *)-3
4505 || dr->aux == (void *)-4)
4507 if (nested_in_vect_loop_p (loop, stmt_info))
4508 return opt_result::failure_at (stmt_info->stmt,
4509 "not vectorized:"
4510 " data ref analysis failed: %G",
4511 stmt_info->stmt);
4512 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4513 = -(uintptr_t) dr->aux;
4516 tree base = get_base_address (DR_REF (dr));
4517 if (base && VAR_P (base) && DECL_NONALIASED (base))
4519 if (dump_enabled_p ())
4520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4521 "not vectorized: base object not addressable "
4522 "for stmt: %G", stmt_info->stmt);
4523 if (is_a <bb_vec_info> (vinfo))
4525 /* In BB vectorization the ref can still participate
4526 in dependence analysis, we just can't vectorize it. */
4527 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4528 continue;
4530 return opt_result::failure_at (stmt_info->stmt,
4531 "not vectorized: base object not"
4532 " addressable for stmt: %G",
4533 stmt_info->stmt);
4536 if (is_a <loop_vec_info> (vinfo)
4537 && DR_STEP (dr)
4538 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4540 if (nested_in_vect_loop_p (loop, stmt_info))
4541 return opt_result::failure_at (stmt_info->stmt,
4542 "not vectorized: "
4543 "not suitable for strided load %G",
4544 stmt_info->stmt);
4545 STMT_VINFO_STRIDED_P (stmt_info) = true;
4548 /* Update DR field in stmt_vec_info struct. */
4550 /* If the dataref is in an inner-loop of the loop that is considered for
4551    vectorization, we also want to analyze the access relative to
4552 the outer-loop (DR contains information only relative to the
4553 inner-most enclosing loop). We do that by building a reference to the
4554 first location accessed by the inner-loop, and analyze it relative to
4555 the outer-loop. */
4556 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4558 /* Build a reference to the first location accessed by the
4559 inner loop: *(BASE + INIT + OFFSET). By construction,
4560 this address must be invariant in the inner loop, so we
4561 can consider it as being used in the outer loop. */
4562 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4563 tree offset = unshare_expr (DR_OFFSET (dr));
4564 tree init = unshare_expr (DR_INIT (dr));
4565 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4566 init, offset);
4567 tree init_addr = fold_build_pointer_plus (base, init_offset);
4568 tree init_ref = build_fold_indirect_ref (init_addr);
4570 if (dump_enabled_p ())
4571 dump_printf_loc (MSG_NOTE, vect_location,
4572 "analyze in outer loop: %T\n", init_ref);
4574 opt_result res
4575 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4576 init_ref, loop, stmt_info->stmt);
4577 if (!res)
4578 /* dr_analyze_innermost already explained the failure. */
4579 return res;
4581 if (dump_enabled_p ())
4582 dump_printf_loc (MSG_NOTE, vect_location,
4583 "\touter base_address: %T\n"
4584 "\touter offset from base address: %T\n"
4585 "\touter constant offset from base address: %T\n"
4586 "\touter step: %T\n"
4587 "\touter base alignment: %d\n\n"
4588 "\touter base misalignment: %d\n"
4589 "\touter offset alignment: %d\n"
4590 "\touter step alignment: %d\n",
4591 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4592 STMT_VINFO_DR_OFFSET (stmt_info),
4593 STMT_VINFO_DR_INIT (stmt_info),
4594 STMT_VINFO_DR_STEP (stmt_info),
4595 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4596 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4597 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4598 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4601 /* Set vectype for STMT. */
4602 scalar_type = TREE_TYPE (DR_REF (dr));
4603 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4604 if (!vectype)
4606 if (dump_enabled_p ())
4608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4609 "not vectorized: no vectype for stmt: %G",
4610 stmt_info->stmt);
4611 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4612 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4613 scalar_type);
4614 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4617 if (is_a <bb_vec_info> (vinfo))
4619 /* No vector type is fine, the ref can still participate
4620 in dependence analysis, we just can't vectorize it. */
4621 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4622 continue;
4624 if (fatal)
4625 *fatal = false;
4626 return opt_result::failure_at (stmt_info->stmt,
4627 "not vectorized:"
4628 " no vectype for stmt: %G"
4629 " scalar_type: %T\n",
4630 stmt_info->stmt, scalar_type);
4632 else
4634 if (dump_enabled_p ())
4635 dump_printf_loc (MSG_NOTE, vect_location,
4636 "got vectype for stmt: %G%T\n",
4637 stmt_info->stmt, vectype);
4640 /* Adjust the minimal vectorization factor according to the
4641 vector type. */
4642 vf = TYPE_VECTOR_SUBPARTS (vectype);
4643 *min_vf = upper_bound (*min_vf, vf);
4645 /* Leave the BB vectorizer to pick the vector type later, based on
4646 the final dataref group size and SLP node size. */
4647 if (is_a <loop_vec_info> (vinfo))
4648 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4650 if (gatherscatter != SG_NONE)
4652 gather_scatter_info gs_info;
4653 if (!vect_check_gather_scatter (stmt_info,
4654 as_a <loop_vec_info> (vinfo),
4655 &gs_info)
4656 || !get_vectype_for_scalar_type (vinfo,
4657 TREE_TYPE (gs_info.offset)))
4659 if (fatal)
4660 *fatal = false;
4661 return opt_result::failure_at
4662 (stmt_info->stmt,
4663 (gatherscatter == GATHER)
4664 ? "not vectorized: not suitable for gather load %G"
4665 : "not vectorized: not suitable for scatter store %G",
4666 stmt_info->stmt);
4668 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4672 /* We used to stop processing and prune the list here. Verify we no
4673 longer need to. */
4674 gcc_assert (i == datarefs.length ());
4676 return opt_result::success ();
4680 /* Function vect_get_new_vect_var.
4682 Returns a name for a new variable. The current naming scheme appends the
4683 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4684 the name of vectorizer generated variables, and appends that to NAME if
4685 provided. */
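/* For example (the name "x" is purely illustrative), a call such as

     vect_get_new_vect_var (ptr_type, vect_pointer_var, "x")

   creates a temporary named along the lines of "vectp_x", while passing a
   NULL name yields just the "vectp" prefix.  */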
4687 tree
4688 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4690 const char *prefix;
4691 tree new_vect_var;
4693 switch (var_kind)
4695 case vect_simple_var:
4696 prefix = "vect";
4697 break;
4698 case vect_scalar_var:
4699 prefix = "stmp";
4700 break;
4701 case vect_mask_var:
4702 prefix = "mask";
4703 break;
4704 case vect_pointer_var:
4705 prefix = "vectp";
4706 break;
4707 default:
4708 gcc_unreachable ();
4711 if (name)
4713 char* tmp = concat (prefix, "_", name, NULL);
4714 new_vect_var = create_tmp_reg (type, tmp);
4715 free (tmp);
4717 else
4718 new_vect_var = create_tmp_reg (type, prefix);
4720 return new_vect_var;
4723 /* Like vect_get_new_vect_var but return an SSA name. */
4725 tree
4726 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4728 const char *prefix;
4729 tree new_vect_var;
4731 switch (var_kind)
4733 case vect_simple_var:
4734 prefix = "vect";
4735 break;
4736 case vect_scalar_var:
4737 prefix = "stmp";
4738 break;
4739 case vect_pointer_var:
4740 prefix = "vectp";
4741 break;
4742 default:
4743 gcc_unreachable ();
4746 if (name)
4748 char* tmp = concat (prefix, "_", name, NULL);
4749 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4750 free (tmp);
4752 else
4753 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4755 return new_vect_var;
4758 /* Duplicate points-to info on NAME from DR_INFO. */
4760 static void
4761 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4763 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4764 /* DR_PTR_INFO is for a base SSA name, not including constant or
4765 variable offsets in the ref so its alignment info does not apply. */
4766 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4769 /* Function vect_create_addr_base_for_vector_ref.
4771 Create an expression that computes the address of the first memory location
4772 that will be accessed for a data reference.
4774 Input:
4775 STMT_INFO: The statement containing the data reference.
4776 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4777    OFFSET: Optional. If supplied, it is added to the initial address.
4778 LOOP: Specify relative to which loop-nest should the address be computed.
4779 For example, when the dataref is in an inner-loop nested in an
4780 outer-loop that is now being vectorized, LOOP can be either the
4781 outer-loop, or the inner-loop. The first memory location accessed
4782 by the following dataref ('in' points to short):
4784 for (i=0; i<N; i++)
4785 for (j=0; j<M; j++)
4786 s += in[i+j]
4788 is as follows:
4789 if LOOP=i_loop: &in (relative to i_loop)
4790 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4792 Output:
4793 1. Return an SSA_NAME whose value is the address of the memory location of
4794 the first vector of the data reference.
4795 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4796 these statement(s) which define the returned SSA_NAME.
4798 FORNOW: We are only handling array accesses with step 1. */
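/* A sketch of the statements this may emit for the example above with
   LOOP=j_loop (temporary names are illustrative and the exact sequence is
   up to force_gimple_operand):

     base_off_1 = (sizetype) i * 2 + 0;   <- offset from base plus init
     vectp_in_2 = &in + base_off_1;       <- returned SSA_NAME

   The caller is responsible for inserting NEW_STMT_LIST, typically on the
   loop preheader edge.  */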
4800 tree
4801 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4802 gimple_seq *new_stmt_list,
4803 tree offset)
4805 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4806 struct data_reference *dr = dr_info->dr;
4807 const char *base_name;
4808 tree addr_base;
4809 tree dest;
4810 gimple_seq seq = NULL;
4811 tree vect_ptr_type;
4812 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4813 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4815 tree data_ref_base = unshare_expr (drb->base_address);
4816 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4817 tree init = unshare_expr (drb->init);
4819 if (loop_vinfo)
4820 base_name = get_name (data_ref_base);
4821 else
4823 base_offset = ssize_int (0);
4824 init = ssize_int (0);
4825 base_name = get_name (DR_REF (dr));
4828 /* Create base_offset */
4829 base_offset = size_binop (PLUS_EXPR,
4830 fold_convert (sizetype, base_offset),
4831 fold_convert (sizetype, init));
4833 if (offset)
4835 offset = fold_convert (sizetype, offset);
4836 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4837 base_offset, offset);
4840 /* base + base_offset */
4841 if (loop_vinfo)
4842 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4843 else
4844 addr_base = build1 (ADDR_EXPR,
4845 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4846 /* Strip zero offset components since we don't need
4847 them and they can confuse late diagnostics if
4848 we CSE them wrongly. See PR106904 for example. */
4849 unshare_expr (strip_zero_offset_components
4850 (DR_REF (dr))));
4852 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4853 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4854 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4855 gimple_seq_add_seq (new_stmt_list, seq);
4857 if (DR_PTR_INFO (dr)
4858 && TREE_CODE (addr_base) == SSA_NAME
4859 /* We should only duplicate pointer info to newly created SSA names. */
4860 && SSA_NAME_VAR (addr_base) == dest)
4862 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4863 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4866 if (dump_enabled_p ())
4867 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4869 return addr_base;
4873 /* Function vect_create_data_ref_ptr.
4875 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4876 location accessed in the loop by STMT_INFO, along with the def-use update
4877 chain to appropriately advance the pointer through the loop iterations.
4878 Also set aliasing information for the pointer. This pointer is used by
4879 the callers to this function to create a memory reference expression for
4880 vector load/store access.
4882 Input:
4883 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4884 GIMPLE_ASSIGN <name, data-ref> or
4885 GIMPLE_ASSIGN <data-ref, name>.
4886 2. AGGR_TYPE: the type of the reference, which should be either a vector
4887 or an array.
4888 3. AT_LOOP: the loop where the vector memref is to be created.
4889 4. OFFSET (optional): a byte offset to be added to the initial address
4890 accessed by the data-ref in STMT_INFO.
4891 5. BSI: location where the new stmts are to be placed if there is no loop
4892 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4893 pointing to the initial address.
4894    7. IV_STEP (optional, defaults to NULL): the amount that should be added
4895 to the IV during each iteration of the loop. NULL says to move
4896 by one copy of AGGR_TYPE up or down, depending on the step of the
4897 data reference.
4899 Output:
4900 1. Declare a new ptr to vector_type, and have it point to the base of the
4901       data reference (initial address accessed by the data reference).
4902 For example, for vector of type V8HI, the following code is generated:
4904 v8hi *ap;
4905 ap = (v8hi *)initial_address;
4907 if OFFSET is not supplied:
4908 initial_address = &a[init];
4909 if OFFSET is supplied:
4910 initial_address = &a[init] + OFFSET;
4914 Return the initial_address in INITIAL_ADDRESS.
4916 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4917 update the pointer in each iteration of the loop.
4919 Return the increment stmt that updates the pointer in PTR_INCR.
4921 3. Return the pointer. */
4923 tree
4924 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4925 tree aggr_type, class loop *at_loop, tree offset,
4926 tree *initial_address, gimple_stmt_iterator *gsi,
4927 gimple **ptr_incr, bool only_init,
4928 tree iv_step)
4930 const char *base_name;
4931 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4932 class loop *loop = NULL;
4933 bool nested_in_vect_loop = false;
4934 class loop *containing_loop = NULL;
4935 tree aggr_ptr_type;
4936 tree aggr_ptr;
4937 tree new_temp;
4938 gimple_seq new_stmt_list = NULL;
4939 edge pe = NULL;
4940 basic_block new_bb;
4941 tree aggr_ptr_init;
4942 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4943 struct data_reference *dr = dr_info->dr;
4944 tree aptr;
4945 gimple_stmt_iterator incr_gsi;
4946 bool insert_after;
4947 tree indx_before_incr, indx_after_incr;
4948 gimple *incr;
4949 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4951 gcc_assert (iv_step != NULL_TREE
4952 || TREE_CODE (aggr_type) == ARRAY_TYPE
4953 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4955 if (loop_vinfo)
4957 loop = LOOP_VINFO_LOOP (loop_vinfo);
4958 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4959 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4960 pe = loop_preheader_edge (loop);
4962 else
4964 gcc_assert (bb_vinfo);
4965 only_init = true;
4966 *ptr_incr = NULL;
4969 /* Create an expression for the first address accessed by this load
4970 in LOOP. */
4971 base_name = get_name (DR_BASE_ADDRESS (dr));
4973 if (dump_enabled_p ())
4975 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4976 dump_printf_loc (MSG_NOTE, vect_location,
4977 "create %s-pointer variable to type: %T",
4978 get_tree_code_name (TREE_CODE (aggr_type)),
4979 aggr_type);
4980 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4981 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4982 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4983 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4984 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4985 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4986 else
4987 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4988 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4991 /* (1) Create the new aggregate-pointer variable.
4992 Vector and array types inherit the alias set of their component
4993 type by default so we need to use a ref-all pointer if the data
4994 reference does not conflict with the created aggregated data
4995 reference because it is not addressable. */
4996 bool need_ref_all = false;
4997 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4998 get_alias_set (DR_REF (dr))))
4999 need_ref_all = true;
5000 /* Likewise for any of the data references in the stmt group. */
5001 else if (DR_GROUP_SIZE (stmt_info) > 1)
5003 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5006 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5007 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5008 get_alias_set (DR_REF (sdr))))
5010 need_ref_all = true;
5011 break;
5013 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5015 while (sinfo);
5017 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5018 need_ref_all);
5019 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5022 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5023 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5024 def-use update cycles for the pointer: one relative to the outer-loop
5025 (LOOP), which is what steps (3) and (4) below do. The other is relative
5026 to the inner-loop (which is the inner-most loop containing the dataref),
5027    and this is done by step (5) below.
5029 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5030 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5031 redundant. Steps (3),(4) create the following:
5033 vp0 = &base_addr;
5034 LOOP: vp1 = phi(vp0,vp2)
5037 vp2 = vp1 + step
5038 goto LOOP
5040 If there is an inner-loop nested in loop, then step (5) will also be
5041 applied, and an additional update in the inner-loop will be created:
5043 vp0 = &base_addr;
5044 LOOP: vp1 = phi(vp0,vp2)
5046 inner: vp3 = phi(vp1,vp4)
5047 vp4 = vp3 + inner_step
5048 if () goto inner
5050 vp2 = vp1 + step
5051 if () goto LOOP */
5053 /* (2) Calculate the initial address of the aggregate-pointer, and set
5054 the aggregate-pointer to point to it before the loop. */
5056    /* Create: &(base[init_val] + offset) in the loop preheader. */
5058 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5059 stmt_info, &new_stmt_list,
5060 offset);
5061 if (new_stmt_list)
5063 if (pe)
5065 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5066 gcc_assert (!new_bb);
5068 else
5069 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5072 *initial_address = new_temp;
5073 aggr_ptr_init = new_temp;
5075 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5076 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5077 inner-loop nested in LOOP (during outer-loop vectorization). */
5079 /* No update in loop is required. */
5080 if (only_init && (!loop_vinfo || at_loop == loop))
5081 aptr = aggr_ptr_init;
5082 else
5084 /* Accesses to invariant addresses should be handled specially
5085 by the caller. */
5086 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5087 gcc_assert (!integer_zerop (step));
5089 if (iv_step == NULL_TREE)
5091 /* The step of the aggregate pointer is the type size,
5092 negated for downward accesses. */
5093 iv_step = TYPE_SIZE_UNIT (aggr_type);
5094 if (tree_int_cst_sgn (step) == -1)
5095 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5098 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5100 create_iv (aggr_ptr_init, PLUS_EXPR,
5101 fold_convert (aggr_ptr_type, iv_step),
5102 aggr_ptr, loop, &incr_gsi, insert_after,
5103 &indx_before_incr, &indx_after_incr);
5104 incr = gsi_stmt (incr_gsi);
5106 /* Copy the points-to information if it exists. */
5107 if (DR_PTR_INFO (dr))
5109 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5110 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5112 if (ptr_incr)
5113 *ptr_incr = incr;
5115 aptr = indx_before_incr;
5118 if (!nested_in_vect_loop || only_init)
5119 return aptr;
5122 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5123 nested in LOOP, if exists. */
5125 gcc_assert (nested_in_vect_loop);
5126 if (!only_init)
5128 standard_iv_increment_position (containing_loop, &incr_gsi,
5129 &insert_after);
5130 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5131 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5132 &indx_before_incr, &indx_after_incr);
5133 incr = gsi_stmt (incr_gsi);
5135 /* Copy the points-to information if it exists. */
5136 if (DR_PTR_INFO (dr))
5138 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5139 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5141 if (ptr_incr)
5142 *ptr_incr = incr;
5144 return indx_before_incr;
5146 else
5147 gcc_unreachable ();
5151 /* Function bump_vector_ptr
5153 Increment a pointer (to a vector type) by vector-size. If requested,
5154 i.e. if PTR-INCR is given, then also connect the new increment stmt
5155 to the existing def-use update-chain of the pointer, by modifying
5156 the PTR_INCR as illustrated below:
5158 The pointer def-use update-chain before this function:
5159 DATAREF_PTR = phi (p_0, p_2)
5160 ....
5161 PTR_INCR: p_2 = DATAREF_PTR + step
5163 The pointer def-use update-chain after this function:
5164 DATAREF_PTR = phi (p_0, p_2)
5165 ....
5166 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5167 ....
5168 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5170 Input:
5171 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5172 in the loop.
5173 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5174 the loop. The increment amount across iterations is expected
5175 to be vector_size.
5176 BSI - location where the new update stmt is to be placed.
5177 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5178 BUMP - optional. The offset by which to bump the pointer. If not given,
5179 the offset is assumed to be vector_size.
5181 Output: Return NEW_DATAREF_PTR as illustrated above.
5185 tree
5186 bump_vector_ptr (vec_info *vinfo,
5187 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5188 stmt_vec_info stmt_info, tree bump)
5190 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5192 tree update = TYPE_SIZE_UNIT (vectype);
5193 gimple *incr_stmt;
5194 ssa_op_iter iter;
5195 use_operand_p use_p;
5196 tree new_dataref_ptr;
5198 if (bump)
5199 update = bump;
5201 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5202 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5203 else if (is_gimple_min_invariant (dataref_ptr))
5204 /* When possible avoid emitting a separate increment stmt that would
5205 force the addressed object to become addressable.  */
5206 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5207 fold_build2 (MEM_REF,
5208 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5209 dataref_ptr,
5210 fold_convert (ptr_type_node, update)));
5211 else
5212 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5213 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5214 dataref_ptr, update);
5215 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5216 /* Fold the increment, avoiding excessive use-def chains that would
5217 otherwise cause compile-time issues for passes run before the next
5218 forwprop pass, which would do this folding as well.  */
5219 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5220 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5222 incr_stmt = gsi_stmt (fold_gsi);
5223 update_stmt (incr_stmt);
5226 /* Copy the points-to information if it exists. */
5227 if (DR_PTR_INFO (dr))
5229 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5230 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5233 if (!ptr_incr)
5234 return new_dataref_ptr;
5236 /* Update the vector-pointer's cross-iteration increment. */
5237 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5239 tree use = USE_FROM_PTR (use_p);
5241 if (use == dataref_ptr)
5242 SET_USE (use_p, new_dataref_ptr);
5243 else
5244 gcc_assert (operand_equal_p (use, update, 0));
5247 return new_dataref_ptr;
5251 /* Copy memory reference info such as base/clique from the SRC reference
5252 to the DEST MEM_REF. */
5254 void
5255 vect_copy_ref_info (tree dest, tree src)
5257 if (TREE_CODE (dest) != MEM_REF)
5258 return;
5260 tree src_base = src;
5261 while (handled_component_p (src_base))
5262 src_base = TREE_OPERAND (src_base, 0);
5263 if (TREE_CODE (src_base) != MEM_REF
5264 && TREE_CODE (src_base) != TARGET_MEM_REF)
5265 return;
5267 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5268 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5272 /* Function vect_create_destination_var.
5274 Create a new temporary of type VECTYPE. */
5276 tree
5277 vect_create_destination_var (tree scalar_dest, tree vectype)
5279 tree vec_dest;
5280 const char *name;
5281 char *new_name;
5282 tree type;
5283 enum vect_var_kind kind;
5285 kind = vectype
5286 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5287 ? vect_mask_var
5288 : vect_simple_var
5289 : vect_scalar_var;
5290 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5292 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5294 name = get_name (scalar_dest);
5295 if (name)
5296 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5297 else
5298 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5299 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5300 free (new_name);
5302 return vec_dest;
5305 /* Function vect_grouped_store_supported.
5307 Returns TRUE if interleave high and interleave low permutations
5308 are supported, and FALSE otherwise. */
5310 bool
5311 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5313 machine_mode mode = TYPE_MODE (vectype);
5315 /* vect_permute_store_chain requires the group size to be equal to 3 or
5316 be a power of two. */
5317 if (count != 3 && exact_log2 (count) == -1)
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5321 "the size of the group of accesses"
5322 " is not a power of 2 or not eqaul to 3\n");
5323 return false;
5326 /* Check that the permutation is supported. */
5327 if (VECTOR_MODE_P (mode))
5329 unsigned int i;
5330 if (count == 3)
5332 unsigned int j0 = 0, j1 = 0, j2 = 0;
5333 unsigned int i, j;
5335 unsigned int nelt;
5336 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5338 if (dump_enabled_p ())
5339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5340 "cannot handle groups of 3 stores for"
5341 " variable-length vectors\n");
5342 return false;
5345 vec_perm_builder sel (nelt, nelt, 1);
5346 sel.quick_grow (nelt);
5347 vec_perm_indices indices;
5348 for (j = 0; j < 3; j++)
5350 int nelt0 = ((3 - j) * nelt) % 3;
5351 int nelt1 = ((3 - j) * nelt + 1) % 3;
5352 int nelt2 = ((3 - j) * nelt + 2) % 3;
5353 for (i = 0; i < nelt; i++)
5355 if (3 * i + nelt0 < nelt)
5356 sel[3 * i + nelt0] = j0++;
5357 if (3 * i + nelt1 < nelt)
5358 sel[3 * i + nelt1] = nelt + j1++;
5359 if (3 * i + nelt2 < nelt)
5360 sel[3 * i + nelt2] = 0;
5362 indices.new_vector (sel, 2, nelt);
5363 if (!can_vec_perm_const_p (mode, mode, indices))
5365 if (dump_enabled_p ())
5366 dump_printf (MSG_MISSED_OPTIMIZATION,
5367 "permutation op not supported by target.\n");
5368 return false;
5371 for (i = 0; i < nelt; i++)
5373 if (3 * i + nelt0 < nelt)
5374 sel[3 * i + nelt0] = 3 * i + nelt0;
5375 if (3 * i + nelt1 < nelt)
5376 sel[3 * i + nelt1] = 3 * i + nelt1;
5377 if (3 * i + nelt2 < nelt)
5378 sel[3 * i + nelt2] = nelt + j2++;
5380 indices.new_vector (sel, 2, nelt);
5381 if (!can_vec_perm_const_p (mode, mode, indices))
5383 if (dump_enabled_p ())
5384 dump_printf (MSG_MISSED_OPTIMIZATION,
5385 "permutation op not supported by target.\n");
5386 return false;
5389 return true;
5391 else
5393 /* If length is not equal to 3 then only a power of 2 is supported. */
5394 gcc_assert (pow2p_hwi (count));
5395 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5397 /* The encoding has 2 interleaved stepped patterns. */
5398 if (!multiple_p (nelt, 2))
5399 return false;
5400 vec_perm_builder sel (nelt, 2, 3);
5401 sel.quick_grow (6);
5402 for (i = 0; i < 3; i++)
5404 sel[i * 2] = i;
5405 sel[i * 2 + 1] = i + nelt;
5407 vec_perm_indices indices (sel, 2, nelt);
5408 if (can_vec_perm_const_p (mode, mode, indices))
5410 for (i = 0; i < 6; i++)
5411 sel[i] += exact_div (nelt, 2);
5412 indices.new_vector (sel, 2, nelt);
5413 if (can_vec_perm_const_p (mode, mode, indices))
5414 return true;
5419 if (dump_enabled_p ())
5420 dump_printf (MSG_MISSED_OPTIMIZATION,
5421 "permutation op not supported by target.\n");
5422 return false;
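/* Illustration only, not used by the vectorizer: a scalar model of the
   two-input constant permutation that the can_vec_perm_const_p queries
   above ask the target about.  Element I of the result is element SEL[I]
   of the 2*NELT-element concatenation of V1 and V2.  This is merely a
   reference sketch of VEC_PERM_EXPR semantics for constant selectors,
   not how any target actually expands it.  */

static void
toy_two_input_perm (const unsigned int *v1, const unsigned int *v2,
		    const unsigned int *sel, unsigned int nelt,
		    unsigned int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    /* Selector values below NELT pick from V1, the rest from V2.  */
    out[i] = sel[i] < nelt ? v1[sel[i]] : v2[sel[i] - nelt];
}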
5426 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5427 type VECTYPE. MASKED_P says whether the masked form is needed. */
5429 bool
5430 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5431 bool masked_p)
5433 if (masked_p)
5434 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5435 vec_mask_store_lanes_optab,
5436 vectype, count);
5437 else
5438 return vect_lanes_optab_supported_p ("vec_store_lanes",
5439 vec_store_lanes_optab,
5440 vectype, count);
5444 /* Function vect_permute_store_chain.
5446 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5447 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5448 the data correctly for the stores. Return the final references for stores
5449 in RESULT_CHAIN.
5451 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5452 The input is 4 vectors each containing 8 elements. We assign a number to
5453 each element, the input sequence is:
5455 1st vec: 0 1 2 3 4 5 6 7
5456 2nd vec: 8 9 10 11 12 13 14 15
5457 3rd vec: 16 17 18 19 20 21 22 23
5458 4th vec: 24 25 26 27 28 29 30 31
5460 The output sequence should be:
5462 1st vec: 0 8 16 24 1 9 17 25
5463 2nd vec: 2 10 18 26 3 11 19 27
5464 3rd vec: 4 12 20 28 5 13 21 29
5465 4th vec: 6 14 22 30 7 15 23 31
5467 i.e., we interleave the contents of the four vectors in their order.
5469 We use interleave_high/low instructions to create such output. The input of
5470 each interleave_high/low operation is two vectors:
5471 1st vec 2nd vec
5472 0 1 2 3 4 5 6 7
5473 the even elements of the result vector are obtained left-to-right from the
5474 high/low elements of the first vector. The odd elements of the result are
5475 obtained left-to-right from the high/low elements of the second vector.
5476 The output of interleave_high will be: 0 4 1 5
5477 and of interleave_low: 2 6 3 7
5480 The permutation is done in log LENGTH stages. In each stage interleave_high
5481 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5482 where the first argument is taken from the first half of DR_CHAIN and the
5483 second argument from its second half.
5484 In our example,
5486 I1: interleave_high (1st vec, 3rd vec)
5487 I2: interleave_low (1st vec, 3rd vec)
5488 I3: interleave_high (2nd vec, 4th vec)
5489 I4: interleave_low (2nd vec, 4th vec)
5491 The output for the first stage is:
5493 I1: 0 16 1 17 2 18 3 19
5494 I2: 4 20 5 21 6 22 7 23
5495 I3: 8 24 9 25 10 26 11 27
5496 I4: 12 28 13 29 14 30 15 31
5498 The output of the second stage, i.e. the final result is:
5500 I1: 0 8 16 24 1 9 17 25
5501 I2: 2 10 18 26 3 11 19 27
5502 I3: 4 12 20 28 5 13 21 29
5503 I4: 6 14 22 30 7 15 23 31. */
5505 void
5506 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5507 unsigned int length,
5508 stmt_vec_info stmt_info,
5509 gimple_stmt_iterator *gsi,
5510 vec<tree> *result_chain)
5512 tree vect1, vect2, high, low;
5513 gimple *perm_stmt;
5514 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5515 tree perm_mask_low, perm_mask_high;
5516 tree data_ref;
5517 tree perm3_mask_low, perm3_mask_high;
5518 unsigned int i, j, n, log_length = exact_log2 (length);
5520 result_chain->quick_grow (length);
5521 memcpy (result_chain->address (), dr_chain.address (),
5522 length * sizeof (tree));
5524 if (length == 3)
5526 /* vect_grouped_store_supported ensures that this is constant. */
5527 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5528 unsigned int j0 = 0, j1 = 0, j2 = 0;
5530 vec_perm_builder sel (nelt, nelt, 1);
5531 sel.quick_grow (nelt);
5532 vec_perm_indices indices;
5533 for (j = 0; j < 3; j++)
5535 int nelt0 = ((3 - j) * nelt) % 3;
5536 int nelt1 = ((3 - j) * nelt + 1) % 3;
5537 int nelt2 = ((3 - j) * nelt + 2) % 3;
5539 for (i = 0; i < nelt; i++)
5541 if (3 * i + nelt0 < nelt)
5542 sel[3 * i + nelt0] = j0++;
5543 if (3 * i + nelt1 < nelt)
5544 sel[3 * i + nelt1] = nelt + j1++;
5545 if (3 * i + nelt2 < nelt)
5546 sel[3 * i + nelt2] = 0;
5548 indices.new_vector (sel, 2, nelt);
5549 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5551 for (i = 0; i < nelt; i++)
5553 if (3 * i + nelt0 < nelt)
5554 sel[3 * i + nelt0] = 3 * i + nelt0;
5555 if (3 * i + nelt1 < nelt)
5556 sel[3 * i + nelt1] = 3 * i + nelt1;
5557 if (3 * i + nelt2 < nelt)
5558 sel[3 * i + nelt2] = nelt + j2++;
5560 indices.new_vector (sel, 2, nelt);
5561 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5563 vect1 = dr_chain[0];
5564 vect2 = dr_chain[1];
5566 /* Create interleaving stmt:
5567 low = VEC_PERM_EXPR <vect1, vect2,
5568 {j, nelt, *, j + 1, nelt + j + 1, *,
5569 j + 2, nelt + j + 2, *, ...}> */
5570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5572 vect2, perm3_mask_low);
5573 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5575 vect1 = data_ref;
5576 vect2 = dr_chain[2];
5577 /* Create interleaving stmt:
5578 low = VEC_PERM_EXPR <vect1, vect2,
5579 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5580 6, 7, nelt + j + 2, ...}> */
5581 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5582 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5583 vect2, perm3_mask_high);
5584 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5585 (*result_chain)[j] = data_ref;
5588 else
5590 /* If length is not equal to 3 then only a power of 2 is supported. */
5591 gcc_assert (pow2p_hwi (length));
5593 /* The encoding has 2 interleaved stepped patterns. */
5594 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5595 vec_perm_builder sel (nelt, 2, 3);
5596 sel.quick_grow (6);
5597 for (i = 0; i < 3; i++)
5599 sel[i * 2] = i;
5600 sel[i * 2 + 1] = i + nelt;
5602 vec_perm_indices indices (sel, 2, nelt);
5603 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5605 for (i = 0; i < 6; i++)
5606 sel[i] += exact_div (nelt, 2);
5607 indices.new_vector (sel, 2, nelt);
5608 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5610 for (i = 0, n = log_length; i < n; i++)
5612 for (j = 0; j < length/2; j++)
5614 vect1 = dr_chain[j];
5615 vect2 = dr_chain[j+length/2];
5617 /* Create interleaving stmt:
5618 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5619 ...}> */
5620 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5621 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5622 vect2, perm_mask_high);
5623 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5624 (*result_chain)[2*j] = high;
5626 /* Create interleaving stmt:
5627 low = VEC_PERM_EXPR <vect1, vect2,
5628 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5629 ...}> */
5630 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5631 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5632 vect2, perm_mask_low);
5633 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5634 (*result_chain)[2*j+1] = low;
5636 memcpy (dr_chain.address (), result_chain->address (),
5637 length * sizeof (tree));
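/* Illustration only, not used by the vectorizer: a scalar simulation of
   the power-of-two branch above for the example in the function comment
   (LENGTH == 4, 8 elements per vector), so the final interleaved order
   0 8 16 24 1 9 17 25 ... can be checked by hand.  The fixed sizes and
   the helper names are made up for the illustration.  */

static void
toy_interleave8 (const unsigned int *v1, const unsigned int *v2,
		 unsigned int *high, unsigned int *low)
{
  for (unsigned int i = 0; i < 4; i++)	/* nelt / 2 with nelt == 8.  */
    {
      /* perm_mask_high: {0, nelt, 1, nelt + 1, ...}.  */
      high[2 * i] = v1[i];
      high[2 * i + 1] = v2[i];
      /* perm_mask_low: {nelt/2, nelt*3/2, nelt/2 + 1, ...}.  */
      low[2 * i] = v1[4 + i];
      low[2 * i + 1] = v2[4 + i];
    }
}

static void
toy_permute_store_chain4x8 (unsigned int chain[4][8])
{
  unsigned int result[4][8];
  /* log2 (length) == 2 stages, mirroring the loop above.  */
  for (unsigned int stage = 0; stage < 2; stage++)
    {
      for (unsigned int j = 0; j < 2; j++)	/* j < length / 2.  */
	toy_interleave8 (chain[j], chain[j + 2],
			 result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (result));
    }
}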
5642 /* Function vect_setup_realignment
5644 This function is called when vectorizing an unaligned load using
5645 the dr_explicit_realign[_optimized] scheme.
5646 This function generates the following code at the loop prolog:
5648 p = initial_addr;
5649 x msq_init = *(floor(p)); # prolog load
5650 realignment_token = call target_builtin;
5651 loop:
5652 x msq = phi (msq_init, ---)
5654 The stmts marked with x are generated only for the case of
5655 dr_explicit_realign_optimized.
5657 The code above sets up a new (vector) pointer, pointing to the first
5658 location accessed by STMT_INFO, and a "floor-aligned" load using that
5659 pointer. It also generates code to compute the "realignment-token"
5660 (if the relevant target hook was defined), and creates a phi-node at the
5661 loop-header bb whose arguments are the result of the prolog-load (created
5662 by this function) and the result of a load that takes place in the loop
5663 (to be created by the caller to this function).
5665 For the case of dr_explicit_realign_optimized:
5666 The caller to this function uses the phi-result (msq) to create the
5667 realignment code inside the loop, and sets up the missing phi argument,
5668 as follows:
5669 loop:
5670 msq = phi (msq_init, lsq)
5671 lsq = *(floor(p')); # load in loop
5672 result = realign_load (msq, lsq, realignment_token);
5674 For the case of dr_explicit_realign:
5675 loop:
5676 msq = *(floor(p)); # load in loop
5677 p' = p + (VS-1);
5678 lsq = *(floor(p')); # load in loop
5679 result = realign_load (msq, lsq, realignment_token);
5681 Input:
5682 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5683 a memory location that may be unaligned.
5684 BSI - place where new code is to be inserted.
5685 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5686 is used.
5688 Output:
5689 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5690 target hook, if defined.
5691 Return value - the result of the loop-header phi node. */
5693 tree
5694 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5695 gimple_stmt_iterator *gsi, tree *realignment_token,
5696 enum dr_alignment_support alignment_support_scheme,
5697 tree init_addr,
5698 class loop **at_loop)
5700 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5701 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5702 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5703 struct data_reference *dr = dr_info->dr;
5704 class loop *loop = NULL;
5705 edge pe = NULL;
5706 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5707 tree vec_dest;
5708 gimple *inc;
5709 tree ptr;
5710 tree data_ref;
5711 basic_block new_bb;
5712 tree msq_init = NULL_TREE;
5713 tree new_temp;
5714 gphi *phi_stmt;
5715 tree msq = NULL_TREE;
5716 gimple_seq stmts = NULL;
5717 bool compute_in_loop = false;
5718 bool nested_in_vect_loop = false;
5719 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5720 class loop *loop_for_initial_load = NULL;
5722 if (loop_vinfo)
5724 loop = LOOP_VINFO_LOOP (loop_vinfo);
5725 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5728 gcc_assert (alignment_support_scheme == dr_explicit_realign
5729 || alignment_support_scheme == dr_explicit_realign_optimized);
5731 /* We need to generate three things:
5732 1. the misalignment computation
5733 2. the extra vector load (for the optimized realignment scheme).
5734 3. the phi node for the two vectors from which the realignment is
5735 done (for the optimized realignment scheme). */
5737 /* 1. Determine where to generate the misalignment computation.
5739 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5740 calculation will be generated by this function, outside the loop (in the
5741 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5742 caller, inside the loop.
5744 Background: If the misalignment remains fixed throughout the iterations of
5745 the loop, then both realignment schemes are applicable, and also the
5746 misalignment computation can be done outside LOOP. This is because we are
5747 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5748 are a multiple of VS (the Vector Size), and therefore the misalignment in
5749 different vectorized LOOP iterations is always the same.
5750 The problem arises only if the memory access is in an inner-loop nested
5751 inside LOOP, which is now being vectorized using outer-loop vectorization.
5752 This is the only case when the misalignment of the memory access may not
5753 remain fixed throughout the iterations of the inner-loop (as explained in
5754 detail in vect_supportable_dr_alignment). In this case, not only is the
5755 optimized realignment scheme not applicable, but also the misalignment
5756 computation (and generation of the realignment token that is passed to
5757 REALIGN_LOAD) have to be done inside the loop.
5759 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5760 or not, which in turn determines if the misalignment is computed inside
5761 the inner-loop, or outside LOOP. */
5763 if (init_addr != NULL_TREE || !loop_vinfo)
5765 compute_in_loop = true;
5766 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5770 /* 2. Determine where to generate the extra vector load.
5772 For the optimized realignment scheme, instead of generating two vector
5773 loads in each iteration, we generate a single extra vector load in the
5774 preheader of the loop, and in each iteration reuse the result of the
5775 vector load from the previous iteration. In case the memory access is in
5776 an inner-loop nested inside LOOP, which is now being vectorized using
5777 outer-loop vectorization, we need to determine whether this initial vector
5778 load should be generated at the preheader of the inner-loop, or can be
5779 generated at the preheader of LOOP. If the memory access has no evolution
5780 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5781 to be generated inside LOOP (in the preheader of the inner-loop). */
5783 if (nested_in_vect_loop)
5785 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5786 bool invariant_in_outerloop =
5787 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5788 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5790 else
5791 loop_for_initial_load = loop;
5792 if (at_loop)
5793 *at_loop = loop_for_initial_load;
5795 tree vuse = NULL_TREE;
5796 if (loop_for_initial_load)
5798 pe = loop_preheader_edge (loop_for_initial_load);
5799 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5800 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5802 if (!vuse)
5803 vuse = gimple_vuse (gsi_stmt (*gsi));
5805 /* 3. For the case of the optimized realignment, create the first vector
5806 load at the loop preheader. */
5808 if (alignment_support_scheme == dr_explicit_realign_optimized)
5810 /* Create msq_init = *(floor(p1)) in the loop preheader */
5811 gassign *new_stmt;
5813 gcc_assert (!compute_in_loop);
5814 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5815 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5816 loop_for_initial_load, NULL_TREE,
5817 &init_addr, NULL, &inc, true);
5818 if (TREE_CODE (ptr) == SSA_NAME)
5819 new_temp = copy_ssa_name (ptr);
5820 else
5821 new_temp = make_ssa_name (TREE_TYPE (ptr));
5822 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5823 tree type = TREE_TYPE (ptr);
5824 new_stmt = gimple_build_assign
5825 (new_temp, BIT_AND_EXPR, ptr,
5826 fold_build2 (MINUS_EXPR, type,
5827 build_int_cst (type, 0),
5828 build_int_cst (type, align)));
5829 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5830 gcc_assert (!new_bb);
5831 data_ref
5832 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5833 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5834 vect_copy_ref_info (data_ref, DR_REF (dr));
5835 new_stmt = gimple_build_assign (vec_dest, data_ref);
5836 new_temp = make_ssa_name (vec_dest, new_stmt);
5837 gimple_assign_set_lhs (new_stmt, new_temp);
5838 gimple_set_vuse (new_stmt, vuse);
5839 if (pe)
5841 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5842 gcc_assert (!new_bb);
5844 else
5845 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5847 msq_init = gimple_assign_lhs (new_stmt);
5850 /* 4. Create realignment token using a target builtin, if available.
5851 It is done either inside the containing loop, or before LOOP (as
5852 determined above). */
5854 if (targetm.vectorize.builtin_mask_for_load)
5856 gcall *new_stmt;
5857 tree builtin_decl;
5859 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5860 if (!init_addr)
5862 /* Generate the INIT_ADDR computation outside LOOP. */
5863 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5864 stmt_info, &stmts,
5865 NULL_TREE);
5866 if (loop)
5868 pe = loop_preheader_edge (loop);
5869 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5870 gcc_assert (!new_bb);
5872 else
5873 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5876 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5877 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5878 vec_dest =
5879 vect_create_destination_var (scalar_dest,
5880 gimple_call_return_type (new_stmt));
5881 new_temp = make_ssa_name (vec_dest, new_stmt);
5882 gimple_call_set_lhs (new_stmt, new_temp);
5884 if (compute_in_loop)
5885 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5886 else
5888 /* Generate the misalignment computation outside LOOP. */
5889 pe = loop_preheader_edge (loop);
5890 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5891 gcc_assert (!new_bb);
5894 *realignment_token = gimple_call_lhs (new_stmt);
5896 /* The result of the CALL_EXPR to this builtin is determined from
5897 the value of the parameter and no global variables are touched
5898 which makes the builtin a "const" function. Requiring the
5899 builtin to have the "const" attribute makes it unnecessary
5900 to call mark_call_clobbered. */
5901 gcc_assert (TREE_READONLY (builtin_decl));
5904 if (alignment_support_scheme == dr_explicit_realign)
5905 return msq;
5907 gcc_assert (!compute_in_loop);
5908 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5911 /* 5. Create msq = phi <msq_init, lsq> in loop */
5913 pe = loop_preheader_edge (containing_loop);
5914 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5915 msq = make_ssa_name (vec_dest);
5916 phi_stmt = create_phi_node (msq, containing_loop->header);
5917 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5919 return msq;
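/* Illustration only: one plausible scalar model of what the target's
   REALIGN_LOAD does with the two aligned loads set up above.  The real
   semantics and the meaning of the realignment token are entirely
   target-specific; here the token is assumed to be simply the byte
   misalignment of the original address.  */

static void
toy_realign_load (const unsigned char *msq, const unsigned char *lsq,
		  unsigned int misalign, unsigned int vecsize,
		  unsigned char *result)
{
  /* Select VECSIZE consecutive bytes starting at offset MISALIGN from the
     concatenation of the two aligned loads MSQ and LSQ; this recovers the
     unaligned vector without ever issuing an unaligned access.  */
  for (unsigned int i = 0; i < vecsize; i++)
    result[i] = (misalign + i < vecsize
		 ? msq[misalign + i]
		 : lsq[misalign + i - vecsize]);
}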
5923 /* Function vect_grouped_load_supported.
5925 COUNT is the size of the load group (the number of statements plus the
5926 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5927 only one statement, with a gap of COUNT - 1.
5929 Returns true if a suitable permute exists. */
5931 bool
5932 vect_grouped_load_supported (tree vectype, bool single_element_p,
5933 unsigned HOST_WIDE_INT count)
5935 machine_mode mode = TYPE_MODE (vectype);
5937 /* If this is single-element interleaving with an element distance
5938 that leaves unused vector loads around, punt - we would at least
5939 create very sub-optimal code in that case (and blow up memory;
5940 see PR65518).  */
5941 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5943 if (dump_enabled_p ())
5944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5945 "single-element interleaving not supported "
5946 "for not adjacent vector loads\n");
5947 return false;
5950 /* vect_permute_load_chain requires the group size to be equal to 3 or
5951 be a power of two. */
5952 if (count != 3 && exact_log2 (count) == -1)
5954 if (dump_enabled_p ())
5955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5956 "the size of the group of accesses"
5957 " is not a power of 2 or not equal to 3\n");
5958 return false;
5961 /* Check that the permutation is supported. */
5962 if (VECTOR_MODE_P (mode))
5964 unsigned int i, j;
5965 if (count == 3)
5967 unsigned int nelt;
5968 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5970 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5972 "cannot handle groups of 3 loads for"
5973 " variable-length vectors\n");
5974 return false;
5977 vec_perm_builder sel (nelt, nelt, 1);
5978 sel.quick_grow (nelt);
5979 vec_perm_indices indices;
5980 unsigned int k;
5981 for (k = 0; k < 3; k++)
5983 for (i = 0; i < nelt; i++)
5984 if (3 * i + k < 2 * nelt)
5985 sel[i] = 3 * i + k;
5986 else
5987 sel[i] = 0;
5988 indices.new_vector (sel, 2, nelt);
5989 if (!can_vec_perm_const_p (mode, mode, indices))
5991 if (dump_enabled_p ())
5992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5993 "shuffle of 3 loads is not supported by"
5994 " target\n");
5995 return false;
5997 for (i = 0, j = 0; i < nelt; i++)
5998 if (3 * i + k < 2 * nelt)
5999 sel[i] = i;
6000 else
6001 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6002 indices.new_vector (sel, 2, nelt);
6003 if (!can_vec_perm_const_p (mode, mode, indices))
6005 if (dump_enabled_p ())
6006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6007 "shuffle of 3 loads is not supported by"
6008 " target\n");
6009 return false;
6012 return true;
6014 else
6016 /* If length is not equal to 3 then only a power of 2 is supported. */
6017 gcc_assert (pow2p_hwi (count));
6018 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6020 /* The encoding has a single stepped pattern. */
6021 vec_perm_builder sel (nelt, 1, 3);
6022 sel.quick_grow (3);
6023 for (i = 0; i < 3; i++)
6024 sel[i] = i * 2;
6025 vec_perm_indices indices (sel, 2, nelt);
6026 if (can_vec_perm_const_p (mode, mode, indices))
6028 for (i = 0; i < 3; i++)
6029 sel[i] = i * 2 + 1;
6030 indices.new_vector (sel, 2, nelt);
6031 if (can_vec_perm_const_p (mode, mode, indices))
6032 return true;
6037 if (dump_enabled_p ())
6038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6039 "extract even/odd not supported by target\n");
6040 return false;
6043 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
6044 type VECTYPE. MASKED_P says whether the masked form is needed. */
6046 bool
6047 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6048 bool masked_p)
6050 if (masked_p)
6051 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6052 vec_mask_load_lanes_optab,
6053 vectype, count);
6054 else
6055 return vect_lanes_optab_supported_p ("vec_load_lanes",
6056 vec_load_lanes_optab,
6057 vectype, count);
6060 /* Function vect_permute_load_chain.
6062 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6063 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6064 the input data correctly. Return the final references for loads in
6065 RESULT_CHAIN.
6067 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6068 The input is 4 vectors each containing 8 elements. We assign a number to each
6069 element, the input sequence is:
6071 1st vec: 0 1 2 3 4 5 6 7
6072 2nd vec: 8 9 10 11 12 13 14 15
6073 3rd vec: 16 17 18 19 20 21 22 23
6074 4th vec: 24 25 26 27 28 29 30 31
6076 The output sequence should be:
6078 1st vec: 0 4 8 12 16 20 24 28
6079 2nd vec: 1 5 9 13 17 21 25 29
6080 3rd vec: 2 6 10 14 18 22 26 30
6081 4th vec: 3 7 11 15 19 23 27 31
6083 i.e., the first output vector should contain the first elements of each
6084 interleaving group, etc.
6086 We use extract_even/odd instructions to create such output. The input of
6087 each extract_even/odd operation is two vectors
6088 1st vec 2nd vec
6089 0 1 2 3 4 5 6 7
6091 and the output is the vector of extracted even/odd elements. The output of
6092 extract_even will be: 0 2 4 6
6093 and of extract_odd: 1 3 5 7
6096 The permutation is done in log LENGTH stages. In each stage extract_even
6097 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6098 their order. In our example,
6100 E1: extract_even (1st vec, 2nd vec)
6101 E2: extract_odd (1st vec, 2nd vec)
6102 E3: extract_even (3rd vec, 4th vec)
6103 E4: extract_odd (3rd vec, 4th vec)
6105 The output for the first stage will be:
6107 E1: 0 2 4 6 8 10 12 14
6108 E2: 1 3 5 7 9 11 13 15
6109 E3: 16 18 20 22 24 26 28 30
6110 E4: 17 19 21 23 25 27 29 31
6112 In order to proceed and create the correct sequence for the next stage (or
6113 for the correct output, if the second stage is the last one, as in our
6114 example), we first put the output of the extract_even operation and then the
6115 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6116 The input for the second stage is:
6118 1st vec (E1): 0 2 4 6 8 10 12 14
6119 2nd vec (E3): 16 18 20 22 24 26 28 30
6120 3rd vec (E2): 1 3 5 7 9 11 13 15
6121 4th vec (E4): 17 19 21 23 25 27 29 31
6123 The output of the second stage:
6125 E1: 0 4 8 12 16 20 24 28
6126 E2: 2 6 10 14 18 22 26 30
6127 E3: 1 5 9 13 17 21 25 29
6128 E4: 3 7 11 15 19 23 27 31
6130 And RESULT_CHAIN after reordering:
6132 1st vec (E1): 0 4 8 12 16 20 24 28
6133 2nd vec (E3): 1 5 9 13 17 21 25 29
6134 3rd vec (E2): 2 6 10 14 18 22 26 30
6135 4th vec (E4): 3 7 11 15 19 23 27 31. */
6137 static void
6138 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6139 unsigned int length,
6140 stmt_vec_info stmt_info,
6141 gimple_stmt_iterator *gsi,
6142 vec<tree> *result_chain)
6144 tree data_ref, first_vect, second_vect;
6145 tree perm_mask_even, perm_mask_odd;
6146 tree perm3_mask_low, perm3_mask_high;
6147 gimple *perm_stmt;
6148 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6149 unsigned int i, j, log_length = exact_log2 (length);
6151 result_chain->quick_grow (length);
6152 memcpy (result_chain->address (), dr_chain.address (),
6153 length * sizeof (tree));
6155 if (length == 3)
6157 /* vect_grouped_load_supported ensures that this is constant. */
6158 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6159 unsigned int k;
6161 vec_perm_builder sel (nelt, nelt, 1);
6162 sel.quick_grow (nelt);
6163 vec_perm_indices indices;
6164 for (k = 0; k < 3; k++)
6166 for (i = 0; i < nelt; i++)
6167 if (3 * i + k < 2 * nelt)
6168 sel[i] = 3 * i + k;
6169 else
6170 sel[i] = 0;
6171 indices.new_vector (sel, 2, nelt);
6172 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6174 for (i = 0, j = 0; i < nelt; i++)
6175 if (3 * i + k < 2 * nelt)
6176 sel[i] = i;
6177 else
6178 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6179 indices.new_vector (sel, 2, nelt);
6180 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6182 first_vect = dr_chain[0];
6183 second_vect = dr_chain[1];
6185 /* Create interleaving stmt (low part of):
6186 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6187 ...}> */
6188 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6189 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6190 second_vect, perm3_mask_low);
6191 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6193 /* Create interleaving stmt (high part of):
6194 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6195 ...}> */
6196 first_vect = data_ref;
6197 second_vect = dr_chain[2];
6198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6200 second_vect, perm3_mask_high);
6201 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6202 (*result_chain)[k] = data_ref;
6205 else
6207 /* If length is not equal to 3 then only a power of 2 is supported. */
6208 gcc_assert (pow2p_hwi (length));
6210 /* The encoding has a single stepped pattern. */
6211 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6212 vec_perm_builder sel (nelt, 1, 3);
6213 sel.quick_grow (3);
6214 for (i = 0; i < 3; ++i)
6215 sel[i] = i * 2;
6216 vec_perm_indices indices (sel, 2, nelt);
6217 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6219 for (i = 0; i < 3; ++i)
6220 sel[i] = i * 2 + 1;
6221 indices.new_vector (sel, 2, nelt);
6222 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6224 for (i = 0; i < log_length; i++)
6226 for (j = 0; j < length; j += 2)
6228 first_vect = dr_chain[j];
6229 second_vect = dr_chain[j+1];
6231 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6232 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6233 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6234 first_vect, second_vect,
6235 perm_mask_even);
6236 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6237 (*result_chain)[j/2] = data_ref;
6239 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6240 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6241 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6242 first_vect, second_vect,
6243 perm_mask_odd);
6244 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6245 (*result_chain)[j/2+length/2] = data_ref;
6247 memcpy (dr_chain.address (), result_chain->address (),
6248 length * sizeof (tree));
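/* Illustration only, not used by the vectorizer: a scalar simulation of
   the power-of-two branch above for the example in the function comment
   (LENGTH == 4, 8 elements per vector), so the final de-interleaved order
   0 4 8 12 ..., 1 5 9 13 ..., etc. can be checked by hand.  The fixed
   sizes and names are made up for the illustration.  */

static void
toy_permute_load_chain4x8 (unsigned int chain[4][8])
{
  unsigned int result[4][8];
  /* log2 (length) == 2 stages, mirroring the loop above.  */
  for (unsigned int stage = 0; stage < 2; stage++)
    {
      for (unsigned int j = 0; j < 4; j += 2)
	for (unsigned int i = 0; i < 8; i++)
	  {
	    /* perm_mask_even ({0, 2, 4, ...}) feeds slot j/2, while
	       perm_mask_odd ({1, 3, 5, ...}) feeds slot j/2 + length/2;
	       selector values >= 8 index into the second vector.  */
	    unsigned int even = 2 * i, odd = 2 * i + 1;
	    result[j / 2][i] = even < 8 ? chain[j][even]
					: chain[j + 1][even - 8];
	    result[j / 2 + 2][i] = odd < 8 ? chain[j][odd]
					   : chain[j + 1][odd - 8];
	  }
      memcpy (chain, result, sizeof (result));
    }
}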
6253 /* Function vect_shift_permute_load_chain.
6255 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6256 sequence of stmts to reorder the input data accordingly.
6257 Return the final references for loads in RESULT_CHAIN.
6258 Return true if it succeeded, false otherwise.
6260 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6261 The input is 3 vectors each containing 8 elements. We assign a
6262 number to each element, the input sequence is:
6264 1st vec: 0 1 2 3 4 5 6 7
6265 2nd vec: 8 9 10 11 12 13 14 15
6266 3rd vec: 16 17 18 19 20 21 22 23
6268 The output sequence should be:
6270 1st vec: 0 3 6 9 12 15 18 21
6271 2nd vec: 1 4 7 10 13 16 19 22
6272 3rd vec: 2 5 8 11 14 17 20 23
6274 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6276 First we shuffle all 3 vectors to get the correct element order:
6278 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6279 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6280 3rd vec: (16 19 22) (17 20 23) (18 21)
6282 Next we unite and shift vector 3 times:
6284 1st step:
6285 shift right by 6 the concatenation of:
6286 "1st vec" and "2nd vec"
6287 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6288 "2nd vec" and "3rd vec"
6289 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6290 "3rd vec" and "1st vec"
6291 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6292 | New vectors |
6294 So that now new vectors are:
6296 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6297 2nd vec: (10 13) (16 19 22) (17 20 23)
6298 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6300 2nd step:
6301 shift right by 5 the concatenation of:
6302 "1st vec" and "3rd vec"
6303 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6304 "2nd vec" and "1st vec"
6305 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6306 "3rd vec" and "2nd vec"
6307 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6308 | New vectors |
6310 So that now new vectors are:
6312 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6313 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6314 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6316 3rd step:
6317 shift right by 5 the concatenation of:
6318 "1st vec" and "1st vec"
6319 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6320 shift right by 3 the concatenation of:
6321 "2nd vec" and "2nd vec"
6322 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6323 | New vectors |
6325 So that now all vectors are READY:
6326 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6327 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6328 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6330 This algorithm is faster than the one in vect_permute_load_chain if:
6331 1. "shift of a concatenation" is faster than general permutation.
6332 This is usually so.
6333 2. The TARGET machine can't execute vector instructions in parallel.
6334 This is because each step of the algorithm depends on the previous
6335 one, while the algorithm in vect_permute_load_chain is much more parallel.
6337 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6340 static bool
6341 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6342 unsigned int length,
6343 stmt_vec_info stmt_info,
6344 gimple_stmt_iterator *gsi,
6345 vec<tree> *result_chain)
6347 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6348 tree perm2_mask1, perm2_mask2, perm3_mask;
6349 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6350 gimple *perm_stmt;
6352 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6353 machine_mode vmode = TYPE_MODE (vectype);
6354 unsigned int i;
6355 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6357 unsigned HOST_WIDE_INT nelt, vf;
6358 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6359 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6360 /* Not supported for variable-length vectors. */
6361 return false;
6363 vec_perm_builder sel (nelt, nelt, 1);
6364 sel.quick_grow (nelt);
6366 result_chain->quick_grow (length);
6367 memcpy (result_chain->address (), dr_chain.address (),
6368 length * sizeof (tree));
6370 if (pow2p_hwi (length) && vf > 4)
6372 unsigned int j, log_length = exact_log2 (length);
6373 for (i = 0; i < nelt / 2; ++i)
6374 sel[i] = i * 2;
6375 for (i = 0; i < nelt / 2; ++i)
6376 sel[nelt / 2 + i] = i * 2 + 1;
6377 vec_perm_indices indices (sel, 2, nelt);
6378 if (!can_vec_perm_const_p (vmode, vmode, indices))
6380 if (dump_enabled_p ())
6381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6382 "shuffle of 2 fields structure is not \
6383 supported by target\n");
6384 return false;
6386 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6388 for (i = 0; i < nelt / 2; ++i)
6389 sel[i] = i * 2 + 1;
6390 for (i = 0; i < nelt / 2; ++i)
6391 sel[nelt / 2 + i] = i * 2;
6392 indices.new_vector (sel, 2, nelt);
6393 if (!can_vec_perm_const_p (vmode, vmode, indices))
6395 if (dump_enabled_p ())
6396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6397 "shuffle of 2 fields structure is not \
6398 supported by target\n");
6399 return false;
6401 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6403 /* Generating permutation constant to shift all elements.
6404 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6405 for (i = 0; i < nelt; i++)
6406 sel[i] = nelt / 2 + i;
6407 indices.new_vector (sel, 2, nelt);
6408 if (!can_vec_perm_const_p (vmode, vmode, indices))
6410 if (dump_enabled_p ())
6411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6412 "shift permutation is not supported by target\n");
6413 return false;
6415 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6417 /* Generating permutation constant to select vector from 2.
6418 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6419 for (i = 0; i < nelt / 2; i++)
6420 sel[i] = i;
6421 for (i = nelt / 2; i < nelt; i++)
6422 sel[i] = nelt + i;
6423 indices.new_vector (sel, 2, nelt);
6424 if (!can_vec_perm_const_p (vmode, vmode, indices))
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "select is not supported by target\n");
6429 return false;
6431 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6433 for (i = 0; i < log_length; i++)
6435 for (j = 0; j < length; j += 2)
6437 first_vect = dr_chain[j];
6438 second_vect = dr_chain[j + 1];
6440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6442 first_vect, first_vect,
6443 perm2_mask1);
6444 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6445 vect[0] = data_ref;
6447 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6448 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6449 second_vect, second_vect,
6450 perm2_mask2);
6451 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6452 vect[1] = data_ref;
6454 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6455 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6456 vect[0], vect[1], shift1_mask);
6457 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6458 (*result_chain)[j/2 + length/2] = data_ref;
6460 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6461 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6462 vect[0], vect[1], select_mask);
6463 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6464 (*result_chain)[j/2] = data_ref;
6466 memcpy (dr_chain.address (), result_chain->address (),
6467 length * sizeof (tree));
6469 return true;
6471 if (length == 3 && vf > 2)
6473 unsigned int k = 0, l = 0;
6475 /* Generating permutation constant to get all elements in the right order.
6476 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6477 for (i = 0; i < nelt; i++)
6479 if (3 * k + (l % 3) >= nelt)
6481 k = 0;
6482 l += (3 - (nelt % 3));
6484 sel[i] = 3 * k + (l % 3);
6485 k++;
6487 vec_perm_indices indices (sel, 2, nelt);
6488 if (!can_vec_perm_const_p (vmode, vmode, indices))
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "shuffle of 3 fields structure is not \
6493 supported by target\n");
6494 return false;
6496 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6498 /* Generating permutation constant to shift all elements.
6499 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6500 for (i = 0; i < nelt; i++)
6501 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6502 indices.new_vector (sel, 2, nelt);
6503 if (!can_vec_perm_const_p (vmode, vmode, indices))
6505 if (dump_enabled_p ())
6506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6507 "shift permutation is not supported by target\n");
6508 return false;
6510 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6512 /* Generating permutation constant to shift all elements.
6513 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6514 for (i = 0; i < nelt; i++)
6515 sel[i] = 2 * (nelt / 3) + 1 + i;
6516 indices.new_vector (sel, 2, nelt);
6517 if (!can_vec_perm_const_p (vmode, vmode, indices))
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6521 "shift permutation is not supported by target\n");
6522 return false;
6524 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6526 /* Generating permutation constant to shift all elements.
6527 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6528 for (i = 0; i < nelt; i++)
6529 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6530 indices.new_vector (sel, 2, nelt);
6531 if (!can_vec_perm_const_p (vmode, vmode, indices))
6533 if (dump_enabled_p ())
6534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6535 "shift permutation is not supported by target\n");
6536 return false;
6538 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6540 /* Generating permutation constant to shift all elements.
6541 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6542 for (i = 0; i < nelt; i++)
6543 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6544 indices.new_vector (sel, 2, nelt);
6545 if (!can_vec_perm_const_p (vmode, vmode, indices))
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6549 "shift permutation is not supported by target\n");
6550 return false;
6552 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6554 for (k = 0; k < 3; k++)
6556 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6557 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6558 dr_chain[k], dr_chain[k],
6559 perm3_mask);
6560 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6561 vect[k] = data_ref;
6564 for (k = 0; k < 3; k++)
6566 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6567 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6568 vect[k % 3], vect[(k + 1) % 3],
6569 shift1_mask);
6570 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6571 vect_shift[k] = data_ref;
6574 for (k = 0; k < 3; k++)
6576 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6577 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6578 vect_shift[(4 - k) % 3],
6579 vect_shift[(3 - k) % 3],
6580 shift2_mask);
6581 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6582 vect[k] = data_ref;
6585 (*result_chain)[3 - (nelt % 3)] = vect[2];
6587 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6588 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6589 vect[0], shift3_mask);
6590 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6591 (*result_chain)[nelt % 3] = data_ref;
6593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6595 vect[1], shift4_mask);
6596 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6597 (*result_chain)[0] = data_ref;
6598 return true;
6600 return false;
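/* Illustration only: the "shift of a concatenation" primitive that the
   shift*_mask permutations above rely on.  A selector of the form
   {S, S+1, ..., S+NELT-1} takes NELT consecutive elements starting at
   position S of the 2*NELT-element concatenation of the two inputs; e.g.
   with NELT == 8 and S == 6 this is the "shift right by 6" step from the
   function comment.  Scalar sketch only, nothing target-specific.  */

static void
toy_shift_concat (const unsigned int *v1, const unsigned int *v2,
		  unsigned int nelt, unsigned int start,
		  unsigned int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int pos = start + i;	/* Index into V1 ++ V2.  */
      out[i] = pos < nelt ? v1[pos] : v2[pos - nelt];
    }
}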
6603 /* Function vect_transform_grouped_load.
6605 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6606 to perform their permutation and ascribe the resulting vectorized statements to
6607 the scalar statements.
6610 void
6611 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6612 vec<tree> dr_chain,
6613 int size, gimple_stmt_iterator *gsi)
6615 machine_mode mode;
6616 vec<tree> result_chain = vNULL;
6618 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6619 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6620 vectors, that are ready for vector computation. */
6621 result_chain.create (size);
6623 /* If the reassociation width for the vector type is 2 or greater, the
6624 target machine can execute 2 or more vector instructions in parallel.
6625 Otherwise try to get the load group's chain via vect_shift_permute_load_chain.  */
6626 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6627 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6628 || pow2p_hwi (size)
6629 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6630 gsi, &result_chain))
6631 vect_permute_load_chain (vinfo, dr_chain,
6632 size, stmt_info, gsi, &result_chain);
6633 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6634 result_chain.release ();
6637 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6638 generated as part of the vectorization of STMT_INFO. Assign the statement
6639 for each vector to the associated scalar statement. */
6641 void
6642 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6643 vec<tree> result_chain)
6645 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6646 unsigned int i, gap_count;
6647 tree tmp_data_ref;
6649 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6650 Since we scan the chain starting from its first node, their order
6651 corresponds to the order of data-refs in RESULT_CHAIN.  */
6652 stmt_vec_info next_stmt_info = first_stmt_info;
6653 gap_count = 1;
6654 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6656 if (!next_stmt_info)
6657 break;
6659 /* Skip the gaps.  Loads created for the gaps will be removed later by
6660 the dead code elimination pass.  No need to check for the first stmt in
6661 the group, since it always exists.
6662 DR_GROUP_GAP is the number of steps in elements from the previous
6663 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6664 correspond to the gaps. */
6665 if (next_stmt_info != first_stmt_info
6666 && gap_count < DR_GROUP_GAP (next_stmt_info))
6668 gap_count++;
6669 continue;
6672 /* ??? The following needs cleanup after the removal of
6673 DR_GROUP_SAME_DR_STMT. */
6674 if (next_stmt_info)
6676 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6677 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6678 copies, and we put the new vector statement last. */
6679 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6681 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6682 gap_count = 1;
6687 /* Function vect_can_force_dr_alignment_p.
6689 Returns whether the alignment of a DECL can be forced to be aligned
6690 on an ALIGNMENT-bit boundary.  */
6692 bool
6693 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6695 if (!VAR_P (decl))
6696 return false;
6698 if (decl_in_symtab_p (decl)
6699 && !symtab_node::get (decl)->can_increase_alignment_p ())
6700 return false;
6702 if (TREE_STATIC (decl))
6703 return (known_le (alignment,
6704 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6705 else
6706 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6709 /* Return whether the data reference DR_INFO is supported with respect to its
6710 alignment.
6711 MISALIGNMENT is the known misalignment of the access, 0 if the access
6712 is known to be aligned, or DR_MISALIGNMENT_UNKNOWN if the misalignment
6713 is not known at compile time.  */
6715 enum dr_alignment_support
6716 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6717 tree vectype, int misalignment)
6719 data_reference *dr = dr_info->dr;
6720 stmt_vec_info stmt_info = dr_info->stmt;
6721 machine_mode mode = TYPE_MODE (vectype);
6722 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6723 class loop *vect_loop = NULL;
6724 bool nested_in_vect_loop = false;
6726 if (misalignment == 0)
6727 return dr_aligned;
6729 /* For now assume all conditional loads/stores support unaligned
6730 access without any special code. */
6731 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6732 if (gimple_call_internal_p (stmt)
6733 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6734 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6735 return dr_unaligned_supported;
6737 if (loop_vinfo)
6739 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6740 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6743 /* Possibly unaligned access. */
6745 /* We can choose between using the implicit realignment scheme (generating
6746 a misaligned_move stmt) and the explicit realignment scheme (generating
6747 aligned loads with a REALIGN_LOAD). There are two variants to the
6748 explicit realignment scheme: optimized, and unoptimized.
6749 We can optimize the realignment only if the step between consecutive
6750 vector loads is equal to the vector size. Since the vector memory
6751 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6752 is guaranteed that the misalignment amount remains the same throughout the
6753 execution of the vectorized loop. Therefore, we can create the
6754 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6755 at the loop preheader.
6757 However, in the case of outer-loop vectorization, when vectorizing a
6758 memory access in the inner-loop nested within the LOOP that is now being
6759 vectorized, while it is guaranteed that the misalignment of the
6760 vectorized memory access will remain the same in different outer-loop
6761 iterations, it is *not* guaranteed that it will remain the same throughout
6762 the execution of the inner-loop. This is because the inner-loop advances
6763 with the original scalar step (and not in steps of VS). If the inner-loop
6764 step happens to be a multiple of VS, then the misalignment remains fixed
6765 and we can use the optimized realignment scheme. For example:
6767 for (i=0; i<N; i++)
6768 for (j=0; j<M; j++)
6769 s += a[i+j];
6771 When vectorizing the i-loop in the above example, the step between
6772 consecutive vector loads is 1, and so the misalignment does not remain
6773 fixed across the execution of the inner-loop, and the realignment cannot
6774 be optimized (as illustrated in the following pseudo vectorized loop):
6776 for (i=0; i<N; i+=4)
6777 for (j=0; j<M; j++){
6778 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6779 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6780 // (assuming that we start from an aligned address).
6783 We therefore have to use the unoptimized realignment scheme:
6785 for (i=0; i<N; i+=4)
6786 for (j=k; j<M; j+=4)
6787 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6788 // that the misalignment of the initial address is
6789 // 0).
6791 The loop can then be vectorized as follows:
6793 for (k=0; k<4; k++){
6794 rt = get_realignment_token (&vp[k]);
6795 for (i=0; i<N; i+=4){
6796 v1 = vp[i+k];
6797 for (j=k; j<M; j+=4){
6798 v2 = vp[i+j+VS-1];
6799 va = REALIGN_LOAD <v1,v2,rt>;
6800 vs += va;
6801 v1 = v2;
6804 } */
6806 if (DR_IS_READ (dr))
6808 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6809 && (!targetm.vectorize.builtin_mask_for_load
6810 || targetm.vectorize.builtin_mask_for_load ()))
6812 /* If we are doing SLP then the accesses need not have the
6813 same alignment; instead it depends on the SLP group size. */
6814 if (loop_vinfo
6815 && STMT_SLP_TYPE (stmt_info)
6816 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6817 * (DR_GROUP_SIZE
6818 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6819 TYPE_VECTOR_SUBPARTS (vectype)))
6821 else if (!loop_vinfo
6822 || (nested_in_vect_loop
6823 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6824 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6825 return dr_explicit_realign;
6826 else
6827 return dr_explicit_realign_optimized;
6831 bool is_packed = false;
6832 tree type = TREE_TYPE (DR_REF (dr));
6833 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6834 is_packed = not_size_aligned (DR_REF (dr));
6835 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6836 is_packed))
6837 return dr_unaligned_supported;
6839 /* Unsupported. */
6840 return dr_unaligned_unsupported;
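/* Toy illustration (not used anywhere) of the point made in the comment
   above: when the inner loop advances by the original scalar step, the
   misalignment of the vector access cycles through every residue, so the
   realignment token cannot be computed once outside the loop.  The element
   and vector sizes below are made-up values matching the pseudo code.  */

static unsigned int
toy_inner_loop_misalignment (unsigned int j)
{
  const unsigned int elem_size = 4;	/* sizeof (*vp) in the example.  */
  const unsigned int vec_size = 16;	/* VS in bytes, i.e. 4 elements.  */
  /* For an aligned starting address, the misalignment (in elements) of
     &vp[i+j] is ((j * elem_size) % vec_size) / elem_size, i.e.
     0,1,2,3,0,1,2,3,... as j increases -- the sequence shown in the
     pseudo vectorized loop above.  */
  return (j * elem_size) % vec_size / elem_size;
}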