1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
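/* For instance, with COUNT == 3 and a 128-bit VECTYPE this asks whether the
   target can load or store three interleaved vectors with a single lanes
   instruction (e.g. LD3/ST3 on AArch64) instead of emulating the interleaving
   with permutes.  */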
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.  */
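/* For instance, for 'int_x = (int) short_y' this returns the 2-byte short
   type; with 128-bit vectors the smallest type gives a VF of 8, whereas the
   4-byte result type alone would only suggest a VF of 4.  */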
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
121 HOST_WIDE_INT lhs, rhs;
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn))
155 /* For loads the LHS type does the trick. */
156 i = ~0U;
157 else if (internal_store_fn_p (ifn))
159 /* For stores use the type of the stored value. */
160 i = internal_fn_stored_value_index (ifn);
161 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
162 i = ~0U;
164 else if (internal_fn_mask_index (ifn) == 0)
165 i = 1;
167 if (i < gimple_call_num_args (call))
169 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
170 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
172 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
173 if (rhs < lhs)
174 scalar_type = rhs_type;
179 return scalar_type;
183 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
184 tested at run-time. Return TRUE if DDR was successfully inserted.
185 Return false if versioning is not supported. */
187 static opt_result
188 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
192 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
193 return opt_result::failure_at (vect_location,
194 "will not create alias checks, as"
195 " --param vect-max-version-for-alias-checks"
196 " == 0\n");
198 opt_result res
199 = runtime_alias_check_p (ddr, loop,
200 optimize_loop_nest_for_speed_p (loop));
201 if (!res)
202 return res;
204 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
205 return opt_result::success ();
208 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
210 static void
211 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
213 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
214 for (unsigned int i = 0; i < checks.length(); ++i)
215 if (checks[i] == value)
216 return;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location,
220 "need run-time check that %T is nonzero\n",
221 value);
222 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 /* Return true if we know that the order of vectorized DR_INFO_A and
226 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
227 DR_INFO_B. At least one of the accesses is a write. */
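/* For instance, for a scalar load from group A that precedes a scalar store
   from group B, the vectorized load is emitted at A's first scalar load and
   the vectorized store at B's last scalar store, so the load still precedes
   the store and the scalar order is preserved.  */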
229 static bool
230 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
232 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
233 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
235 /* Single statements are always kept in their original order. */
236 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
237 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
238 return true;
240 /* STMT_A and STMT_B belong to overlapping groups. All loads are
241 emitted at the position of the first scalar load.
242 Stores in a group are emitted at the position of the last scalar store.
243 Compute that position and check whether the resulting order matches
244 the current one. */
245 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (il_a)
248 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 il_a = get_later_stmt (il_a, s);
252 else /* DR_IS_READ */
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
258 else
259 il_a = stmtinfo_a;
260 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
261 if (il_b)
263 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
264 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
265 s = DR_GROUP_NEXT_ELEMENT (s))
266 il_b = get_later_stmt (il_b, s);
267 else /* DR_IS_READ */
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 if (get_later_stmt (il_b, s) == il_b)
271 il_b = s;
273 else
274 il_b = stmtinfo_b;
275 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
276 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 /* A subroutine of vect_analyze_data_ref_dependence. Handle
280 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
281 distances. These distances are conservatively correct but they don't
282 reflect a guaranteed dependence.
284 Return true if this function does all the work necessary to avoid
285 an alias or false if the caller should use the dependence distances
286 to limit the vectorization factor in the usual way. LOOP_DEPTH is
287 the depth of the loop described by LOOP_VINFO and the other arguments
288 are as for vect_analyze_data_ref_dependence. */
290 static bool
291 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
292 loop_vec_info loop_vinfo,
293 int loop_depth, unsigned int *max_vf)
295 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
296 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
298 int dist = dist_v[loop_depth];
299 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
301 /* If the user asserted safelen >= DIST consecutive iterations
302 can be executed concurrently, assume independence.
304 ??? An alternative would be to add the alias check even
305 in this case, and vectorize the fallback loop with the
306 maximum VF set to safelen. However, if the user has
307 explicitly given a length, it's less likely that that
308 would be a win. */
309 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
311 if ((unsigned int) loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 continue;
317 /* For dependence distances of 2 or more, we have the option
318 of limiting VF or checking for an alias at runtime.
319 Prefer to check at runtime if we can, to avoid limiting
320 the VF unnecessarily when the bases are in fact independent.
322 Note that the alias checks will be removed if the VF ends up
323 being small enough. */
324 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
325 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
326 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
327 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
328 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 return true;
335 /* Function vect_analyze_data_ref_dependence.
337 FIXME: I needed to change the sense of the returned flag.
339 Return FALSE if there (might) exist a dependence between a memory-reference
340 DRA and a memory-reference DRB. When versioning for alias may check a
341 dependence at run-time, return TRUE. Adjust *MAX_VF according to
342 the data dependence. */
344 static opt_result
345 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
346 loop_vec_info loop_vinfo,
347 unsigned int *max_vf)
349 unsigned int i;
350 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
351 struct data_reference *dra = DDR_A (ddr);
352 struct data_reference *drb = DDR_B (ddr);
353 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
354 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
355 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
356 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
357 lambda_vector dist_v;
358 unsigned int loop_depth;
360 /* If user asserted safelen consecutive iterations can be
361 executed concurrently, assume independence. */
362 auto apply_safelen = [&]()
364 if (loop->safelen >= 2)
366 if ((unsigned int) loop->safelen < *max_vf)
367 *max_vf = loop->safelen;
368 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
369 return true;
371 return false;
374 /* In loop analysis all data references should be vectorizable. */
375 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
376 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
377 gcc_unreachable ();
379 /* Independent data accesses. */
380 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
381 return opt_result::success ();
383 if (dra == drb
384 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
385 return opt_result::success ();
387 /* We do not have to consider dependences between accesses that belong
388 to the same group, unless the stride could be smaller than the
389 group size. */
390 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
391 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
393 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
394 return opt_result::success ();
396 /* Even if we have an anti-dependence then, as the vectorized loop covers at
397 least two scalar iterations, there is always also a true dependence.
398 As the vectorizer does not re-order loads and stores we can ignore
399 the anti-dependence if TBAA can disambiguate both DRs similar to the
400 case with known negative distance anti-dependences (positive
401 distance anti-dependences would violate TBAA constraints). */
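/* For instance, under strict aliasing a read through an 'int *' and a write
   through a 'float *' have non-conflicting alias sets, so such an
   anti-dependence can be disregarded here.  */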
402 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
403 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
404 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
405 get_alias_set (DR_REF (drb))))
406 return opt_result::success ();
408 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
409 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 if (apply_safelen ())
412 return opt_result::success ();
414 return opt_result::failure_at
415 (stmtinfo_a->stmt,
416 "possible alias involving gather/scatter between %T and %T\n",
417 DR_REF (dra), DR_REF (drb));
420 /* Unknown data dependence. */
421 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
423 if (apply_safelen ())
424 return opt_result::success ();
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
428 "versioning for alias required: "
429 "can't determine dependence between %T and %T\n",
430 DR_REF (dra), DR_REF (drb));
432 /* Add to list of ddrs that need to be tested at run-time. */
433 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 /* Known data dependence. */
437 if (DDR_NUM_DIST_VECTS (ddr) == 0)
439 if (apply_safelen ())
440 return opt_result::success ();
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
444 "versioning for alias required: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447 /* Add to list of ddrs that need to be tested at run-time. */
448 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
453 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
454 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
455 loop_depth, max_vf))
456 return opt_result::success ();
458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
460 int dist = dist_v[loop_depth];
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "dependence distance = %d.\n", dist);
466 if (dist == 0)
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance == 0 between %T and %T\n",
471 DR_REF (dra), DR_REF (drb));
473 /* When we perform grouped accesses and perform implicit CSE
474 by detecting equal accesses and doing disambiguation with
475 runtime alias tests like for
476 .. = a[i];
477 .. = a[i+1];
478 a[i] = ..;
479 a[i+1] = ..;
480 *p = ..;
481 .. = a[i];
482 .. = a[i+1];
483 where we will end up loading { a[i], a[i+1] } once, make
484 sure that inserting group loads before the first load and
485 stores after the last store will do the right thing.
486 Similar for groups like
487 a[i] = ...;
488 ... = a[i];
489 a[i+1] = ...;
490 where loads from the group interleave with the store. */
491 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
492 return opt_result::failure_at (stmtinfo_a->stmt,
493 "READ_WRITE dependence"
494 " in interleaving.\n");
496 if (loop->safelen < 2)
498 tree indicator = dr_zero_step_indicator (dra);
499 if (!indicator || integer_zerop (indicator))
500 return opt_result::failure_at (stmtinfo_a->stmt,
501 "access also has a zero step\n");
502 else if (TREE_CODE (indicator) != INTEGER_CST)
503 vect_check_nonzero_value (loop_vinfo, indicator);
505 continue;
508 if (dist > 0 && DDR_REVERSED_P (ddr))
510 /* If DDR_REVERSED_P the order of the data-refs in DDR was
511 reversed (to make distance vector positive), and the actual
512 distance is negative. */
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance negative.\n");
516 /* When doing outer loop vectorization, we need to check if there is
517 a backward dependence at the inner loop level if the dependence
518 at the outer loop is reversed. See PR81740. */
519 if (nested_in_vect_loop_p (loop, stmtinfo_a)
520 || nested_in_vect_loop_p (loop, stmtinfo_b))
522 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
523 DDR_LOOP_NEST (ddr));
524 if (dist_v[inner_depth] < 0)
525 return opt_result::failure_at (stmtinfo_a->stmt,
526 "not vectorized, dependence "
527 "between data-refs %T and %T\n",
528 DR_REF (dra), DR_REF (drb));
530 /* Record a negative dependence distance to later limit the
531 amount of stmt copying / unrolling we can perform.
532 Only need to handle read-after-write dependence. */
533 if (DR_IS_READ (drb)
534 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
535 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
536 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
537 continue;
540 unsigned int abs_dist = abs (dist);
541 if (abs_dist >= 2 && abs_dist < *max_vf)
543 /* The dependence distance requires reduction of the maximal
544 vectorization factor. */
545 *max_vf = abs_dist;
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location,
548 "adjusting maximal vectorization factor to %i\n",
549 *max_vf);
552 if (abs_dist >= *max_vf)
554 /* Dependence distance does not create dependence, as far as
555 vectorization is concerned, in this case. */
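/* For instance, 'a[i] = a[i - 32]' with a VF of at most 32: each vector
   iteration only reads elements written in an earlier vector iteration
   (or before the loop), so the dependence is respected.  */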
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "dependence distance >= VF.\n");
559 continue;
562 return opt_result::failure_at (stmtinfo_a->stmt,
563 "not vectorized, possible dependence "
564 "between data-refs %T and %T\n",
565 DR_REF (dra), DR_REF (drb));
568 return opt_result::success ();
571 /* Function vect_analyze_data_ref_dependences.
573 Examine all the data references in the loop, and make sure there do not
574 exist any data dependences between them. Set *MAX_VF according to
575 the maximum vectorization factor the data dependences allow. */
577 opt_result
578 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
579 unsigned int *max_vf)
581 unsigned int i;
582 struct data_dependence_relation *ddr;
584 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
586 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
588 LOOP_VINFO_DDRS (loop_vinfo)
589 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
590 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
591 /* We do not need read-read dependences. */
592 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
593 &LOOP_VINFO_DDRS (loop_vinfo),
594 LOOP_VINFO_LOOP_NEST (loop_vinfo),
595 false);
596 gcc_assert (res);
599 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
601 /* For epilogues we either have no aliases or alias versioning
602 was applied to original loop. Therefore we may just get max_vf
603 using VF of original loop. */
604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
605 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
606 else
607 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
609 opt_result res
610 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
611 if (!res)
612 return res;
615 return opt_result::success ();
619 /* Function vect_slp_analyze_data_ref_dependence.
621 Return TRUE if there (might) exist a dependence between a memory-reference
622 DRA and a memory-reference DRB for VINFO. When versioning for alias
623 may check a dependence at run-time, return FALSE. */
626 static bool
627 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
628 struct data_dependence_relation *ddr)
630 struct data_reference *dra = DDR_A (ddr);
631 struct data_reference *drb = DDR_B (ddr);
632 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
633 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
635 /* We need to check dependences of statements marked as unvectorizable
636 as well, they still can prohibit vectorization. */
638 /* Independent data accesses. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
640 return false;
642 if (dra == drb)
643 return false;
645 /* Read-read is OK. */
646 if (DR_IS_READ (dra) && DR_IS_READ (drb))
647 return false;
649 /* If dra and drb are part of the same interleaving chain consider
650 them independent. */
651 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
652 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
653 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
654 return false;
656 /* Unknown data dependence. */
657 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
661 "can't determine dependence between %T and %T\n",
662 DR_REF (dra), DR_REF (drb));
664 else if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "determined dependence between %T and %T\n",
667 DR_REF (dra), DR_REF (drb));
669 return true;
673 /* Analyze dependences involved in the transform of a store SLP NODE. */
675 static bool
676 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
678 /* This walks over all stmts involved in the SLP store done
679 in NODE verifying we can sink them up to the last stmt in the
680 group. */
681 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
682 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
684 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
686 stmt_vec_info access_info
687 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
688 if (access_info == last_access_info)
689 continue;
690 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
691 ao_ref ref;
692 bool ref_initialized_p = false;
693 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
694 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
696 gimple *stmt = gsi_stmt (gsi);
697 if (! gimple_vuse (stmt))
698 continue;
700 /* If we couldn't record a (single) data reference for this
701 stmt we have to resort to the alias oracle. */
702 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
703 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
704 if (!dr_b)
706 /* We are moving a store - this means
707 we cannot use TBAA for disambiguation. */
708 if (!ref_initialized_p)
709 ao_ref_init (&ref, DR_REF (dr_a));
710 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
711 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
712 return false;
713 continue;
716 gcc_assert (!gimple_visited_p (stmt));
718 ddr_p ddr = initialize_data_dependence_relation (dr_a,
719 dr_b, vNULL);
720 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
721 free_dependence_relation (ddr);
722 if (dependent)
723 return false;
726 return true;
729 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
730 contain the vector of scalar stores of this instance if we are
731 disambiguating the loads. */
733 static bool
734 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
735 vec<stmt_vec_info> stores,
736 stmt_vec_info last_store_info)
738 /* This walks over all stmts involved in the SLP load done
739 in NODE verifying we can hoist them up to the first stmt in the
740 group. */
741 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
742 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
744 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
746 stmt_vec_info access_info
747 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
748 if (access_info == first_access_info)
749 continue;
750 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
751 ao_ref ref;
752 bool ref_initialized_p = false;
753 hash_set<stmt_vec_info> grp_visited;
754 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
755 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
757 gimple *stmt = gsi_stmt (gsi);
758 if (! gimple_vdef (stmt))
759 continue;
761 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
763 /* If we run into a store of this same instance (we've just
764 marked those) then delay dependence checking until we run
765 into the last store because this is where it will have
766 been sunk to (and we verified that we can do that already). */
767 if (gimple_visited_p (stmt))
769 if (stmt_info != last_store_info)
770 continue;
772 for (stmt_vec_info &store_info : stores)
774 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
775 ddr_p ddr = initialize_data_dependence_relation
776 (dr_a, store_dr, vNULL);
777 bool dependent
778 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
779 free_dependence_relation (ddr);
780 if (dependent)
781 return false;
783 continue;
786 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
788 /* We are hoisting a load - this means we can use TBAA for
789 disambiguation. */
790 if (!ref_initialized_p)
791 ao_ref_init (&ref, DR_REF (dr_a));
792 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
794 /* If we couldn't record a (single) data reference for this
795 stmt we have to give up now. */
796 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
797 if (!dr_b)
798 return false;
799 ddr_p ddr = initialize_data_dependence_relation (dr_a,
800 dr_b, vNULL);
801 bool dependent
802 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
803 free_dependence_relation (ddr);
804 if (dependent)
805 return false;
807 /* No dependence. */
808 return true;
810 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
812 /* When we run into a store group we have to honor
813 that earlier stores might be moved here. We don't
814 know exactly which and where to since we lack a
815 back-mapping from DR to SLP node, so assume all
816 earlier stores are sunk here. It's enough to
817 consider the last stmt of a group for this.
818 ??? Both this and the fact that we disregard that
819 the conflicting instance might be removed later
820 is overly conservative. */
821 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
822 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
823 store_info != NULL;
824 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
825 if ((store_info == stmt_info
826 || get_later_stmt (store_info, stmt_info) == stmt_info)
827 && !check_hoist (store_info))
828 return false;
830 else
832 if (!check_hoist (stmt_info))
833 return false;
837 return true;
841 /* Function vect_slp_analyze_instance_dependence.
843 Check that the loads and stores of SLP instance INSTANCE can be moved to
844 their vectorized stmt insert locations (stores sunk to the last scalar
845 store, loads hoisted to the first scalar load) without violating any data dependence. */
847 bool
848 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
850 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
852 /* The stores of this instance are at the root of the SLP tree. */
853 slp_tree store = NULL;
854 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
855 store = SLP_INSTANCE_TREE (instance);
857 /* Verify we can sink stores to the vectorized stmt insert location. */
858 stmt_vec_info last_store_info = NULL;
859 if (store)
861 if (! vect_slp_analyze_store_dependences (vinfo, store))
862 return false;
864 /* Mark stores in this instance and remember the last one. */
865 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
866 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
867 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
870 bool res = true;
872 /* Verify we can sink loads to the vectorized stmt insert location,
873 special-casing stores of this instance. */
874 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
875 if (! vect_slp_analyze_load_dependences (vinfo, load,
876 store
877 ? SLP_TREE_SCALAR_STMTS (store)
878 : vNULL, last_store_info))
880 res = false;
881 break;
884 /* Unset the visited flag. */
885 if (store)
886 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
887 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
889 return res;
892 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
893 applied. */
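/* For instance, with a 16-byte target alignment, a recorded misalignment of 8
   for the first element of the group and a group offset DIFF of 4 for this
   element, the function returns (8 + 4) % 16 == 12 when OFFSET is zero.  */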
895 int
896 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
898 HOST_WIDE_INT diff = 0;
899 /* Alignment is only analyzed for the first element of a DR group,
900 use that but adjust misalignment by the offset of the access. */
901 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
903 dr_vec_info *first_dr
904 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
905 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
906 INTEGER_CSTs and the first element in the group has the lowest
907 address. */
908 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
909 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
910 gcc_assert (diff >= 0);
911 dr_info = first_dr;
914 int misalign = dr_info->misalignment;
915 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
916 if (misalign == DR_MISALIGNMENT_UNKNOWN)
917 return misalign;
919 /* If the access is only aligned for a vector type with smaller alignment
920 requirement the access has unknown misalignment. */
921 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
922 targetm.vectorize.preferred_vector_alignment (vectype)))
923 return DR_MISALIGNMENT_UNKNOWN;
925 /* Apply the offset from the DR group start and the externally supplied
926 offset which can for example result from a negative stride access. */
927 poly_int64 misalignment = misalign + diff + offset;
929 /* vect_compute_data_ref_alignment will have ensured that target_alignment
930 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
931 unsigned HOST_WIDE_INT target_alignment_c
932 = dr_info->target_alignment.to_constant ();
933 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
934 return DR_MISALIGNMENT_UNKNOWN;
935 return misalign;
938 /* Record the base alignment guarantee given by DRB, which occurs
939 in STMT_INFO. */
941 static void
942 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
943 innermost_loop_behavior *drb)
945 bool existed;
946 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
947 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
948 if (!existed || entry.second->base_alignment < drb->base_alignment)
950 entry = std::make_pair (stmt_info, drb);
951 if (dump_enabled_p ())
952 dump_printf_loc (MSG_NOTE, vect_location,
953 "recording new base alignment for %T\n"
954 " alignment: %d\n"
955 " misalignment: %d\n"
956 " based on: %G",
957 drb->base_address,
958 drb->base_alignment,
959 drb->base_misalignment,
960 stmt_info->stmt);
964 /* If the region we're going to vectorize is reached, all unconditional
965 data references occur at least once. We can therefore pool the base
966 alignment guarantees from each unconditional reference. Do this by
967 going through all the data references in VINFO and checking whether
968 the containing statement makes the reference unconditionally. If so,
969 record the alignment of the base address in VINFO so that it can be
970 used for all other references with the same base. */
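/* For instance, if an unconditional store guarantees that its base address is
   32-byte aligned, any other reference with the same base address can reuse
   that guarantee when its own misalignment is computed.  */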
972 void
973 vect_record_base_alignments (vec_info *vinfo)
975 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
976 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
977 for (data_reference *dr : vinfo->shared->datarefs)
979 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
980 stmt_vec_info stmt_info = dr_info->stmt;
981 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
982 && STMT_VINFO_VECTORIZABLE (stmt_info)
983 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
985 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
987 /* If DR is nested in the loop that is being vectorized, we can also
988 record the alignment of the base wrt the outer loop. */
989 if (loop && nested_in_vect_loop_p (loop, stmt_info))
990 vect_record_base_alignment
991 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
996 /* Function vect_compute_data_ref_alignment
998 Compute the misalignment of the data reference DR_INFO when vectorizing
999 with VECTYPE.
1001 Output:
1002 1. initialized misalignment info for DR_INFO
1004 FOR NOW: No analysis is actually performed. Misalignment is calculated
1005 only for trivial cases. TODO. */
1007 static void
1008 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1009 tree vectype)
1011 stmt_vec_info stmt_info = dr_info->stmt;
1012 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1013 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1014 class loop *loop = NULL;
1015 tree ref = DR_REF (dr_info->dr);
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_NOTE, vect_location,
1019 "vect_compute_data_ref_alignment:\n");
1021 if (loop_vinfo)
1022 loop = LOOP_VINFO_LOOP (loop_vinfo);
1024 /* Initialize misalignment to unknown. */
1025 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1027 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1028 return;
1030 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1031 bool step_preserves_misalignment_p;
1033 poly_uint64 vector_alignment
1034 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1035 BITS_PER_UNIT);
1036 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1038 /* If the main loop has peeled for alignment we have no way of knowing
1039 whether the data accesses in the epilogues are aligned. We can't at
1040 compile time answer the question whether we have entered the main loop or
1041 not. Fixes PR 92351. */
1042 if (loop_vinfo)
1044 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1045 if (orig_loop_vinfo
1046 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1047 return;
1050 unsigned HOST_WIDE_INT vect_align_c;
1051 if (!vector_alignment.is_constant (&vect_align_c))
1052 return;
1054 /* No step for BB vectorization. */
1055 if (!loop)
1057 gcc_assert (integer_zerop (drb->step));
1058 step_preserves_misalignment_p = true;
1061 /* In case the dataref is in an inner-loop of the loop that is being
1062 vectorized (LOOP), we use the base and misalignment information
1063 relative to the outer-loop (LOOP). This is ok only if the misalignment
1064 stays the same throughout the execution of the inner-loop, which is why
1065 we have to check that the stride of the dataref in the inner-loop evenly
1066 divides by the vector alignment. */
1067 else if (nested_in_vect_loop_p (loop, stmt_info))
1069 step_preserves_misalignment_p
1070 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1072 if (dump_enabled_p ())
1074 if (step_preserves_misalignment_p)
1075 dump_printf_loc (MSG_NOTE, vect_location,
1076 "inner step divides the vector alignment.\n");
1077 else
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 "inner step doesn't divide the vector"
1080 " alignment.\n");
1084 /* Similarly we can only use base and misalignment information relative to
1085 an innermost loop if the misalignment stays the same throughout the
1086 execution of the loop. As above, this is the case if the stride of
1087 the dataref evenly divides by the alignment. */
1088 else
1090 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1091 step_preserves_misalignment_p
1092 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1094 if (!step_preserves_misalignment_p && dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "step doesn't divide the vector alignment.\n");
1099 unsigned int base_alignment = drb->base_alignment;
1100 unsigned int base_misalignment = drb->base_misalignment;
1102 /* Calculate the maximum of the pooled base address alignment and the
1103 alignment that we can compute for DR itself. */
1104 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1105 = base_alignments->get (drb->base_address);
1106 if (entry
1107 && base_alignment < (*entry).second->base_alignment
1108 && (loop_vinfo
1109 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1110 gimple_bb (entry->first->stmt))
1111 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1112 || (entry->first->dr_aux.group <= dr_info->group)))))
1114 base_alignment = entry->second->base_alignment;
1115 base_misalignment = entry->second->base_misalignment;
1118 if (drb->offset_alignment < vect_align_c
1119 || !step_preserves_misalignment_p
1120 /* We need to know whether the step wrt the vectorized loop is
1121 negative when computing the starting misalignment below. */
1122 || TREE_CODE (drb->step) != INTEGER_CST)
1124 if (dump_enabled_p ())
1125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1126 "Unknown alignment for access: %T\n", ref);
1127 return;
1130 if (base_alignment < vect_align_c)
1132 unsigned int max_alignment;
1133 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1134 if (max_alignment < vect_align_c
1135 || !vect_can_force_dr_alignment_p (base,
1136 vect_align_c * BITS_PER_UNIT))
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_NOTE, vect_location,
1140 "can't force alignment of ref: %T\n", ref);
1141 return;
1144 /* Force the alignment of the decl.
1145 NOTE: This is the only change to the code we make during
1146 the analysis phase, before deciding to vectorize the loop. */
1147 if (dump_enabled_p ())
1148 dump_printf_loc (MSG_NOTE, vect_location,
1149 "force alignment of %T\n", ref);
1151 dr_info->base_decl = base;
1152 dr_info->base_misaligned = true;
1153 base_misalignment = 0;
1155 poly_int64 misalignment
1156 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1158 unsigned int const_misalignment;
1159 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "Non-constant misalignment for access: %T\n", ref);
1164 return;
1167 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1169 if (dump_enabled_p ())
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "misalign = %d bytes of ref %T\n",
1172 const_misalignment, ref);
1174 return;
1177 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1178 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1179 is made aligned via peeling. */
1181 static bool
1182 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1183 dr_vec_info *dr_peel_info)
1185 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1186 DR_TARGET_ALIGNMENT (dr_info)))
1188 poly_offset_int diff
1189 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1190 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1191 if (known_eq (diff, 0)
1192 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1193 return true;
1195 return false;
1198 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1199 aligned via peeling. */
1201 static bool
1202 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1203 dr_vec_info *dr_peel_info)
1205 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1206 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1207 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1208 DR_OFFSET (dr_peel_info->dr), 0)
1209 || !operand_equal_p (DR_STEP (dr_info->dr),
1210 DR_STEP (dr_peel_info->dr), 0))
1211 return false;
1213 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1216 /* Compute the value for dr_info->misalign so that the access appears
1217 aligned. This is used by peeling to compensate for dr_misalignment
1218 applying the offset for negative step. */
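/* For instance, for a negative-step V4SI access with a 16-byte target
   alignment this returns (4 - 1) * 4 == 12; together with the -12 byte offset
   that callers apply for negative strides, dr_misalignment then reports the
   access as aligned.  */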
1220 int
1221 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1223 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1224 return 0;
1226 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1227 poly_int64 misalignment
1228 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1229 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1231 unsigned HOST_WIDE_INT target_alignment_c;
1232 int misalign;
1233 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1234 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1235 return DR_MISALIGNMENT_UNKNOWN;
1236 return misalign;
1239 /* Function vect_update_misalignment_for_peel.
1240 Sets DR_INFO's misalignment
1241 - to 0 if it has the same alignment as DR_PEEL_INFO,
1242 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1243 - to -1 (unknown) otherwise.
1245 DR_INFO - the data reference whose misalignment is to be adjusted.
1246 DR_PEEL_INFO - the data reference whose misalignment is being made
1247 zero in the vector loop by the peel.
1248 NPEEL - the number of iterations in the peel loop if the misalignment
1249 of DR_PEEL_INFO is known at compile time. */
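/* For instance, with a 16-byte target alignment, a known misalignment of 4
   and DR_STEP 4, peeling NPEEL == 3 iterations gives (4 + 3 * 4) & 15 == 0,
   i.e. DR_INFO becomes aligned as well.  */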
1251 static void
1252 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1253 dr_vec_info *dr_peel_info, int npeel)
1255 /* If dr_info is aligned when dr_peel_info is, then mark it so. */
1256 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1258 SET_DR_MISALIGNMENT (dr_info,
1259 vect_dr_misalign_for_aligned_access (dr_peel_info));
1260 return;
1263 unsigned HOST_WIDE_INT alignment;
1264 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1265 && known_alignment_for_access_p (dr_info,
1266 STMT_VINFO_VECTYPE (dr_info->stmt))
1267 && known_alignment_for_access_p (dr_peel_info,
1268 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1270 int misal = dr_info->misalignment;
1271 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1272 misal &= alignment - 1;
1273 set_dr_misalignment (dr_info, misal);
1274 return;
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1279 "to unknown (-1).\n");
1280 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1283 /* Return true if alignment is relevant for DR_INFO. */
1285 static bool
1286 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1288 stmt_vec_info stmt_info = dr_info->stmt;
1290 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1291 return false;
1293 /* For interleaving, only the alignment of the first access matters. */
1294 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1295 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1296 return false;
1298 /* Scatter-gather and invariant accesses continue to address individual
1299 scalars, so vector-level alignment is irrelevant. */
1300 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301 || integer_zerop (DR_STEP (dr_info->dr)))
1302 return false;
1304 /* Strided accesses perform only component accesses, alignment is
1305 irrelevant for them. */
1306 if (STMT_VINFO_STRIDED_P (stmt_info)
1307 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1308 return false;
1310 return true;
1313 /* Given a memory reference EXP return whether its alignment is less
1314 than its size. */
1316 static bool
1317 not_size_aligned (tree exp)
1319 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1320 return true;
1322 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1323 > get_object_alignment (exp));
1326 /* Function vector_alignment_reachable_p
1328 Return true if vector alignment for DR_INFO is reachable by peeling
1329 a few loop iterations. Return false otherwise. */
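/* For instance, with V4SI and a known misalignment of two elements, peeling
   two scalar iterations reaches alignment; but for an interleaved group of
   size 3 those two iterations are not a multiple of the group size, so
   alignment is not considered reachable.  */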
1331 static bool
1332 vector_alignment_reachable_p (dr_vec_info *dr_info)
1334 stmt_vec_info stmt_info = dr_info->stmt;
1335 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1337 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1339 /* For interleaved access we peel only if number of iterations in
1340 the prolog loop ({VF - misalignment}) is a multiple of the
1341 number of the interleaved accesses. */
1342 int elem_size, mis_in_elements;
1344 /* FORNOW: handle only known alignment. */
1345 if (!known_alignment_for_access_p (dr_info, vectype))
1346 return false;
1348 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1349 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1350 elem_size = vector_element_size (vector_size, nelements);
1351 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1353 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1354 return false;
1357 /* If misalignment is known at the compile time then allow peeling
1358 only if natural alignment is reachable through peeling. */
1359 if (known_alignment_for_access_p (dr_info, vectype)
1360 && !aligned_access_p (dr_info, vectype))
1362 HOST_WIDE_INT elmsize =
1363 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1364 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_NOTE, vect_location,
1367 "data size = %wd. misalignment = %d.\n", elmsize,
1368 dr_misalignment (dr_info, vectype));
1370 if (dr_misalignment (dr_info, vectype) % elmsize)
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "data size does not divide the misalignment.\n");
1375 return false;
1379 if (!known_alignment_for_access_p (dr_info, vectype))
1381 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1382 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1385 "Unknown misalignment, %snaturally aligned\n",
1386 is_packed ? "not " : "");
1387 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1390 return true;
1394 /* Calculate the cost of the memory access represented by DR_INFO. */
1396 static void
1397 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1398 dr_alignment_support alignment_support_scheme,
1399 int misalignment,
1400 unsigned int *inside_cost,
1401 unsigned int *outside_cost,
1402 stmt_vector_for_cost *body_cost_vec,
1403 stmt_vector_for_cost *prologue_cost_vec)
1405 stmt_vec_info stmt_info = dr_info->stmt;
1406 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1407 int ncopies;
1409 if (PURE_SLP_STMT (stmt_info))
1410 ncopies = 1;
1411 else
1412 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1414 if (DR_IS_READ (dr_info->dr))
1415 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1416 misalignment, true, inside_cost,
1417 outside_cost, prologue_cost_vec, body_cost_vec, false);
1418 else
1419 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1420 misalignment, inside_cost, body_cost_vec);
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "vect_get_data_access_cost: inside_cost = %d, "
1425 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1429 typedef struct _vect_peel_info
1431 dr_vec_info *dr_info;
1432 int npeel;
1433 unsigned int count;
1434 } *vect_peel_info;
1436 typedef struct _vect_peel_extended_info
1438 vec_info *vinfo;
1439 struct _vect_peel_info peel_info;
1440 unsigned int inside_cost;
1441 unsigned int outside_cost;
1442 } *vect_peel_extended_info;
1445 /* Peeling hashtable helpers. */
1447 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1449 static inline hashval_t hash (const _vect_peel_info *);
1450 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1453 inline hashval_t
1454 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1456 return (hashval_t) peel_info->npeel;
1459 inline bool
1460 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1462 return (a->npeel == b->npeel);
1466 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1468 static void
1469 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1470 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1471 int npeel, bool supportable_if_not_aligned)
1473 struct _vect_peel_info elem, *slot;
1474 _vect_peel_info **new_slot;
1476 elem.npeel = npeel;
1477 slot = peeling_htab->find (&elem);
1478 if (slot)
1479 slot->count++;
1480 else
1482 slot = XNEW (struct _vect_peel_info);
1483 slot->npeel = npeel;
1484 slot->dr_info = dr_info;
1485 slot->count = 1;
1486 new_slot = peeling_htab->find_slot (slot, INSERT);
1487 *new_slot = slot;
1490 /* If this DR is not supported with unknown misalignment then bias
1491 this slot when the cost model is disabled. */
1492 if (!supportable_if_not_aligned
1493 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1494 slot->count += VECT_MAX_COST;
1498 /* Traverse peeling hash table to find peeling option that aligns maximum
1499 number of data accesses. */
1501 int
1502 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1503 _vect_peel_extended_info *max)
1505 vect_peel_info elem = *slot;
1507 if (elem->count > max->peel_info.count
1508 || (elem->count == max->peel_info.count
1509 && max->peel_info.npeel > elem->npeel))
1511 max->peel_info.npeel = elem->npeel;
1512 max->peel_info.count = elem->count;
1513 max->peel_info.dr_info = elem->dr_info;
1516 return 1;
1519 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1520 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1521 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1522 after peeling. */
1524 static void
1525 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1526 dr_vec_info *dr0_info,
1527 unsigned int *inside_cost,
1528 unsigned int *outside_cost,
1529 stmt_vector_for_cost *body_cost_vec,
1530 stmt_vector_for_cost *prologue_cost_vec,
1531 unsigned int npeel)
1533 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1535 bool dr0_alignment_known_p
1536 = (dr0_info
1537 && known_alignment_for_access_p (dr0_info,
1538 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1540 for (data_reference *dr : datarefs)
1542 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1543 if (!vect_relevant_for_alignment_p (dr_info))
1544 continue;
1546 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1547 dr_alignment_support alignment_support_scheme;
1548 int misalignment;
1549 unsigned HOST_WIDE_INT alignment;
1551 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1552 size_zero_node) < 0;
1553 poly_int64 off = 0;
1554 if (negative)
1555 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1556 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1558 if (npeel == 0)
1559 misalignment = dr_misalignment (dr_info, vectype, off);
1560 else if (dr_info == dr0_info
1561 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1562 misalignment = 0;
1563 else if (!dr0_alignment_known_p
1564 || !known_alignment_for_access_p (dr_info, vectype)
1565 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1566 misalignment = DR_MISALIGNMENT_UNKNOWN;
1567 else
1569 misalignment = dr_misalignment (dr_info, vectype, off);
1570 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1571 misalignment &= alignment - 1;
1573 alignment_support_scheme
1574 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1575 misalignment);
1577 vect_get_data_access_cost (loop_vinfo, dr_info,
1578 alignment_support_scheme, misalignment,
1579 inside_cost, outside_cost,
1580 body_cost_vec, prologue_cost_vec);
1584 /* Traverse peeling hash table and calculate cost for each peeling option.
1585 Find the one with the lowest cost. */
1587 int
1588 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1589 _vect_peel_extended_info *min)
1591 vect_peel_info elem = *slot;
1592 int dummy;
1593 unsigned int inside_cost = 0, outside_cost = 0;
1594 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1595 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1596 epilogue_cost_vec;
1598 prologue_cost_vec.create (2);
1599 body_cost_vec.create (2);
1600 epilogue_cost_vec.create (2);
1602 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1603 &outside_cost, &body_cost_vec,
1604 &prologue_cost_vec, elem->npeel);
1606 body_cost_vec.release ();
1608 outside_cost += vect_get_known_peeling_cost
1609 (loop_vinfo, elem->npeel, &dummy,
1610 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1611 &prologue_cost_vec, &epilogue_cost_vec);
1613 /* Prologue and epilogue costs are added to the target model later.
1614 These costs depend only on the scalar iteration cost, the
1615 number of peeling iterations finally chosen, and the number of
1616 misaligned statements. So discard the information found here. */
1617 prologue_cost_vec.release ();
1618 epilogue_cost_vec.release ();
1620 if (inside_cost < min->inside_cost
1621 || (inside_cost == min->inside_cost
1622 && outside_cost < min->outside_cost))
1624 min->inside_cost = inside_cost;
1625 min->outside_cost = outside_cost;
1626 min->peel_info.dr_info = elem->dr_info;
1627 min->peel_info.npeel = elem->npeel;
1628 min->peel_info.count = elem->count;
1631 return 1;
1635 /* Choose best peeling option by traversing peeling hash table and either
1636 choosing an option with the lowest cost (if cost model is enabled) or the
1637 option that aligns as many accesses as possible. */
1639 static struct _vect_peel_extended_info
1640 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1641 loop_vec_info loop_vinfo)
1643 struct _vect_peel_extended_info res;
1645 res.peel_info.dr_info = NULL;
1646 res.vinfo = loop_vinfo;
1648 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1650 res.inside_cost = INT_MAX;
1651 res.outside_cost = INT_MAX;
1652 peeling_htab->traverse <_vect_peel_extended_info *,
1653 vect_peeling_hash_get_lowest_cost> (&res);
1655 else
1657 res.peel_info.count = 0;
1658 peeling_htab->traverse <_vect_peel_extended_info *,
1659 vect_peeling_hash_get_most_frequent> (&res);
1660 res.inside_cost = 0;
1661 res.outside_cost = 0;
1664 return res;
1667 /* Return true if the new peeling NPEEL is supported. */
1669 static bool
1670 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1671 unsigned npeel)
1673 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1676 bool dr0_alignment_known_p
1677 = known_alignment_for_access_p (dr0_info,
1678 STMT_VINFO_VECTYPE (dr0_info->stmt));
1680 /* Ensure that all data refs can be vectorized after the peel. */
1681 for (data_reference *dr : datarefs)
1683 if (dr == dr0_info->dr)
1684 continue;
1686 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1687 if (!vect_relevant_for_alignment_p (dr_info)
1688 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1689 continue;
1691 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1692 int misalignment;
1693 unsigned HOST_WIDE_INT alignment;
1694 if (!dr0_alignment_known_p
1695 || !known_alignment_for_access_p (dr_info, vectype)
1696 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1697 misalignment = DR_MISALIGNMENT_UNKNOWN;
1698 else
1700 misalignment = dr_misalignment (dr_info, vectype);
1701 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1702 misalignment &= alignment - 1;
1704 supportable_dr_alignment
1705 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1706 misalignment);
1707 if (supportable_dr_alignment == dr_unaligned_unsupported)
1708 return false;
1711 return true;
1714 /* Compare two data-references DRA and DRB to group them into chunks
1715 with related alignment. */
1717 static int
1718 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1720 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1721 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1722 int cmp;
1724 /* Stabilize sort. */
1725 if (dra == drb)
1726 return 0;
1728 /* Ordering of DRs according to base. */
1729 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1730 DR_BASE_ADDRESS (drb));
1731 if (cmp != 0)
1732 return cmp;
1734 /* And according to DR_OFFSET. */
1735 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1736 if (cmp != 0)
1737 return cmp;
1739 /* And after step. */
1740 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1741 if (cmp != 0)
1742 return cmp;
1744 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1745 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1746 if (cmp == 0)
1747 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1748 return cmp;
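/* For illustration only: given references a[i], a[i+1] and b[i] in the same
   loop (4-byte elements), a[i] and a[i+1] share DR_BASE_ADDRESS, DR_OFFSET
   and DR_STEP and differ only in DR_INIT (0 vs. 4), so the comparator above
   places them next to each other ordered by DR_INIT, while b[i] has a
   different base and ends up in a separate chunk.  */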
1751 /* Function vect_enhance_data_refs_alignment
1753 This pass will use loop versioning and loop peeling in order to enhance
1754 the alignment of data references in the loop.
1756 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1757 original loop is to be vectorized. Any other loops that are created by
1758 the transformations performed in this pass are not supposed to be
1759 vectorized. This restriction will be relaxed.
1761 This pass will require a cost model to guide it on whether to apply peeling
1762 or versioning or a combination of the two. For example, the scheme that
1763 Intel uses when given a loop with several memory accesses is as follows:
1764 choose one memory access ('p') whose alignment you want to force by doing
1765 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1766 other accesses are not necessarily aligned, or (2) use loop versioning to
1767 generate one loop in which all accesses are aligned, and another loop in
1768 which only 'p' is necessarily aligned.
1770 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1771 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1772 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1774 Devising a cost model is the most critical aspect of this work. It will
1775 guide us on which access to peel for, whether to use loop versioning, how
1776 many versions to create, etc. The cost model will probably consist of
1777 generic considerations as well as target specific considerations (on
1778 powerpc for example, misaligned stores are more painful than misaligned
1779 loads).
1781 Here are the general steps involved in alignment enhancements:
1783 -- original loop, before alignment analysis:
1784 for (i=0; i<N; i++){
1785 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1786 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1789 -- After vect_compute_data_refs_alignment:
1790 for (i=0; i<N; i++){
1791 x = q[i]; # DR_MISALIGNMENT(q) = 3
1792 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1795 -- Possibility 1: we do loop versioning:
1796 if (p is aligned) {
1797 for (i=0; i<N; i++){ # loop 1A
1798 x = q[i]; # DR_MISALIGNMENT(q) = 3
1799 p[i] = y; # DR_MISALIGNMENT(p) = 0
1802 else {
1803 for (i=0; i<N; i++){ # loop 1B
1804 x = q[i]; # DR_MISALIGNMENT(q) = 3
1805 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1809 -- Possibility 2: we do loop peeling:
1810 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1811 x = q[i];
1812 p[i] = y;
1814 for (i = 3; i < N; i++){ # loop 2A
1815 x = q[i]; # DR_MISALIGNMENT(q) = 0
1816 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1819 -- Possibility 3: combination of loop peeling and versioning:
1820 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1821 x = q[i];
1822 p[i] = y;
1824 if (p is aligned) {
1825 for (i = 3; i<N; i++){ # loop 3A
1826 x = q[i]; # DR_MISALIGNMENT(q) = 0
1827 p[i] = y; # DR_MISALIGNMENT(p) = 0
1830 else {
1831 for (i = 3; i<N; i++){ # loop 3B
1832 x = q[i]; # DR_MISALIGNMENT(q) = 0
1833 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1837 These loops are later passed to loop_transform to be vectorized. The
1838 vectorizer will use the alignment information to guide the transformation
1839 (whether to generate regular loads/stores, or with special handling for
1840 misalignment). */
1842 opt_result
1843 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1845 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1846 dr_vec_info *first_store = NULL;
1847 dr_vec_info *dr0_info = NULL;
1848 struct data_reference *dr;
1849 unsigned int i;
1850 bool do_peeling = false;
1851 bool do_versioning = false;
1852 unsigned int npeel = 0;
1853 bool one_misalignment_known = false;
1854 bool one_misalignment_unknown = false;
1855 bool one_dr_unsupportable = false;
1856 dr_vec_info *unsupportable_dr_info = NULL;
1857 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1858 hash_table<peel_info_hasher> peeling_htab (1);
1860 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1862 /* Reset data so we can safely be called multiple times. */
1863 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1864 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1866 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1867 return opt_result::success ();
1869 /* Sort the vector of datarefs so DRs that have the same or dependent
1870 alignment are next to each other. */
1871 auto_vec<data_reference_p> datarefs
1872 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1873 datarefs.qsort (dr_align_group_sort_cmp);
1875 /* For each dataref, compute the number of other DRs that become
1876 aligned when we peel so that this dataref becomes aligned. */
1877 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1878 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1879 unsigned i0;
1880 for (i0 = 0; i0 < datarefs.length (); ++i0)
1881 if (DR_BASE_ADDRESS (datarefs[i0]))
1882 break;
1883 for (i = i0 + 1; i <= datarefs.length (); ++i)
1885 if (i == datarefs.length ()
1886 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1887 DR_BASE_ADDRESS (datarefs[i]), 0)
1888 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1889 DR_OFFSET (datarefs[i]), 0)
1890 || !operand_equal_p (DR_STEP (datarefs[i0]),
1891 DR_STEP (datarefs[i]), 0))
1893 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1894 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1895 will get known misalignment if we align one of the refs
1896 with the largest DR_TARGET_ALIGNMENT. */
1897 for (unsigned j = i0; j < i; ++j)
1899 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1900 for (unsigned k = i0; k < i; ++k)
1902 if (k == j)
1903 continue;
1904 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1905 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1906 dr_infoj))
1907 n_same_align_refs[j]++;
1910 i0 = i;
1914 /* While cost model enhancements are expected in the future, the high level
1915 view of the code at this time is as follows:
1917 A) If there is a misaligned access then see if peeling to align
1918 this access can make all data references satisfy
1919 vect_supportable_dr_alignment. If so, update data structures
1920 as needed and return true.
1922 B) If peeling wasn't possible and there is a data reference with an
1923 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1924 then see if loop versioning checks can be used to make all data
1925 references satisfy vect_supportable_dr_alignment. If so, update
1926 data structures as needed and return true.
1928 C) If neither peeling nor versioning were successful then return false if
1929 any data reference does not satisfy vect_supportable_dr_alignment.
1931 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1933 Note, Possibility 3 above (which is peeling and versioning together) is not
1934 being done at this time. */
1936 /* (1) Peeling to force alignment. */
1938 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1939 Considerations:
1940 + How many accesses will become aligned due to the peeling
1941 - How many accesses will become unaligned due to the peeling,
1942 and the cost of misaligned accesses.
1943 - The cost of peeling (the extra runtime checks, the increase
1944 in code size). */
1946 FOR_EACH_VEC_ELT (datarefs, i, dr)
1948 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1949 if (!vect_relevant_for_alignment_p (dr_info))
1950 continue;
1952 stmt_vec_info stmt_info = dr_info->stmt;
1953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1954 do_peeling = vector_alignment_reachable_p (dr_info);
1955 if (do_peeling)
1957 if (known_alignment_for_access_p (dr_info, vectype))
1959 unsigned int npeel_tmp = 0;
1960 bool negative = tree_int_cst_compare (DR_STEP (dr),
1961 size_zero_node) < 0;
1963 /* If known_alignment_for_access_p then we have set
1964 DR_MISALIGNMENT, which is only done if we know it at compile
1965 time, so it is safe to assume the target alignment is constant.
1967 unsigned int target_align =
1968 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1969 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1970 poly_int64 off = 0;
1971 if (negative)
1972 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1973 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1974 mis = negative ? mis : -mis;
1975 if (mis != 0)
1976 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1978 /* For multiple types, it is possible that the bigger type access
1979 will have more than one peeling option. E.g., a loop with two
1980 types: one of size (vector size / 4), and the other one of
1981 size (vector size / 8). The vectorization factor will be 8. If both
1982 accesses are misaligned by 3, the first one needs one scalar
1983 iteration to be aligned, and the second one needs 5. But the
1984 first one will also be aligned by peeling 5 scalar
1985 iterations, and in that case both accesses will be aligned.
1986 Hence, besides the immediate peeling amount, we also want
1987 to try adding a full vector size, as long as we don't exceed the
1988 vectorization factor.
1989 We do this automatically when using the cost model, since we
1990 calculate the cost for every peeling option. */
1991 poly_uint64 nscalars = npeel_tmp;
1992 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1994 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1995 nscalars = (STMT_SLP_TYPE (stmt_info)
1996 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1999 /* Save info about DR in the hash table. Also include peeling
2000 amounts according to the explanation above. Indicate
2001 the alignment status when the ref is not aligned.
2002 ??? Rather than using unknown alignment here we should
2003 prune all entries from the peeling hashtable which cause
2004 DRs to be unsupported. */
2005 bool supportable_if_not_aligned
2006 = vect_supportable_dr_alignment
2007 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2008 while (known_le (npeel_tmp, nscalars))
2010 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2011 dr_info, npeel_tmp,
2012 supportable_if_not_aligned);
2013 npeel_tmp += MAX (1, target_align / dr_size);
2016 one_misalignment_known = true;
2018 else
2020 /* If we don't know any misalignment values, we prefer
2021 peeling for the data-ref that has the maximum number of data-refs
2022 with the same alignment, unless the target prefers to align
2023 stores over loads. */
2024 unsigned same_align_drs = n_same_align_refs[i];
2025 if (!dr0_info
2026 || dr0_same_align_drs < same_align_drs)
2028 dr0_same_align_drs = same_align_drs;
2029 dr0_info = dr_info;
2031 /* For data-refs with the same number of related
2032 accesses prefer the one where the misalign
2033 computation will be invariant in the outermost loop. */
2034 else if (dr0_same_align_drs == same_align_drs)
2036 class loop *ivloop0, *ivloop;
2037 ivloop0 = outermost_invariant_loop_for_expr
2038 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2039 ivloop = outermost_invariant_loop_for_expr
2040 (loop, DR_BASE_ADDRESS (dr));
2041 if ((ivloop && !ivloop0)
2042 || (ivloop && ivloop0
2043 && flow_loop_nested_p (ivloop, ivloop0)))
2044 dr0_info = dr_info;
2047 one_misalignment_unknown = true;
2049 /* Check for data refs with unsupportable alignment that
2050 can be peeled. */
2051 enum dr_alignment_support supportable_dr_alignment
2052 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2053 DR_MISALIGNMENT_UNKNOWN);
2054 if (supportable_dr_alignment == dr_unaligned_unsupported)
2056 one_dr_unsupportable = true;
2057 unsupportable_dr_info = dr_info;
2060 if (!first_store && DR_IS_WRITE (dr))
2062 first_store = dr_info;
2063 first_store_same_align_drs = same_align_drs;
2067 else
2069 if (!aligned_access_p (dr_info, vectype))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "vector alignment may not be reachable\n");
2074 break;
2079 /* Check if we can possibly peel the loop. */
2080 if (!vect_can_advance_ivs_p (loop_vinfo)
2081 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2082 || loop->inner)
2083 do_peeling = false;
2085 struct _vect_peel_extended_info peel_for_known_alignment;
2086 struct _vect_peel_extended_info peel_for_unknown_alignment;
2087 struct _vect_peel_extended_info best_peel;
2089 peel_for_unknown_alignment.inside_cost = INT_MAX;
2090 peel_for_unknown_alignment.outside_cost = INT_MAX;
2091 peel_for_unknown_alignment.peel_info.count = 0;
2093 if (do_peeling
2094 && one_misalignment_unknown)
2096 /* Check if the target requires us to prefer stores over loads, i.e., if
2097 misaligned stores are more expensive than misaligned loads (taking
2098 DRs with the same alignment into account). */
2099 unsigned int load_inside_cost = 0;
2100 unsigned int load_outside_cost = 0;
2101 unsigned int store_inside_cost = 0;
2102 unsigned int store_outside_cost = 0;
2103 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2105 stmt_vector_for_cost dummy;
2106 dummy.create (2);
2107 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2108 &load_inside_cost,
2109 &load_outside_cost,
2110 &dummy, &dummy, estimated_npeels);
2111 dummy.release ();
2113 if (first_store)
2115 dummy.create (2);
2116 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2117 &store_inside_cost,
2118 &store_outside_cost,
2119 &dummy, &dummy,
2120 estimated_npeels);
2121 dummy.release ();
2123 else
2125 store_inside_cost = INT_MAX;
2126 store_outside_cost = INT_MAX;
2129 if (load_inside_cost > store_inside_cost
2130 || (load_inside_cost == store_inside_cost
2131 && load_outside_cost > store_outside_cost))
2133 dr0_info = first_store;
2134 dr0_same_align_drs = first_store_same_align_drs;
2135 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2136 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2138 else
2140 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2141 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2144 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2145 prologue_cost_vec.create (2);
2146 epilogue_cost_vec.create (2);
2148 int dummy2;
2149 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2150 (loop_vinfo, estimated_npeels, &dummy2,
2151 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2152 &prologue_cost_vec, &epilogue_cost_vec);
2154 prologue_cost_vec.release ();
2155 epilogue_cost_vec.release ();
2157 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2160 peel_for_unknown_alignment.peel_info.npeel = 0;
2161 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2163 best_peel = peel_for_unknown_alignment;
2165 peel_for_known_alignment.inside_cost = INT_MAX;
2166 peel_for_known_alignment.outside_cost = INT_MAX;
2167 peel_for_known_alignment.peel_info.count = 0;
2168 peel_for_known_alignment.peel_info.dr_info = NULL;
2170 if (do_peeling && one_misalignment_known)
2172 /* Peeling is possible, but no data access is unsupported when
2173 unaligned. So we try to choose the best possible peeling from
2174 the hash table. */
2175 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2176 (&peeling_htab, loop_vinfo);
2179 /* Compare costs of peeling for known and unknown alignment. */
2180 if (peel_for_known_alignment.peel_info.dr_info != NULL
2181 && peel_for_unknown_alignment.inside_cost
2182 >= peel_for_known_alignment.inside_cost)
2184 best_peel = peel_for_known_alignment;
2186 /* If the best peeling for known alignment has NPEEL == 0, perform no
2187 peeling at all except if there is an unsupportable dr that we can
2188 align. */
2189 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2190 do_peeling = false;
2193 /* If there is an unsupportable data ref, prefer this over all choices so far
2194 since we'd have to discard a chosen peeling except when it accidentally
2195 aligned the unsupportable data ref. */
2196 if (one_dr_unsupportable)
2197 dr0_info = unsupportable_dr_info;
2198 else if (do_peeling)
2200 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2201 TODO: Use nopeel_outside_cost or get rid of it? */
2202 unsigned nopeel_inside_cost = 0;
2203 unsigned nopeel_outside_cost = 0;
2205 stmt_vector_for_cost dummy;
2206 dummy.create (2);
2207 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2208 &nopeel_outside_cost, &dummy, &dummy, 0);
2209 dummy.release ();
2211 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2212 costs will be recorded. */
2213 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2214 prologue_cost_vec.create (2);
2215 epilogue_cost_vec.create (2);
2217 int dummy2;
2218 nopeel_outside_cost += vect_get_known_peeling_cost
2219 (loop_vinfo, 0, &dummy2,
2220 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2221 &prologue_cost_vec, &epilogue_cost_vec);
2223 prologue_cost_vec.release ();
2224 epilogue_cost_vec.release ();
2226 npeel = best_peel.peel_info.npeel;
2227 dr0_info = best_peel.peel_info.dr_info;
2229 /* If not peeling is no more expensive than the best peeling found
2230 so far, don't perform any peeling. */
2231 if (nopeel_inside_cost <= best_peel.inside_cost)
2232 do_peeling = false;
2235 if (do_peeling)
2237 stmt_vec_info stmt_info = dr0_info->stmt;
2238 if (known_alignment_for_access_p (dr0_info,
2239 STMT_VINFO_VECTYPE (stmt_info)))
2241 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2242 size_zero_node) < 0;
2243 if (!npeel)
2245 /* Since it's known at compile time, compute the number of
2246 iterations in the peeled loop (the peeling factor) for use in
2247 updating DR_MISALIGNMENT values. The peeling factor is the
2248 vectorization factor minus the misalignment as an element
2249 count. */
2250 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2251 poly_int64 off = 0;
2252 if (negative)
2253 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2254 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2255 unsigned int mis
2256 = dr_misalignment (dr0_info, vectype, off);
2257 mis = negative ? mis : -mis;
2258 /* If known_alignment_for_access_p then we have set
2259 DR_MISALIGNMENT, which is only done if we know it at compile
2260 time, so it is safe to assume the target alignment is constant.
2262 unsigned int target_align =
2263 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2264 npeel = ((mis & (target_align - 1))
2265 / vect_get_scalar_dr_size (dr0_info));
2268 /* For interleaved data access every iteration accesses all the
2269 members of the group, therefore we divide the number of iterations
2270 by the group size. */
2271 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2272 npeel /= DR_GROUP_SIZE (stmt_info);
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "Try peeling by %d\n", npeel);
2279 /* Ensure that all datarefs can be vectorized after the peel. */
2280 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2281 do_peeling = false;
2283 /* Check if all datarefs are supportable and log. */
2284 if (do_peeling
2285 && npeel == 0
2286 && known_alignment_for_access_p (dr0_info,
2287 STMT_VINFO_VECTYPE (stmt_info)))
2288 return opt_result::success ();
2290 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2291 if (do_peeling)
2293 unsigned max_allowed_peel
2294 = param_vect_max_peeling_for_alignment;
2295 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2296 max_allowed_peel = 0;
2297 if (max_allowed_peel != (unsigned)-1)
2299 unsigned max_peel = npeel;
2300 if (max_peel == 0)
2302 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2303 unsigned HOST_WIDE_INT target_align_c;
2304 if (target_align.is_constant (&target_align_c))
2305 max_peel =
2306 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2307 else
2309 do_peeling = false;
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "Disable peeling, max peels set and vector"
2313 " alignment unknown\n");
2316 if (max_peel > max_allowed_peel)
2318 do_peeling = false;
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "Disable peeling, max peels reached: %d\n", max_peel);
2326 /* Cost model #2 - if peeling may result in a remaining loop not
2327 iterating enough to be vectorized then do not peel. Since this
2328 is a cost heuristic rather than a correctness decision, use the
2329 most likely runtime value for variable vectorization factors. */
2330 if (do_peeling
2331 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2333 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2334 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2335 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2336 < assumed_vf + max_peel)
2337 do_peeling = false;
2340 if (do_peeling)
2342 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2343 If the misalignment of DR_i is identical to that of dr0 then set
2344 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2345 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2346 by the peeling factor times the element size of DR_i (MOD the
2347 vectorization factor times the size). Otherwise, the
2348 misalignment of DR_i must be set to unknown. */
2349 FOR_EACH_VEC_ELT (datarefs, i, dr)
2350 if (dr != dr0_info->dr)
2352 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2353 if (!vect_relevant_for_alignment_p (dr_info))
2354 continue;
2356 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2359 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2360 if (npeel)
2361 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2362 else
2363 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2364 SET_DR_MISALIGNMENT (dr0_info,
2365 vect_dr_misalign_for_aligned_access (dr0_info));
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Alignment of access forced using peeling.\n");
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "Peeling for alignment will be applied.\n");
2374 /* The inside-loop cost will be accounted for in vectorizable_load
2375 and vectorizable_store correctly with adjusted alignments.
2376 Drop the body_cst_vec on the floor here. */
2377 return opt_result::success ();
2381 /* (2) Versioning to force alignment. */
2383 /* Try versioning if:
2384 1) optimize loop for speed and the cost-model is not cheap
2385 2) there is at least one unsupported misaligned data ref with an unknown
2386 misalignment, and
2387 3) all misaligned data refs with a known misalignment are supported, and
2388 4) the number of runtime alignment checks is within reason. */
2390 do_versioning
2391 = (optimize_loop_nest_for_speed_p (loop)
2392 && !loop->inner /* FORNOW */
2393 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2395 if (do_versioning)
2397 FOR_EACH_VEC_ELT (datarefs, i, dr)
2399 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2400 if (!vect_relevant_for_alignment_p (dr_info))
2401 continue;
2403 stmt_vec_info stmt_info = dr_info->stmt;
2404 if (STMT_VINFO_STRIDED_P (stmt_info))
2406 do_versioning = false;
2407 break;
2410 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2411 bool negative = tree_int_cst_compare (DR_STEP (dr),
2412 size_zero_node) < 0;
2413 poly_int64 off = 0;
2414 if (negative)
2415 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2416 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2417 int misalignment;
2418 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2419 continue;
2421 enum dr_alignment_support supportable_dr_alignment
2422 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2423 misalignment);
2424 if (supportable_dr_alignment == dr_unaligned_unsupported)
2426 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2427 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2428 >= (unsigned) param_vect_max_version_for_alignment_checks))
2430 do_versioning = false;
2431 break;
2434 /* At present we don't support versioning for alignment
2435 with variable VF, since there's no guarantee that the
2436 VF is a power of two. We could relax this if we added
2437 a way of enforcing a power-of-two size. */
2438 unsigned HOST_WIDE_INT size;
2439 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2441 do_versioning = false;
2442 break;
2445 /* Forcing alignment in the first iteration is no good if
2446 we don't keep it across iterations. For now, just disable
2447 versioning in this case.
2448 ?? We could actually unroll the loop to achieve the required
2449 overall step alignment, and forcing the alignment could be
2450 done by doing some iterations of the non-vectorized loop. */
2451 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2452 * DR_STEP_ALIGNMENT (dr),
2453 DR_TARGET_ALIGNMENT (dr_info)))
2455 do_versioning = false;
2456 break;
2459 /* The rightmost bits of an aligned address must be zeros.
2460 Construct the mask needed for this test. For example,
2461 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2462 mask must be 15 = 0xf. */
2463 int mask = size - 1;
2465 /* FORNOW: use the same mask to test all potentially unaligned
2466 references in the loop. */
2467 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2468 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2470 do_versioning = false;
2471 break;
2474 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2475 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2479 /* Versioning requires at least one misaligned data reference. */
2480 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2481 do_versioning = false;
2482 else if (!do_versioning)
2483 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2486 if (do_versioning)
2488 const vec<stmt_vec_info> &may_misalign_stmts
2489 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2490 stmt_vec_info stmt_info;
2492 /* It can now be assumed that the data references in the statements
2493 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2494 of the loop being vectorized. */
2495 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2497 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2498 SET_DR_MISALIGNMENT (dr_info,
2499 vect_dr_misalign_for_aligned_access (dr_info));
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE, vect_location,
2502 "Alignment of access forced using versioning.\n");
2505 if (dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE, vect_location,
2507 "Versioning for alignment will be applied.\n");
2509 /* Peeling and versioning can't be done together at this time. */
2510 gcc_assert (! (do_peeling && do_versioning));
2512 return opt_result::success ();
2515 /* This point is reached if neither peeling nor versioning is being done. */
2516 gcc_assert (! (do_peeling || do_versioning));
2518 return opt_result::success ();
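/* When the versioning path is taken above, LOOP_VINFO_PTR_MASK holds
   GET_MODE_SIZE (vector mode) - 1 and the runtime guard is emitted later by
   the loop-versioning code, not here.  Conceptually, assuming 16-byte
   vectors and two potentially misaligned pointers p and q recorded via
   LOOP_VINFO_MAY_MISALIGN_STMTS, that guard behaves like

     if ((((uintptr_t) p | (uintptr_t) q) & 15) == 0)
       ... vectorized loop, all recorded accesses assumed aligned ...
     else
       ... unvectorized fallback loop ...

   where the exact form of the check is up to the versioning code.  */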
2522 /* Function vect_analyze_data_refs_alignment
2524 Analyze the alignment of the data-references in the loop.
2525 Return FALSE if a data reference is found that cannot be vectorized. */
2527 opt_result
2528 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2530 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2532 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2533 struct data_reference *dr;
2534 unsigned int i;
2536 vect_record_base_alignments (loop_vinfo);
2537 FOR_EACH_VEC_ELT (datarefs, i, dr)
2539 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2540 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2542 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2543 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2544 continue;
2545 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2546 STMT_VINFO_VECTYPE (dr_info->stmt));
2550 return opt_result::success ();
2554 /* Analyze alignment of DRs of stmts in NODE. */
2556 static bool
2557 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2559 /* Alignment is maintained in the first element of the group. */
2560 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2561 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2562 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2563 tree vectype = SLP_TREE_VECTYPE (node);
2564 poly_uint64 vector_alignment
2565 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2566 BITS_PER_UNIT);
2567 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2568 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2569 /* Re-analyze alignment when we're facing a vectorization with a bigger
2570 alignment requirement. */
2571 else if (known_lt (dr_info->target_alignment, vector_alignment))
2573 poly_uint64 old_target_alignment = dr_info->target_alignment;
2574 int old_misalignment = dr_info->misalignment;
2575 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2576 /* But keep knowledge about a smaller alignment. */
2577 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2578 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2580 dr_info->target_alignment = old_target_alignment;
2581 dr_info->misalignment = old_misalignment;
2584 /* If we ever face unordered target alignments, the first one wins in terms
2585 of analysis and the other will become unknown in dr_misalignment. */
2586 return true;
2589 /* Function vect_slp_analyze_instance_alignment
2591 Analyze the alignment of the data-references in the SLP instance.
2592 Return FALSE if a data reference is found that cannot be vectorized. */
2594 bool
2595 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2596 slp_instance instance)
2598 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2600 slp_tree node;
2601 unsigned i;
2602 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2603 if (! vect_slp_analyze_node_alignment (vinfo, node))
2604 return false;
2606 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2607 && ! vect_slp_analyze_node_alignment
2608 (vinfo, SLP_INSTANCE_TREE (instance)))
2609 return false;
2611 return true;
2615 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2616 accesses of legal size, step, etc. Detect gaps, single element
2617 interleaving, and other special cases. Set grouped access info.
2618 Collect groups of strided stores for further use in SLP analysis.
2619 Worker for vect_analyze_group_access. */
2621 static bool
2622 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2624 data_reference *dr = dr_info->dr;
2625 tree step = DR_STEP (dr);
2626 tree scalar_type = TREE_TYPE (DR_REF (dr));
2627 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2628 stmt_vec_info stmt_info = dr_info->stmt;
2629 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2630 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2631 HOST_WIDE_INT dr_step = -1;
2632 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2633 bool slp_impossible = false;
2635 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2636 size of the interleaving group (including gaps). */
2637 if (tree_fits_shwi_p (step))
2639 dr_step = tree_to_shwi (step);
2640 /* Check that STEP is a multiple of type size. Otherwise there is
2641 a non-element-sized gap at the end of the group which we
2642 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2643 ??? As we can handle non-constant step fine here we should
2644 simply remove uses of DR_GROUP_GAP between the last and first
2645 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2646 simply not include that gap. */
2647 if ((dr_step % type_size) != 0)
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Step %T is not a multiple of the element size"
2652 " for %T\n",
2653 step, DR_REF (dr));
2654 return false;
2656 groupsize = absu_hwi (dr_step) / type_size;
2658 else
2659 groupsize = 0;
2661 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2662 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2664 /* Check if this DR is part of an interleaving group, and is a single
2665 element of the group that is accessed in the loop. */
2667 /* Gaps are supported only for loads. STEP must be a multiple of the type
2668 size. */
2669 if (DR_IS_READ (dr)
2670 && (dr_step % type_size) == 0
2671 && groupsize > 0
2672 /* This could be UINT_MAX but as we are generating code in a very
2673 inefficient way we have to cap earlier.
2674 See PR91403 for example. */
2675 && groupsize <= 4096)
2677 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2678 DR_GROUP_SIZE (stmt_info) = groupsize;
2679 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "Detected single element interleaving %T"
2683 " step %T\n",
2684 DR_REF (dr), step);
2686 return true;
2689 if (dump_enabled_p ())
2690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2691 "not consecutive access %G", stmt_info->stmt);
2693 if (bb_vinfo)
2695 /* Mark the statement as unvectorizable. */
2696 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2697 return true;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2702 STMT_VINFO_STRIDED_P (stmt_info) = true;
2703 return true;
2706 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2708 /* First stmt in the interleaving chain. Check the chain. */
2709 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2710 struct data_reference *data_ref = dr;
2711 unsigned int count = 1;
2712 tree prev_init = DR_INIT (data_ref);
2713 HOST_WIDE_INT diff, gaps = 0;
2715 /* By construction, all group members have INTEGER_CST DR_INITs. */
2716 while (next)
2718 /* We never have the same DR multiple times. */
2719 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2720 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2722 data_ref = STMT_VINFO_DATA_REF (next);
2724 /* All group members have the same STEP by construction. */
2725 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2727 /* Check that the distance between two accesses is equal to the type
2728 size. Otherwise, we have gaps. */
2729 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2730 - TREE_INT_CST_LOW (prev_init)) / type_size;
2731 if (diff < 1 || diff > UINT_MAX)
2733 /* For artificial testcases with array accesses with large
2734 constant indices we can run into overflow issues which
2735 can end up fooling the groupsize constraint below so
2736 check the individual gaps (which are represented as
2737 unsigned int) as well. */
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2740 "interleaved access with gap larger "
2741 "than representable\n");
2742 return false;
2744 if (diff != 1)
2746 /* FORNOW: SLP of accesses with gaps is not supported. */
2747 slp_impossible = true;
2748 if (DR_IS_WRITE (data_ref))
2750 if (dump_enabled_p ())
2751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2752 "interleaved store with gaps\n");
2753 return false;
2756 gaps += diff - 1;
2759 last_accessed_element += diff;
2761 /* Store the gap from the previous member of the group. If there is no
2762 gap in the access, DR_GROUP_GAP is always 1. */
2763 DR_GROUP_GAP (next) = diff;
2765 prev_init = DR_INIT (data_ref);
2766 next = DR_GROUP_NEXT_ELEMENT (next);
2767 /* Count the number of data-refs in the chain. */
2768 count++;
2771 if (groupsize == 0)
2772 groupsize = count + gaps;
2774 /* This could be UINT_MAX but as we are generating code in a very
2775 inefficient way we have to cap earlier. See PR78699 for example. */
2776 if (groupsize > 4096)
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2780 "group is too large\n");
2781 return false;
2784 /* Check that the size of the interleaving is equal to count for stores,
2785 i.e., that there are no gaps. */
2786 if (groupsize != count
2787 && !DR_IS_READ (dr))
2789 groupsize = count;
2790 STMT_VINFO_STRIDED_P (stmt_info) = true;
2793 /* If there is a gap after the last load in the group it is the
2794 difference between the groupsize and the last accessed
2795 element.
2796 When there is no gap, this difference should be 0. */
2797 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2799 DR_GROUP_SIZE (stmt_info) = groupsize;
2800 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "Detected interleaving ");
2804 if (DR_IS_READ (dr))
2805 dump_printf (MSG_NOTE, "load ");
2806 else if (STMT_VINFO_STRIDED_P (stmt_info))
2807 dump_printf (MSG_NOTE, "strided store ");
2808 else
2809 dump_printf (MSG_NOTE, "store ");
2810 dump_printf (MSG_NOTE, "of size %u\n",
2811 (unsigned)groupsize);
2812 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2813 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2814 while (next)
2816 if (DR_GROUP_GAP (next) != 1)
2817 dump_printf_loc (MSG_NOTE, vect_location,
2818 "\t<gap of %d elements>\n",
2819 DR_GROUP_GAP (next) - 1);
2820 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2821 next = DR_GROUP_NEXT_ELEMENT (next);
2823 if (DR_GROUP_GAP (stmt_info) != 0)
2824 dump_printf_loc (MSG_NOTE, vect_location,
2825 "\t<gap of %d elements>\n",
2826 DR_GROUP_GAP (stmt_info));
2829 /* SLP: create an SLP data structure for every interleaving group of
2830 stores for further analysis in vect_analyze_slp. */
2831 if (DR_IS_WRITE (dr) && !slp_impossible)
2833 if (loop_vinfo)
2834 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2835 if (bb_vinfo)
2836 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2840 return true;
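/* A worked example of the bookkeeping above, for illustration only: a loop
   that only loads p[4*i], p[4*i+1] and p[4*i+2] of a 4-byte type has
   DR_STEP 16, so GROUPSIZE = 16 / 4 = 4.  Walking the chain gives COUNT = 3
   with element distances of 1, hence LAST_ACCESSED_ELEMENT = 3 and the
   first statement gets DR_GROUP_GAP = 4 - 3 = 1, i.e. one unused element at
   the end of each group; because all members are reads this gap is fine.  */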
2843 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2844 accesses of legal size, step, etc. Detect gaps, single element
2845 interleaving, and other special cases. Set grouped access info.
2846 Collect groups of strided stores for further use in SLP analysis. */
2848 static bool
2849 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2851 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2853 /* Dissolve the group if present. */
2854 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2855 while (stmt_info)
2857 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2858 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2859 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2860 stmt_info = next;
2862 return false;
2864 return true;
2867 /* Analyze the access pattern of the data-reference DR_INFO.
2868 In case of non-consecutive accesses call vect_analyze_group_access() to
2869 analyze groups of accesses. */
2871 static bool
2872 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2874 data_reference *dr = dr_info->dr;
2875 tree step = DR_STEP (dr);
2876 tree scalar_type = TREE_TYPE (DR_REF (dr));
2877 stmt_vec_info stmt_info = dr_info->stmt;
2878 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2879 class loop *loop = NULL;
2881 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2882 return true;
2884 if (loop_vinfo)
2885 loop = LOOP_VINFO_LOOP (loop_vinfo);
2887 if (loop_vinfo && !step)
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "bad data-ref access in loop\n");
2892 return false;
2895 /* Allow loads with zero step in inner-loop vectorization. */
2896 if (loop_vinfo && integer_zerop (step))
2898 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2899 if (!nested_in_vect_loop_p (loop, stmt_info))
2900 return DR_IS_READ (dr);
2901 /* Allow references with zero step for outer loops marked
2902 with pragma omp simd only - it guarantees absence of
2903 loop-carried dependencies between inner loop iterations. */
2904 if (loop->safelen < 2)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_NOTE, vect_location,
2908 "zero step in inner loop of nest\n");
2909 return false;
2913 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2915 /* Interleaved accesses are not yet supported within outer-loop
2916 vectorization for references in the inner-loop. */
2917 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2919 /* For the rest of the analysis we use the outer-loop step. */
2920 step = STMT_VINFO_DR_STEP (stmt_info);
2921 if (integer_zerop (step))
2923 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE, vect_location,
2925 "zero step in outer loop.\n");
2926 return DR_IS_READ (dr);
2930 /* Consecutive? */
2931 if (TREE_CODE (step) == INTEGER_CST)
2933 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2934 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2935 || (dr_step < 0
2936 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2938 /* Mark that it is not interleaving. */
2939 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2940 return true;
2944 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2946 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_NOTE, vect_location,
2948 "grouped access in outer loop.\n");
2949 return false;
2953 /* Assume this is a DR handled by the non-constant strided load case. */
2954 if (TREE_CODE (step) != INTEGER_CST)
2955 return (STMT_VINFO_STRIDED_P (stmt_info)
2956 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2957 || vect_analyze_group_access (vinfo, dr_info)));
2959 /* Not a consecutive access - check if it is part of an interleaving group. */
2960 return vect_analyze_group_access (vinfo, dr_info);
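/* For instance (hypothetical references): a load of d[i] with double
   elements has DR_STEP 8 == TYPE_SIZE_UNIT (double), is classified as
   consecutive above and never reaches the group analysis, whereas a load of
   d[2*i] has DR_STEP 16 and is handed to vect_analyze_group_access, which
   can record it as a single-element interleaving group of size 2 with a gap
   of 1.  */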
2963 /* Compare two data-references DRA and DRB to group them into chunks
2964 suitable for grouping. */
2966 static int
2967 dr_group_sort_cmp (const void *dra_, const void *drb_)
2969 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2970 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2971 data_reference_p dra = dra_info->dr;
2972 data_reference_p drb = drb_info->dr;
2973 int cmp;
2975 /* Stabilize sort. */
2976 if (dra == drb)
2977 return 0;
2979 /* DRs with different group IDs never belong to the same group. */
2980 if (dra_info->group != drb_info->group)
2981 return dra_info->group < drb_info->group ? -1 : 1;
2983 /* Ordering of DRs according to base. */
2984 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2985 DR_BASE_ADDRESS (drb));
2986 if (cmp != 0)
2987 return cmp;
2989 /* And according to DR_OFFSET. */
2990 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2991 if (cmp != 0)
2992 return cmp;
2994 /* Put reads before writes. */
2995 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2996 return DR_IS_READ (dra) ? -1 : 1;
2998 /* Then sort after access size. */
2999 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3000 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3001 if (cmp != 0)
3002 return cmp;
3004 /* And after step. */
3005 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3006 if (cmp != 0)
3007 return cmp;
3009 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3010 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3011 if (cmp == 0)
3012 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3013 return cmp;
3016 /* If OP is the result of a conversion, return the unconverted value,
3017 otherwise return null. */
3019 static tree
3020 strip_conversion (tree op)
3022 if (TREE_CODE (op) != SSA_NAME)
3023 return NULL_TREE;
3024 gimple *stmt = SSA_NAME_DEF_STMT (op);
3025 if (!is_gimple_assign (stmt)
3026 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3027 return NULL_TREE;
3028 return gimple_assign_rhs1 (stmt);
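/* For example, given a conversion statement like _2 = (unsigned char) _1,
   strip_conversion (_2) returns _1; for an SSA name whose defining
   statement is not a conversion it returns NULL_TREE.  */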
3031 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3032 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3033 be grouped in SLP mode. */
3035 static bool
3036 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3037 bool allow_slp_p)
3039 if (gimple_assign_single_p (stmt1_info->stmt))
3040 return gimple_assign_single_p (stmt2_info->stmt);
3042 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3043 if (call1 && gimple_call_internal_p (call1))
3045 /* Check for two masked loads or two masked stores. */
3046 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3047 if (!call2 || !gimple_call_internal_p (call2))
3048 return false;
3049 internal_fn ifn = gimple_call_internal_fn (call1);
3050 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3051 return false;
3052 if (ifn != gimple_call_internal_fn (call2))
3053 return false;
3055 /* Check that the masks are the same. Cope with casts of masks,
3056 like those created by build_mask_conversion. */
3057 tree mask1 = gimple_call_arg (call1, 2);
3058 tree mask2 = gimple_call_arg (call2, 2);
3059 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3061 mask1 = strip_conversion (mask1);
3062 if (!mask1)
3063 return false;
3064 mask2 = strip_conversion (mask2);
3065 if (!mask2)
3066 return false;
3067 if (!operand_equal_p (mask1, mask2, 0))
3068 return false;
3070 return true;
3073 return false;
3076 /* Function vect_analyze_data_ref_accesses.
3078 Analyze the access pattern of all the data references in the loop.
3080 FORNOW: the only access pattern that is considered vectorizable is a
3081 simple step 1 (consecutive) access.
3083 FORNOW: handle only arrays and pointer accesses. */
3085 opt_result
3086 vect_analyze_data_ref_accesses (vec_info *vinfo,
3087 vec<int> *dataref_groups)
3089 unsigned int i;
3090 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3092 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3094 if (datarefs.is_empty ())
3095 return opt_result::success ();
3097 /* Sort the array of datarefs to make building the interleaving chains
3098 linear. Don't modify the original vector's order; it is needed for
3099 determining what dependencies are reversed. */
3100 vec<dr_vec_info *> datarefs_copy;
3101 datarefs_copy.create (datarefs.length ());
3102 for (unsigned i = 0; i < datarefs.length (); i++)
3104 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3105 /* If the caller computed DR grouping use that, otherwise group by
3106 basic blocks. */
3107 if (dataref_groups)
3108 dr_info->group = (*dataref_groups)[i];
3109 else
3110 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3111 datarefs_copy.quick_push (dr_info);
3113 datarefs_copy.qsort (dr_group_sort_cmp);
3114 hash_set<stmt_vec_info> to_fixup;
3116 /* Build the interleaving chains. */
3117 for (i = 0; i < datarefs_copy.length () - 1;)
3119 dr_vec_info *dr_info_a = datarefs_copy[i];
3120 data_reference_p dra = dr_info_a->dr;
3121 int dra_group_id = dr_info_a->group;
3122 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3123 stmt_vec_info lastinfo = NULL;
3124 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3125 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3127 ++i;
3128 continue;
3130 for (i = i + 1; i < datarefs_copy.length (); ++i)
3132 dr_vec_info *dr_info_b = datarefs_copy[i];
3133 data_reference_p drb = dr_info_b->dr;
3134 int drb_group_id = dr_info_b->group;
3135 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3136 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3137 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3138 break;
3140 /* ??? Imperfect sorting (non-compatible types, non-modulo
3141 accesses, same accesses) can lead to a group being artificially
3142 split here as we don't just skip over those. If it really
3143 matters we can push those to a worklist and re-iterate
3144 over them. Then we can just skip ahead to the next DR here. */
3146 /* DRs in a different DR group should not be put into the same
3147 interleaving group. */
3148 if (dra_group_id != drb_group_id)
3149 break;
3151 /* Check that the data-refs have the same first location (except init)
3152 and that they are both either stores or loads (not one load and one
3153 store, and not a mix of masked and unmasked accesses). */
3154 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3155 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3156 DR_BASE_ADDRESS (drb)) != 0
3157 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3158 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3159 break;
3161 /* Check that the data-refs have the same constant size. */
3162 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3163 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3164 if (!tree_fits_uhwi_p (sza)
3165 || !tree_fits_uhwi_p (szb)
3166 || !tree_int_cst_equal (sza, szb))
3167 break;
3169 /* Check that the data-refs have the same step. */
3170 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3171 break;
3173 /* Check the types are compatible.
3174 ??? We don't distinguish this during sorting. */
3175 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3176 TREE_TYPE (DR_REF (drb))))
3177 break;
3179 /* Check that the DR_INITs are compile-time constants. */
3180 if (!tree_fits_shwi_p (DR_INIT (dra))
3181 || !tree_fits_shwi_p (DR_INIT (drb)))
3182 break;
3184 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3185 just hold extra information. */
3186 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3187 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3188 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3189 break;
3191 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3192 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3193 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3194 HOST_WIDE_INT init_prev
3195 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3196 gcc_assert (init_a <= init_b
3197 && init_a <= init_prev
3198 && init_prev <= init_b);
3200 /* Do not place the same access in the interleaving chain twice. */
3201 if (init_b == init_prev)
3203 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3204 < gimple_uid (DR_STMT (drb)));
3205 /* Simply link in duplicates and fix up the chain below. */
3207 else
3209 /* If init_b == init_a + the size of the type * k, we have an
3210 interleaving, and DRA is accessed before DRB. */
3211 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3212 if (type_size_a == 0
3213 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3214 % type_size_a != 0))
3215 break;
3217 /* If we have a store, the accesses are adjacent. This splits
3218 groups into chunks we support (we don't support vectorization
3219 of stores with gaps). */
3220 if (!DR_IS_READ (dra)
3221 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3222 != type_size_a))
3223 break;
3225 /* If the step (if it is constant and nonzero) is smaller than the
3226 difference between data-refs' inits this splits groups into
3227 suitable sizes. */
3228 if (tree_fits_shwi_p (DR_STEP (dra)))
3230 unsigned HOST_WIDE_INT step
3231 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3232 if (step != 0
3233 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3234 break;
3238 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 DR_IS_READ (dra)
3241 ? "Detected interleaving load %T and %T\n"
3242 : "Detected interleaving store %T and %T\n",
3243 DR_REF (dra), DR_REF (drb));
3245 /* Link the found element into the group list. */
3246 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3248 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3249 lastinfo = stmtinfo_a;
3251 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3252 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3253 lastinfo = stmtinfo_b;
3255 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3256 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3258 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3259 dump_printf_loc (MSG_NOTE, vect_location,
3260 "Load suitable for SLP vectorization only.\n");
3262 if (init_b == init_prev
3263 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3264 && dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "Queuing group with duplicate access for fixup\n");
3270 /* Fixup groups with duplicate entries by splitting it. */
3271 while (1)
3273 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3274 if (!(it != to_fixup.end ()))
3275 break;
3276 stmt_vec_info grp = *it;
3277 to_fixup.remove (grp);
3279 /* Find the earliest duplicate group member. */
3280 unsigned first_duplicate = -1u;
3281 stmt_vec_info next, g = grp;
3282 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3284 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3285 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3286 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3287 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3288 g = next;
3290 if (first_duplicate == -1U)
3291 continue;
3293 /* Then move all stmts after the first duplicate to a new group.
3294 Note this is a heuristic but one with the property that *it
3295 is fixed up completely. */
3296 g = grp;
3297 stmt_vec_info newgroup = NULL, ng = grp;
3298 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3300 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3302 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3303 if (!newgroup)
3304 newgroup = next;
3305 else
3306 DR_GROUP_NEXT_ELEMENT (ng) = next;
3307 ng = next;
3308 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3310 else
3311 g = DR_GROUP_NEXT_ELEMENT (g);
3313 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3315 /* Fixup the new group which still may contain duplicates. */
3316 to_fixup.add (newgroup);
3319 dr_vec_info *dr_info;
3320 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3322 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3323 && !vect_analyze_data_ref_access (vinfo, dr_info))
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3327 "not vectorized: complicated access pattern.\n");
3329 if (is_a <bb_vec_info> (vinfo))
3331 /* Mark the statement as not vectorizable. */
3332 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3333 continue;
3335 else
3337 datarefs_copy.release ();
3338 return opt_result::failure_at (dr_info->stmt->stmt,
3339 "not vectorized:"
3340 " complicated access pattern.\n");
3345 datarefs_copy.release ();
3346 return opt_result::success ();
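/* An illustration of the chain building above, with made-up accesses: two
   stores a[2*i] = x and a[2*i+1] = y of a 4-byte type share
   DR_BASE_ADDRESS, DR_OFFSET and DR_STEP (8 bytes) and have DR_INITs 0 and
   4.  The init difference is a multiple of the element size and, the
   accesses being stores, they are adjacent, so the second store is linked
   after the first and DR_GROUP_FIRST_ELEMENT of both becomes the first
   store; vect_analyze_data_ref_access later records a group of size 2.  */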
3349 /* Function vect_vfa_segment_size.
3351 Input:
3352 DR_INFO: The data reference.
3353 LENGTH_FACTOR: segment length to consider.
3355 Return a value suitable for the dr_with_seg_len::seg_len field.
3356 This is the "distance travelled" by the pointer from the first
3357 iteration in the segment to the last. Note that it does not include
3358 the size of the access; in effect it only describes the first byte. */
3360 static tree
3361 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3363 length_factor = size_binop (MINUS_EXPR,
3364 fold_convert (sizetype, length_factor),
3365 size_one_node);
3366 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3367 length_factor);
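/* E.g. with a DR_STEP of 8 bytes and a LENGTH_FACTOR of 4 iterations this
   returns 8 * (4 - 1) = 24: the pointer travels 24 bytes from the first to
   the last iteration of the segment, and the bytes touched by the accesses
   themselves are added separately via vect_vfa_access_size.  */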
3370 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3371 gives the worst-case number of bytes covered by the segment. */
3373 static unsigned HOST_WIDE_INT
3374 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3376 stmt_vec_info stmt_vinfo = dr_info->stmt;
3377 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3378 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3379 unsigned HOST_WIDE_INT access_size = ref_size;
3380 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3382 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3383 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3385 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3386 int misalignment;
3387 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3388 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3389 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3390 == dr_explicit_realign_optimized))
3392 /* We might access a full vector's worth. */
3393 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3395 return access_size;
3398 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3399 describes. */
3401 static unsigned int
3402 vect_vfa_align (dr_vec_info *dr_info)
3404 return dr_alignment (dr_info->dr);
3407 /* Function vect_compile_time_alias.
3409 Given data references A and B with equal base and offset, see whether
3410 the alias relation can be decided at compilation time. Return 1 if
3411 it can and the references alias, 0 if it can and the references do
3412 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3413 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3414 of dr_with_seg_len::{seg_len,access_size} for A and B. */
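/* For example, with positive 4-byte steps, segment lengths of 12, access
   sizes of 4 and DR_INITs of 0 and 8, the covered ranges are [0, 16) and
   [8, 24), which are known to overlap, so 1 is returned; with DR_INITs of
   0 and 32 the ranges cannot overlap and 0 is returned.  */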
3416 static int
3417 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3418 tree segment_length_a, tree segment_length_b,
3419 unsigned HOST_WIDE_INT access_size_a,
3420 unsigned HOST_WIDE_INT access_size_b)
3422 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3423 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3424 poly_uint64 const_length_a;
3425 poly_uint64 const_length_b;
3427 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3428 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3429 [a, a+12) */
3430 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3432 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3433 offset_a -= const_length_a;
3435 else
3436 const_length_a = tree_to_poly_uint64 (segment_length_a);
3437 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3439 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3440 offset_b -= const_length_b;
3442 else
3443 const_length_b = tree_to_poly_uint64 (segment_length_b);
3445 const_length_a += access_size_a;
3446 const_length_b += access_size_b;
3448 if (ranges_known_overlap_p (offset_a, const_length_a,
3449 offset_b, const_length_b))
3450 return 1;
3452 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3453 offset_b, const_length_b))
3454 return 0;
3456 return -1;
3459 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3460 in DDR is >= VF. */
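/* For example, a single distance vector with distance 8 at LOOP_DEPTH and
   a VF of at most 8 means the dependence cannot be violated by
   vectorization, so true is returned; with a larger VF the pair is
   typically left to a runtime alias check instead.  */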
3462 static bool
3463 dependence_distance_ge_vf (data_dependence_relation *ddr,
3464 unsigned int loop_depth, poly_uint64 vf)
3466 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3467 || DDR_NUM_DIST_VECTS (ddr) == 0)
3468 return false;
3470 /* If the dependence is exact, we should have limited the VF instead. */
3471 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3473 unsigned int i;
3474 lambda_vector dist_v;
3475 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3477 HOST_WIDE_INT dist = dist_v[loop_depth];
3478 if (dist != 0
3479 && !(dist > 0 && DDR_REVERSED_P (ddr))
3480 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3481 return false;
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE, vect_location,
3486 "dependence distance between %T and %T is >= VF\n",
3487 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3489 return true;
3492 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3494 static void
3495 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3497 dump_printf (dump_kind, "%s (%T) >= ",
3498 lower_bound.unsigned_p ? "unsigned" : "abs",
3499 lower_bound.expr);
3500 dump_dec (dump_kind, lower_bound.min_value);
3503 /* Record that the vectorized loop requires the vec_lower_bound described
3504 by EXPR, UNSIGNED_P and MIN_VALUE. */
3506 static void
3507 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3508 poly_uint64 min_value)
3510 vec<vec_lower_bound> &lower_bounds
3511 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3512 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3513 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3515 unsigned_p &= lower_bounds[i].unsigned_p;
3516 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3517 if (lower_bounds[i].unsigned_p != unsigned_p
3518 || maybe_lt (lower_bounds[i].min_value, min_value))
3520 lower_bounds[i].unsigned_p = unsigned_p;
3521 lower_bounds[i].min_value = min_value;
3522 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "updating run-time check to ");
3526 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3527 dump_printf (MSG_NOTE, "\n");
3530 return;
3533 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3534 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3537 dump_lower_bound (MSG_NOTE, lower_bound);
3538 dump_printf (MSG_NOTE, "\n");
3540 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3543 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3544 will span fewer than GAP bytes. */
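/* For example, with an estimated VF of 4, a group size of 2 and 4-byte
   scalar accesses, the vectorized step is expected to cover up to
   4 * 2 * 4 = 32 bytes, so a GAP of 24 counts as small whereas a GAP of
   64 does not.  */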
3546 static bool
3547 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3548 poly_int64 gap)
3550 stmt_vec_info stmt_info = dr_info->stmt;
3551 HOST_WIDE_INT count
3552 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3553 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3554 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3555 return (estimated_poly_value (gap)
3556 <= count * vect_get_scalar_dr_size (dr_info));
3559 /* Return true if we know that there is no alias between DR_INFO_A and
3560 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3561 When returning true, set *LOWER_BOUND_OUT to this N. */
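/* For example, two 4-byte accesses with the same base, offset and step and
   DR_INITs of 0 and 8 give *LOWER_BOUND_OUT = 8 + 4 - 0 = 12, i.e. the
   references cannot alias as long as abs (DR_STEP) >= 12.  */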
3563 static bool
3564 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3565 poly_uint64 *lower_bound_out)
3567 /* Check that there is a constant gap of known sign between DR_A
3568 and DR_B. */
3569 data_reference *dr_a = dr_info_a->dr;
3570 data_reference *dr_b = dr_info_b->dr;
3571 poly_int64 init_a, init_b;
3572 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3573 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3574 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3575 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3576 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3577 || !ordered_p (init_a, init_b))
3578 return false;
3580 /* Sort DR_A and DR_B by the address they access. */
3581 if (maybe_lt (init_b, init_a))
3583 std::swap (init_a, init_b);
3584 std::swap (dr_info_a, dr_info_b);
3585 std::swap (dr_a, dr_b);
3588 /* If the two accesses could be dependent within a scalar iteration,
3589 make sure that we'd retain their order. */
3590 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3591 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3592 return false;
3594 /* There is no alias if abs (DR_STEP) is greater than or equal to
3595 the bytes spanned by the combination of the two accesses. */
3596 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3597 return true;
3600 /* Function vect_prune_runtime_alias_test_list.
3602 Prune a list of ddrs to be tested at run-time by versioning for alias.
3603 Merge several alias checks into one if possible.
3604 Return FALSE if the resulting list of ddrs is longer than allowed by
3605 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
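/* For example, two references with identical constant steps get segment
   lengths of DR_STEP * (VF - 1) each, references with different steps fall
   back to the scalar iteration count as the length factor, and if any
   check survives pruning under the very-cheap cost model, vectorization is
   abandoned.  */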
3607 opt_result
3608 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3610 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3611 hash_set <tree_pair_hash> compared_objects;
3613 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3614 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3615 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3616 const vec<vec_object_pair> &check_unequal_addrs
3617 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3618 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3619 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3621 ddr_p ddr;
3622 unsigned int i;
3623 tree length_factor;
3625 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3627 /* Step values are irrelevant for aliasing if the number of vector
3628 iterations is equal to the number of scalar iterations (which can
3629 happen for fully-SLP loops). */
3630 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3632 if (!vf_one_p)
3634 /* Convert the checks for nonzero steps into bound tests. */
3635 tree value;
3636 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3637 vect_check_lower_bound (loop_vinfo, value, true, 1);
3640 if (may_alias_ddrs.is_empty ())
3641 return opt_result::success ();
3643 comp_alias_ddrs.create (may_alias_ddrs.length ());
3645 unsigned int loop_depth
3646 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3647 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3649 /* First, we collect all data ref pairs for aliasing checks. */
3650 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3652 poly_uint64 lower_bound;
3653 tree segment_length_a, segment_length_b;
3654 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3655 unsigned int align_a, align_b;
3657 /* Ignore the alias if the VF we chose ended up being no greater
3658 than the dependence distance. */
3659 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3660 continue;
3662 if (DDR_OBJECT_A (ddr))
3664 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3665 if (!compared_objects.add (new_pair))
3667 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_NOTE, vect_location,
3669 "checking that %T and %T"
3670 " have different addresses\n",
3671 new_pair.first, new_pair.second);
3672 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3674 continue;
3677 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3678 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3680 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3681 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3683 bool preserves_scalar_order_p
3684 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3685 bool ignore_step_p
3686 = (vf_one_p
3687 && (preserves_scalar_order_p
3688 || operand_equal_p (DR_STEP (dr_info_a->dr),
3689 DR_STEP (dr_info_b->dr))));
3691 /* Skip the pair if inter-iteration dependencies are irrelevant
3692 and intra-iteration dependencies are guaranteed to be honored. */
3693 if (ignore_step_p
3694 && (preserves_scalar_order_p
3695 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3696 &lower_bound)))
3698 if (dump_enabled_p ())
3699 dump_printf_loc (MSG_NOTE, vect_location,
3700 "no need for alias check between "
3701 "%T and %T when VF is 1\n",
3702 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3703 continue;
3706 /* See whether we can handle the alias using a bounds check on
3707 the step, and whether that's likely to be the best approach.
3708 (It might not be, for example, if the minimum step is much larger
3709 than the number of bytes handled by one vector iteration.) */
3710 if (!ignore_step_p
3711 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3712 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3713 &lower_bound)
3714 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3715 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3717 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3718 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3721 "%T and %T when the step %T is outside ",
3722 DR_REF (dr_info_a->dr),
3723 DR_REF (dr_info_b->dr),
3724 DR_STEP (dr_info_a->dr));
3725 if (unsigned_p)
3726 dump_printf (MSG_NOTE, "[0");
3727 else
3729 dump_printf (MSG_NOTE, "(");
3730 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3732 dump_printf (MSG_NOTE, ", ");
3733 dump_dec (MSG_NOTE, lower_bound);
3734 dump_printf (MSG_NOTE, ")\n");
3736 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3737 unsigned_p, lower_bound);
3738 continue;
3741 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3742 if (dr_group_first_a)
3744 stmt_info_a = dr_group_first_a;
3745 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3748 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3749 if (dr_group_first_b)
3751 stmt_info_b = dr_group_first_b;
3752 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3755 if (ignore_step_p)
3757 segment_length_a = size_zero_node;
3758 segment_length_b = size_zero_node;
3760 else
3762 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3763 DR_STEP (dr_info_b->dr), 0))
3764 length_factor = scalar_loop_iters;
3765 else
3766 length_factor = size_int (vect_factor);
3767 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3768 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3770 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3771 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3772 align_a = vect_vfa_align (dr_info_a);
3773 align_b = vect_vfa_align (dr_info_b);
3775 /* See whether the alias is known at compilation time. */
3776 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3777 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3778 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3779 DR_OFFSET (dr_info_b->dr), 0)
3780 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3781 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3782 && poly_int_tree_p (segment_length_a)
3783 && poly_int_tree_p (segment_length_b))
3785 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3786 segment_length_a,
3787 segment_length_b,
3788 access_size_a,
3789 access_size_b);
3790 if (res >= 0 && dump_enabled_p ())
3792 dump_printf_loc (MSG_NOTE, vect_location,
3793 "can tell at compile time that %T and %T",
3794 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3795 if (res == 0)
3796 dump_printf (MSG_NOTE, " do not alias\n");
3797 else
3798 dump_printf (MSG_NOTE, " alias\n");
3801 if (res == 0)
3802 continue;
3804 if (res == 1)
3805 return opt_result::failure_at (stmt_info_b->stmt,
3806 "not vectorized:"
3807 " compilation time alias: %G%G",
3808 stmt_info_a->stmt,
3809 stmt_info_b->stmt);
3812 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3813 access_size_a, align_a);
3814 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3815 access_size_b, align_b);
3816 /* Canonicalize the order to be the one that's needed for accurate
3817 RAW, WAR and WAW flags, in cases where the data references are
3818 well-ordered. The order doesn't really matter otherwise,
3819 but we might as well be consistent. */
3820 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3821 std::swap (dr_a, dr_b);
3823 dr_with_seg_len_pair_t dr_with_seg_len_pair
3824 (dr_a, dr_b, (preserves_scalar_order_p
3825 ? dr_with_seg_len_pair_t::WELL_ORDERED
3826 : dr_with_seg_len_pair_t::REORDERED));
3828 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3831 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3833 unsigned int count = (comp_alias_ddrs.length ()
3834 + check_unequal_addrs.length ());
3836 if (count
3837 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3838 == VECT_COST_MODEL_VERY_CHEAP))
3839 return opt_result::failure_at
3840 (vect_location, "would need a runtime alias check\n");
3842 if (dump_enabled_p ())
3843 dump_printf_loc (MSG_NOTE, vect_location,
3844 "improved number of alias checks from %d to %d\n",
3845 may_alias_ddrs.length (), count);
3846 unsigned limit = param_vect_max_version_for_alias_checks;
3847 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3848 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3849 if (count > limit)
3850 return opt_result::failure_at
3851 (vect_location,
3852 "number of versioning for alias run-time tests exceeds %d "
3853 "(--param vect-max-version-for-alias-checks)\n", limit);
3855 return opt_result::success ();
3858 /* Check whether we can use an internal function for a gather load
3859 or scatter store. READ_P is true for loads and false for stores.
3860 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3861 the type of the memory elements being loaded or stored. OFFSET_TYPE
3862 is the type of the offset that is being applied to the invariant
3863 base address. SCALE is the amount by which the offset should
3864 be multiplied *after* it has been converted to address width.
3866 Return true if the function is supported, storing the function id in
3867 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
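/* For example, if OFFSET_TYPE is a 32-bit integer but the target only
   supports gathers with 64-bit offsets, the retry loop below doubles the
   offset precision and tries again, giving up once the precision has
   reached both POINTER_SIZE and the element width.  */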
3869 bool
3870 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3871 tree vectype, tree memory_type, tree offset_type,
3872 int scale, internal_fn *ifn_out,
3873 tree *offset_vectype_out)
3875 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3876 unsigned int element_bits = vector_element_bits (vectype);
3877 if (element_bits != memory_bits)
3878 /* For now the vector elements must be the same width as the
3879 memory elements. */
3880 return false;
3882 /* Work out which function we need. */
3883 internal_fn ifn, alt_ifn, alt_ifn2;
3884 if (read_p)
3886 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3887 alt_ifn = IFN_MASK_GATHER_LOAD;
3888 /* When the target supports MASK_LEN_GATHER_LOAD, we always
3889 use MASK_LEN_GATHER_LOAD regardless of whether the len and
3890 mask operands are valid or not. */
3891 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
3893 else
3895 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3896 alt_ifn = IFN_MASK_SCATTER_STORE;
3897 /* When the target supports MASK_LEN_SCATTER_STORE, we always
3898 use MASK_LEN_SCATTER_STORE regardless of whether the len and
3899 mask operands are valid or not. */
3900 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
3903 for (;;)
3905 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3906 if (!offset_vectype)
3907 return false;
3909 /* Test whether the target supports this combination. */
3910 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3911 offset_vectype, scale))
3913 *ifn_out = ifn;
3914 *offset_vectype_out = offset_vectype;
3915 return true;
3917 else if (!masked_p
3918 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3919 memory_type,
3920 offset_vectype,
3921 scale))
3923 *ifn_out = alt_ifn;
3924 *offset_vectype_out = offset_vectype;
3925 return true;
3927 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
3928 memory_type,
3929 offset_vectype, scale))
3931 *ifn_out = alt_ifn2;
3932 *offset_vectype_out = offset_vectype;
3933 return true;
3936 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3937 && TYPE_PRECISION (offset_type) >= element_bits)
3938 return false;
3940 offset_type = build_nonstandard_integer_type
3941 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3945 /* STMT_INFO is a call to an internal gather load or scatter store function.
3946 Describe the operation in INFO. */
3948 static void
3949 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3950 gather_scatter_info *info)
3952 gcall *call = as_a <gcall *> (stmt_info->stmt);
3953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3954 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3956 info->ifn = gimple_call_internal_fn (call);
3957 info->decl = NULL_TREE;
3958 info->base = gimple_call_arg (call, 0);
3959 info->offset = gimple_call_arg (call, 1);
3960 info->offset_dt = vect_unknown_def_type;
3961 info->offset_vectype = NULL_TREE;
3962 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3963 info->element_type = TREE_TYPE (vectype);
3964 info->memory_type = TREE_TYPE (DR_REF (dr));
3967 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3968 gather load or scatter store. Describe the operation in *INFO if so. */
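/* For example, a read such as a[b[i]] with 4-byte elements typically
   yields BASE = &a, OFFSET = the SSA name holding b[i] and SCALE = 4,
   assuming the target supports that offset type and scale.  */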
3970 bool
3971 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3972 gather_scatter_info *info)
3974 HOST_WIDE_INT scale = 1;
3975 poly_int64 pbitpos, pbitsize;
3976 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3977 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3978 tree offtype = NULL_TREE;
3979 tree decl = NULL_TREE, base, off;
3980 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3981 tree memory_type = TREE_TYPE (DR_REF (dr));
3982 machine_mode pmode;
3983 int punsignedp, reversep, pvolatilep = 0;
3984 internal_fn ifn;
3985 tree offset_vectype;
3986 bool masked_p = false;
3988 /* See whether this is already a call to a gather/scatter internal function.
3989 If not, see whether it's a masked load or store. */
3990 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3991 if (call && gimple_call_internal_p (call))
3993 ifn = gimple_call_internal_fn (call);
3994 if (internal_gather_scatter_fn_p (ifn))
3996 vect_describe_gather_scatter_call (stmt_info, info);
3997 return true;
3999 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4002 /* True if we should aim to use internal functions rather than
4003 built-in functions. */
4004 bool use_ifn_p = (DR_IS_READ (dr)
4005 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4006 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4008 base = DR_REF (dr);
4009 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4010 see if we can use the def stmt of the address. */
4011 if (masked_p
4012 && TREE_CODE (base) == MEM_REF
4013 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4014 && integer_zerop (TREE_OPERAND (base, 1))
4015 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4017 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4018 if (is_gimple_assign (def_stmt)
4019 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4020 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4023 /* The gather and scatter builtins need an address of the form
4024 loop_invariant + vector * {1, 2, 4, 8}
4025 or
4026 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4027 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4028 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4029 multiplications and additions in it. To get a vector, we need
4030 a single SSA_NAME that will be defined in the loop and will
4031 contain everything that is not loop invariant and that can be
4032 vectorized. The following code attempts to find such a preexisting
4033 SSA_NAME OFF and put the loop invariants into a tree BASE
4034 that can be gimplified before the loop. */
4035 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4036 &punsignedp, &reversep, &pvolatilep);
4037 if (reversep)
4038 return false;
4040 /* PR 107346. Packed structs can have fields at offsets that are not
4041 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4042 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4043 return false;
4045 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4047 if (TREE_CODE (base) == MEM_REF)
4049 if (!integer_zerop (TREE_OPERAND (base, 1)))
4051 if (off == NULL_TREE)
4052 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4053 else
4054 off = size_binop (PLUS_EXPR, off,
4055 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4057 base = TREE_OPERAND (base, 0);
4059 else
4060 base = build_fold_addr_expr (base);
4062 if (off == NULL_TREE)
4063 off = size_zero_node;
4065 /* If BASE is not loop invariant, then either OFF is 0 and we start with
4066 just the constant offset in the loop-invariant BASE and continue with
4067 BASE as OFF, or we give up.
4068 We could handle the latter case by gimplifying the addition of base + off
4069 into some SSA_NAME and using that as off, but for now punt. */
4070 if (!expr_invariant_in_loop_p (loop, base))
4072 if (!integer_zerop (off))
4073 return false;
4074 off = base;
4075 base = size_int (pbytepos);
4077 /* Otherwise put base + constant offset into the loop invariant BASE
4078 and continue with OFF. */
4079 else
4081 base = fold_convert (sizetype, base);
4082 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4085 /* OFF at this point may be either a SSA_NAME or some tree expression
4086 from get_inner_reference. Try to peel off loop invariants from it
4087 into BASE as long as possible. */
4088 STRIP_NOPS (off);
4089 while (offtype == NULL_TREE)
4091 enum tree_code code;
4092 tree op0, op1, add = NULL_TREE;
4094 if (TREE_CODE (off) == SSA_NAME)
4096 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4098 if (expr_invariant_in_loop_p (loop, off))
4099 return false;
4101 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4102 break;
4104 op0 = gimple_assign_rhs1 (def_stmt);
4105 code = gimple_assign_rhs_code (def_stmt);
4106 op1 = gimple_assign_rhs2 (def_stmt);
4108 else
4110 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4111 return false;
4112 code = TREE_CODE (off);
4113 extract_ops_from_tree (off, &code, &op0, &op1);
4115 switch (code)
4117 case POINTER_PLUS_EXPR:
4118 case PLUS_EXPR:
4119 if (expr_invariant_in_loop_p (loop, op0))
4121 add = op0;
4122 off = op1;
4123 do_add:
4124 add = fold_convert (sizetype, add);
4125 if (scale != 1)
4126 add = size_binop (MULT_EXPR, add, size_int (scale));
4127 base = size_binop (PLUS_EXPR, base, add);
4128 continue;
4130 if (expr_invariant_in_loop_p (loop, op1))
4132 add = op1;
4133 off = op0;
4134 goto do_add;
4136 break;
4137 case MINUS_EXPR:
4138 if (expr_invariant_in_loop_p (loop, op1))
4140 add = fold_convert (sizetype, op1);
4141 add = size_binop (MINUS_EXPR, size_zero_node, add);
4142 off = op0;
4143 goto do_add;
4145 break;
4146 case MULT_EXPR:
4147 if (scale == 1 && tree_fits_shwi_p (op1))
4149 int new_scale = tree_to_shwi (op1);
4150 /* Only treat this as a scaling operation if the target
4151 supports it for at least some offset type. */
4152 if (use_ifn_p
4153 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4154 masked_p, vectype, memory_type,
4155 signed_char_type_node,
4156 new_scale, &ifn,
4157 &offset_vectype)
4158 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4159 masked_p, vectype, memory_type,
4160 unsigned_char_type_node,
4161 new_scale, &ifn,
4162 &offset_vectype))
4163 break;
4164 scale = new_scale;
4165 off = op0;
4166 continue;
4168 break;
4169 case SSA_NAME:
4170 off = op0;
4171 continue;
4172 CASE_CONVERT:
4173 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4174 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4175 break;
4177 /* Don't include the conversion if the target is happy with
4178 the current offset type. */
4179 if (use_ifn_p
4180 && TREE_CODE (off) == SSA_NAME
4181 && !POINTER_TYPE_P (TREE_TYPE (off))
4182 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4183 masked_p, vectype, memory_type,
4184 TREE_TYPE (off), scale, &ifn,
4185 &offset_vectype))
4186 break;
4188 if (TYPE_PRECISION (TREE_TYPE (op0))
4189 == TYPE_PRECISION (TREE_TYPE (off)))
4191 off = op0;
4192 continue;
4195 /* Include the conversion if it is widening and either we're using
4196 the IFN path, the target can handle the converted-from
4197 offset type, or the current size is not already the same as the
4198 data vector element size. */
4199 if ((TYPE_PRECISION (TREE_TYPE (op0))
4200 < TYPE_PRECISION (TREE_TYPE (off)))
4201 && (use_ifn_p
4202 || (DR_IS_READ (dr)
4203 ? (targetm.vectorize.builtin_gather
4204 && targetm.vectorize.builtin_gather (vectype,
4205 TREE_TYPE (op0),
4206 scale))
4207 : (targetm.vectorize.builtin_scatter
4208 && targetm.vectorize.builtin_scatter (vectype,
4209 TREE_TYPE (op0),
4210 scale)))
4211 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4212 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4214 off = op0;
4215 offtype = TREE_TYPE (off);
4216 STRIP_NOPS (off);
4217 continue;
4219 break;
4220 default:
4221 break;
4223 break;
4226 /* If at the end OFF still isn't a SSA_NAME or isn't
4227 defined in the loop, punt. */
4228 if (TREE_CODE (off) != SSA_NAME
4229 || expr_invariant_in_loop_p (loop, off))
4230 return false;
4232 if (offtype == NULL_TREE)
4233 offtype = TREE_TYPE (off);
4235 if (use_ifn_p)
4237 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4238 vectype, memory_type, offtype, scale,
4239 &ifn, &offset_vectype))
4240 ifn = IFN_LAST;
4241 decl = NULL_TREE;
4243 else
4245 if (DR_IS_READ (dr))
4247 if (targetm.vectorize.builtin_gather)
4248 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4250 else
4252 if (targetm.vectorize.builtin_scatter)
4253 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4255 ifn = IFN_LAST;
4256 /* The offset vector type will be read from DECL when needed. */
4257 offset_vectype = NULL_TREE;
4260 info->ifn = ifn;
4261 info->decl = decl;
4262 info->base = base;
4263 info->offset = off;
4264 info->offset_dt = vect_unknown_def_type;
4265 info->offset_vectype = offset_vectype;
4266 info->scale = scale;
4267 info->element_type = TREE_TYPE (vectype);
4268 info->memory_type = memory_type;
4269 return true;
4272 /* Find the data references in STMT, analyze them with respect to LOOP and
4273 append them to DATAREFS. Return false if datarefs in this stmt cannot
4274 be handled. */
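/* For example, in an OpenMP simd loop an access to a privatized array
   indexed via GOMP_SIMD_LANE may fail normal data-ref analysis; the code
   below rebuilds such a reference with a step equal to the element size
   and marks it as a simd-lane access instead of rejecting the
   statement.  */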
4276 opt_result
4277 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4278 vec<data_reference_p> *datarefs,
4279 vec<int> *dataref_groups, int group_id)
4281 /* We can ignore clobbers for dataref analysis - they are removed during
4282 loop vectorization and BB vectorization checks dependences with a
4283 stmt walk. */
4284 if (gimple_clobber_p (stmt))
4285 return opt_result::success ();
4287 if (gimple_has_volatile_ops (stmt))
4288 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4289 stmt);
4291 if (stmt_can_throw_internal (cfun, stmt))
4292 return opt_result::failure_at (stmt,
4293 "not vectorized:"
4294 " statement can throw an exception: %G",
4295 stmt);
4297 auto_vec<data_reference_p, 2> refs;
4298 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4299 if (!res)
4300 return res;
4302 if (refs.is_empty ())
4303 return opt_result::success ();
4305 if (refs.length () > 1)
4307 while (!refs.is_empty ())
4308 free_data_ref (refs.pop ());
4309 return opt_result::failure_at (stmt,
4310 "not vectorized: more than one "
4311 "data ref in stmt: %G", stmt);
4314 data_reference_p dr = refs.pop ();
4315 if (gcall *call = dyn_cast <gcall *> (stmt))
4316 if (!gimple_call_internal_p (call)
4317 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4318 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4320 free_data_ref (dr);
4321 return opt_result::failure_at (stmt,
4322 "not vectorized: dr in a call %G", stmt);
4325 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4326 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4328 free_data_ref (dr);
4329 return opt_result::failure_at (stmt,
4330 "not vectorized:"
4331 " statement is an unsupported"
4332 " bitfield access %G", stmt);
4335 if (DR_BASE_ADDRESS (dr)
4336 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4338 free_data_ref (dr);
4339 return opt_result::failure_at (stmt,
4340 "not vectorized:"
4341 " base addr of dr is a constant\n");
4344 /* Check whether this may be a SIMD lane access and adjust the
4345 DR to make it easier for us to handle it. */
4346 if (loop
4347 && loop->simduid
4348 && (!DR_BASE_ADDRESS (dr)
4349 || !DR_OFFSET (dr)
4350 || !DR_INIT (dr)
4351 || !DR_STEP (dr)))
4353 struct data_reference *newdr
4354 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4355 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4356 if (DR_BASE_ADDRESS (newdr)
4357 && DR_OFFSET (newdr)
4358 && DR_INIT (newdr)
4359 && DR_STEP (newdr)
4360 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4361 && integer_zerop (DR_STEP (newdr)))
4363 tree base_address = DR_BASE_ADDRESS (newdr);
4364 tree off = DR_OFFSET (newdr);
4365 tree step = ssize_int (1);
4366 if (integer_zerop (off)
4367 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4369 off = TREE_OPERAND (base_address, 1);
4370 base_address = TREE_OPERAND (base_address, 0);
4372 STRIP_NOPS (off);
4373 if (TREE_CODE (off) == MULT_EXPR
4374 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4376 step = TREE_OPERAND (off, 1);
4377 off = TREE_OPERAND (off, 0);
4378 STRIP_NOPS (off);
4380 if (CONVERT_EXPR_P (off)
4381 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4382 < TYPE_PRECISION (TREE_TYPE (off))))
4383 off = TREE_OPERAND (off, 0);
4384 if (TREE_CODE (off) == SSA_NAME)
4386 gimple *def = SSA_NAME_DEF_STMT (off);
4387 /* Look through widening conversion. */
4388 if (is_gimple_assign (def)
4389 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4391 tree rhs1 = gimple_assign_rhs1 (def);
4392 if (TREE_CODE (rhs1) == SSA_NAME
4393 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4394 && (TYPE_PRECISION (TREE_TYPE (off))
4395 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4396 def = SSA_NAME_DEF_STMT (rhs1);
4398 if (is_gimple_call (def)
4399 && gimple_call_internal_p (def)
4400 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4402 tree arg = gimple_call_arg (def, 0);
4403 tree reft = TREE_TYPE (DR_REF (newdr));
4404 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4405 arg = SSA_NAME_VAR (arg);
4406 if (arg == loop->simduid
4407 /* For now. */
4408 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4410 DR_BASE_ADDRESS (newdr) = base_address;
4411 DR_OFFSET (newdr) = ssize_int (0);
4412 DR_STEP (newdr) = step;
4413 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4414 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4415 /* Mark as simd-lane access. */
4416 tree arg2 = gimple_call_arg (def, 1);
4417 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4418 free_data_ref (dr);
4419 datarefs->safe_push (newdr);
4420 if (dataref_groups)
4421 dataref_groups->safe_push (group_id);
4422 return opt_result::success ();
4427 free_data_ref (newdr);
4430 datarefs->safe_push (dr);
4431 if (dataref_groups)
4432 dataref_groups->safe_push (group_id);
4433 return opt_result::success ();
4436 /* Function vect_analyze_data_refs.
4438 Find all the data references in the loop or basic block.
4440 The general structure of the analysis of data refs in the vectorizer is as
4441 follows:
4442 1- vect_analyze_data_refs(loop/bb): call
4443 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4444 in the loop/bb and their dependences.
4445 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4446 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4447 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4451 opt_result
4452 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4454 class loop *loop = NULL;
4455 unsigned int i;
4456 struct data_reference *dr;
4457 tree scalar_type;
4459 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4461 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4462 loop = LOOP_VINFO_LOOP (loop_vinfo);
4464 /* Go through the data-refs, check that the analysis succeeded. Update
4465 pointer from stmt_vec_info struct to DR and vectype. */
4467 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4468 FOR_EACH_VEC_ELT (datarefs, i, dr)
4470 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4471 poly_uint64 vf;
4473 gcc_assert (DR_REF (dr));
4474 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4475 gcc_assert (!stmt_info->dr_aux.dr);
4476 stmt_info->dr_aux.dr = dr;
4477 stmt_info->dr_aux.stmt = stmt_info;
4479 /* Check that analysis of the data-ref succeeded. */
4480 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4481 || !DR_STEP (dr))
4483 bool maybe_gather
4484 = DR_IS_READ (dr)
4485 && !TREE_THIS_VOLATILE (DR_REF (dr));
4486 bool maybe_scatter
4487 = DR_IS_WRITE (dr)
4488 && !TREE_THIS_VOLATILE (DR_REF (dr));
4490 /* If the target supports vector gather loads or scatter stores,
4491 see whether they can be used. */
4492 if (is_a <loop_vec_info> (vinfo)
4493 && !nested_in_vect_loop_p (loop, stmt_info))
4495 if (maybe_gather || maybe_scatter)
4497 if (maybe_gather)
4498 gatherscatter = GATHER;
4499 else
4500 gatherscatter = SCATTER;
4504 if (gatherscatter == SG_NONE)
4506 if (dump_enabled_p ())
4507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4508 "not vectorized: data ref analysis "
4509 "failed %G", stmt_info->stmt);
4510 if (is_a <bb_vec_info> (vinfo))
4512 /* In BB vectorization the ref can still participate
4513 in dependence analysis, we just can't vectorize it. */
4514 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4515 continue;
4517 return opt_result::failure_at (stmt_info->stmt,
4518 "not vectorized:"
4519 " data ref analysis failed: %G",
4520 stmt_info->stmt);
4524 /* See if this was detected as SIMD lane access. */
4525 if (dr->aux == (void *)-1
4526 || dr->aux == (void *)-2
4527 || dr->aux == (void *)-3
4528 || dr->aux == (void *)-4)
4530 if (nested_in_vect_loop_p (loop, stmt_info))
4531 return opt_result::failure_at (stmt_info->stmt,
4532 "not vectorized:"
4533 " data ref analysis failed: %G",
4534 stmt_info->stmt);
4535 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4536 = -(uintptr_t) dr->aux;
4539 tree base = get_base_address (DR_REF (dr));
4540 if (base && VAR_P (base) && DECL_NONALIASED (base))
4542 if (dump_enabled_p ())
4543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4544 "not vectorized: base object not addressable "
4545 "for stmt: %G", stmt_info->stmt);
4546 if (is_a <bb_vec_info> (vinfo))
4548 /* In BB vectorization the ref can still participate
4549 in dependence analysis, we just can't vectorize it. */
4550 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4551 continue;
4553 return opt_result::failure_at (stmt_info->stmt,
4554 "not vectorized: base object not"
4555 " addressable for stmt: %G",
4556 stmt_info->stmt);
4559 if (is_a <loop_vec_info> (vinfo)
4560 && DR_STEP (dr)
4561 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4563 if (nested_in_vect_loop_p (loop, stmt_info))
4564 return opt_result::failure_at (stmt_info->stmt,
4565 "not vectorized: "
4566 "not suitable for strided load %G",
4567 stmt_info->stmt);
4568 STMT_VINFO_STRIDED_P (stmt_info) = true;
4571 /* Update DR field in stmt_vec_info struct. */
4573 /* If the dataref is in an inner-loop of the loop that is considered
4574 for vectorization, we also want to analyze the access relative to
4575 the outer-loop (DR contains information only relative to the
4576 inner-most enclosing loop). We do that by building a reference to the
4577 first location accessed by the inner-loop, and analyze it relative to
4578 the outer-loop. */
4579 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4581 /* Build a reference to the first location accessed by the
4582 inner loop: *(BASE + INIT + OFFSET). By construction,
4583 this address must be invariant in the inner loop, so we
4584 can consider it as being used in the outer loop. */
4585 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4586 tree offset = unshare_expr (DR_OFFSET (dr));
4587 tree init = unshare_expr (DR_INIT (dr));
4588 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4589 init, offset);
4590 tree init_addr = fold_build_pointer_plus (base, init_offset);
4591 tree init_ref = build_fold_indirect_ref (init_addr);
4593 if (dump_enabled_p ())
4594 dump_printf_loc (MSG_NOTE, vect_location,
4595 "analyze in outer loop: %T\n", init_ref);
4597 opt_result res
4598 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4599 init_ref, loop, stmt_info->stmt);
4600 if (!res)
4601 /* dr_analyze_innermost already explained the failure. */
4602 return res;
4604 if (dump_enabled_p ())
4605 dump_printf_loc (MSG_NOTE, vect_location,
4606 "\touter base_address: %T\n"
4607 "\touter offset from base address: %T\n"
4608 "\touter constant offset from base address: %T\n"
4609 "\touter step: %T\n"
4610 "\touter base alignment: %d\n\n"
4611 "\touter base misalignment: %d\n"
4612 "\touter offset alignment: %d\n"
4613 "\touter step alignment: %d\n",
4614 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4615 STMT_VINFO_DR_OFFSET (stmt_info),
4616 STMT_VINFO_DR_INIT (stmt_info),
4617 STMT_VINFO_DR_STEP (stmt_info),
4618 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4619 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4620 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4621 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4624 /* Set vectype for STMT. */
4625 scalar_type = TREE_TYPE (DR_REF (dr));
4626 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4627 if (!vectype)
4629 if (dump_enabled_p ())
4631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4632 "not vectorized: no vectype for stmt: %G",
4633 stmt_info->stmt);
4634 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4635 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4636 scalar_type);
4637 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4640 if (is_a <bb_vec_info> (vinfo))
4642 /* No vector type is fine, the ref can still participate
4643 in dependence analysis, we just can't vectorize it. */
4644 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4645 continue;
4647 if (fatal)
4648 *fatal = false;
4649 return opt_result::failure_at (stmt_info->stmt,
4650 "not vectorized:"
4651 " no vectype for stmt: %G"
4652 " scalar_type: %T\n",
4653 stmt_info->stmt, scalar_type);
4655 else
4657 if (dump_enabled_p ())
4658 dump_printf_loc (MSG_NOTE, vect_location,
4659 "got vectype for stmt: %G%T\n",
4660 stmt_info->stmt, vectype);
4663 /* Adjust the minimal vectorization factor according to the
4664 vector type. */
4665 vf = TYPE_VECTOR_SUBPARTS (vectype);
4666 *min_vf = upper_bound (*min_vf, vf);
4668 /* Leave the BB vectorizer to pick the vector type later, based on
4669 the final dataref group size and SLP node size. */
4670 if (is_a <loop_vec_info> (vinfo))
4671 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4673 if (gatherscatter != SG_NONE)
4675 gather_scatter_info gs_info;
4676 if (!vect_check_gather_scatter (stmt_info,
4677 as_a <loop_vec_info> (vinfo),
4678 &gs_info)
4679 || !get_vectype_for_scalar_type (vinfo,
4680 TREE_TYPE (gs_info.offset)))
4682 if (fatal)
4683 *fatal = false;
4684 return opt_result::failure_at
4685 (stmt_info->stmt,
4686 (gatherscatter == GATHER)
4687 ? "not vectorized: not suitable for gather load %G"
4688 : "not vectorized: not suitable for scatter store %G",
4689 stmt_info->stmt);
4691 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4695 /* We used to stop processing and prune the list here. Verify we no
4696 longer need to. */
4697 gcc_assert (i == datarefs.length ());
4699 return opt_result::success ();
4703 /* Function vect_get_new_vect_var.
4705 Returns a name for a new variable. The current naming scheme prepends a
4706 prefix such as "vect_", "vectp_", "stmp_" or "mask_" (depending on the
4707 value of VAR_KIND) to NAME if NAME is provided, and uses the bare prefix
4708 otherwise. */
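/* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   creates a temporary named along the lines of "vectp_a", while passing a
   NULL name gives just the bare "vectp" prefix.  */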
4710 tree
4711 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4713 const char *prefix;
4714 tree new_vect_var;
4716 switch (var_kind)
4718 case vect_simple_var:
4719 prefix = "vect";
4720 break;
4721 case vect_scalar_var:
4722 prefix = "stmp";
4723 break;
4724 case vect_mask_var:
4725 prefix = "mask";
4726 break;
4727 case vect_pointer_var:
4728 prefix = "vectp";
4729 break;
4730 default:
4731 gcc_unreachable ();
4734 if (name)
4736 char* tmp = concat (prefix, "_", name, NULL);
4737 new_vect_var = create_tmp_reg (type, tmp);
4738 free (tmp);
4740 else
4741 new_vect_var = create_tmp_reg (type, prefix);
4743 return new_vect_var;
4746 /* Like vect_get_new_vect_var but return an SSA name. */
4748 tree
4749 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4751 const char *prefix;
4752 tree new_vect_var;
4754 switch (var_kind)
4756 case vect_simple_var:
4757 prefix = "vect";
4758 break;
4759 case vect_scalar_var:
4760 prefix = "stmp";
4761 break;
4762 case vect_pointer_var:
4763 prefix = "vectp";
4764 break;
4765 default:
4766 gcc_unreachable ();
4769 if (name)
4771 char* tmp = concat (prefix, "_", name, NULL);
4772 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4773 free (tmp);
4775 else
4776 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4778 return new_vect_var;
4781 /* Duplicate points-to info on NAME from DR_INFO. */
4783 static void
4784 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4786 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4787 /* DR_PTR_INFO is for a base SSA name, not including constant or
4788 variable offsets in the ref so its alignment info does not apply. */
4789 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4792 /* Function vect_create_addr_base_for_vector_ref.
4794 Create an expression that computes the address of the first memory location
4795 that will be accessed for a data reference.
4797 Input:
4798 STMT_INFO: The statement containing the data reference.
4799 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4800 OFFSET: Optional. If supplied, it is added to the initial address.
4801 LOOP: Specify relative to which loop-nest should the address be computed.
4802 For example, when the dataref is in an inner-loop nested in an
4803 outer-loop that is now being vectorized, LOOP can be either the
4804 outer-loop, or the inner-loop. The first memory location accessed
4805 by the following dataref ('in' points to short):
4807 for (i=0; i<N; i++)
4808 for (j=0; j<M; j++)
4809 s += in[i+j]
4811 is as follows:
4812 if LOOP=i_loop: &in (relative to i_loop)
4813 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4815 Output:
4816 1. Return an SSA_NAME whose value is the address of the memory location of
4817 the first vector of the data reference.
4818 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4819 these statement(s) which define the returned SSA_NAME.
4821 FORNOW: We are only handling array accesses with step 1. */
4823 tree
4824 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4825 gimple_seq *new_stmt_list,
4826 tree offset)
4828 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4829 struct data_reference *dr = dr_info->dr;
4830 const char *base_name;
4831 tree addr_base;
4832 tree dest;
4833 gimple_seq seq = NULL;
4834 tree vect_ptr_type;
4835 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4836 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4838 tree data_ref_base = unshare_expr (drb->base_address);
4839 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4840 tree init = unshare_expr (drb->init);
4842 if (loop_vinfo)
4843 base_name = get_name (data_ref_base);
4844 else
4846 base_offset = ssize_int (0);
4847 init = ssize_int (0);
4848 base_name = get_name (DR_REF (dr));
4851 /* Create base_offset */
4852 base_offset = size_binop (PLUS_EXPR,
4853 fold_convert (sizetype, base_offset),
4854 fold_convert (sizetype, init));
4856 if (offset)
4858 offset = fold_convert (sizetype, offset);
4859 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4860 base_offset, offset);
4863 /* base + base_offset */
4864 if (loop_vinfo)
4865 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4866 else
4867 addr_base = build1 (ADDR_EXPR,
4868 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4869 /* Strip zero offset components since we don't need
4870 them and they can confuse late diagnostics if
4871 we CSE them wrongly. See PR106904 for example. */
4872 unshare_expr (strip_zero_offset_components
4873 (DR_REF (dr))));
4875 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4876 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4877 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4878 gimple_seq_add_seq (new_stmt_list, seq);
4880 if (DR_PTR_INFO (dr)
4881 && TREE_CODE (addr_base) == SSA_NAME
4882 /* We should only duplicate pointer info to newly created SSA names. */
4883 && SSA_NAME_VAR (addr_base) == dest)
4885 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4886 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4889 if (dump_enabled_p ())
4890 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4892 return addr_base;
4896 /* Function vect_create_data_ref_ptr.
4898 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4899 location accessed in the loop by STMT_INFO, along with the def-use update
4900 chain to appropriately advance the pointer through the loop iterations.
4901 Also set aliasing information for the pointer. This pointer is used by
4902 the callers to this function to create a memory reference expression for
4903 vector load/store access.
4905 Input:
4906 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4907 GIMPLE_ASSIGN <name, data-ref> or
4908 GIMPLE_ASSIGN <data-ref, name>.
4909 2. AGGR_TYPE: the type of the reference, which should be either a vector
4910 or an array.
4911 3. AT_LOOP: the loop where the vector memref is to be created.
4912 4. OFFSET (optional): a byte offset to be added to the initial address
4913 accessed by the data-ref in STMT_INFO.
4914 5. BSI: location where the new stmts are to be placed if there is no loop
4915 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4916 pointing to the initial address.
4917 7. IV_STEP (optional, defaults to NULL): the amount that should be added
4918 to the IV during each iteration of the loop. NULL says to move
4919 by one copy of AGGR_TYPE up or down, depending on the step of the
4920 data reference.
4922 Output:
4923 1. Declare a new ptr to vector_type, and have it point to the base of the
4924 data reference (the initial address accessed by the data reference).
4925 For example, for vector of type V8HI, the following code is generated:
4927 v8hi *ap;
4928 ap = (v8hi *)initial_address;
4930 if OFFSET is not supplied:
4931 initial_address = &a[init];
4932 if OFFSET is supplied:
4933 initial_address = &a[init] + OFFSET;
4937 Return the initial_address in INITIAL_ADDRESS.
4939 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4940 update the pointer in each iteration of the loop.
4942 Return the increment stmt that updates the pointer in PTR_INCR.
4944 3. Return the pointer. */
4946 tree
4947 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4948 tree aggr_type, class loop *at_loop, tree offset,
4949 tree *initial_address, gimple_stmt_iterator *gsi,
4950 gimple **ptr_incr, bool only_init,
4951 tree iv_step)
4953 const char *base_name;
4954 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4955 class loop *loop = NULL;
4956 bool nested_in_vect_loop = false;
4957 class loop *containing_loop = NULL;
4958 tree aggr_ptr_type;
4959 tree aggr_ptr;
4960 tree new_temp;
4961 gimple_seq new_stmt_list = NULL;
4962 edge pe = NULL;
4963 basic_block new_bb;
4964 tree aggr_ptr_init;
4965 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4966 struct data_reference *dr = dr_info->dr;
4967 tree aptr;
4968 gimple_stmt_iterator incr_gsi;
4969 bool insert_after;
4970 tree indx_before_incr, indx_after_incr;
4971 gimple *incr;
4972 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4974 gcc_assert (iv_step != NULL_TREE
4975 || TREE_CODE (aggr_type) == ARRAY_TYPE
4976 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4978 if (loop_vinfo)
4980 loop = LOOP_VINFO_LOOP (loop_vinfo);
4981 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4982 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4983 pe = loop_preheader_edge (loop);
4985 else
4987 gcc_assert (bb_vinfo);
4988 only_init = true;
4989 *ptr_incr = NULL;
4992 /* Create an expression for the first address accessed by this load
4993 in LOOP. */
4994 base_name = get_name (DR_BASE_ADDRESS (dr));
4996 if (dump_enabled_p ())
4998 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4999 dump_printf_loc (MSG_NOTE, vect_location,
5000 "create %s-pointer variable to type: %T",
5001 get_tree_code_name (TREE_CODE (aggr_type)),
5002 aggr_type);
5003 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5004 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5005 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5006 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5007 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5008 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5009 else
5010 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5011 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5014 /* (1) Create the new aggregate-pointer variable.
5015 Vector and array types inherit the alias set of their component
5016 type by default so we need to use a ref-all pointer if the data
5017 reference does not conflict with the created aggregated data
5018 reference because it is not addressable. */
5019 bool need_ref_all = false;
5020 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5021 get_alias_set (DR_REF (dr))))
5022 need_ref_all = true;
5023 /* Likewise for any of the data references in the stmt group. */
5024 else if (DR_GROUP_SIZE (stmt_info) > 1)
5026 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5029 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5030 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5031 get_alias_set (DR_REF (sdr))))
5033 need_ref_all = true;
5034 break;
5036 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5038 while (sinfo);
5040 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5041 need_ref_all);
5042 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5045 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5046 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5047 def-use update cycles for the pointer: one relative to the outer-loop
5048 (LOOP), which is what steps (3) and (4) below do. The other is relative
5049 to the inner-loop (which is the inner-most loop containing the dataref),
5050 and this is done by step (5) below.
5052 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5053 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5054 redundant. Steps (3),(4) create the following:
5056 vp0 = &base_addr;
5057 LOOP: vp1 = phi(vp0,vp2)
5060 vp2 = vp1 + step
5061 goto LOOP
5063 If there is an inner-loop nested in loop, then step (5) will also be
5064 applied, and an additional update in the inner-loop will be created:
5066 vp0 = &base_addr;
5067 LOOP: vp1 = phi(vp0,vp2)
5069 inner: vp3 = phi(vp1,vp4)
5070 vp4 = vp3 + inner_step
5071 if () goto inner
5073 vp2 = vp1 + step
5074 if () goto LOOP */
5076 /* (2) Calculate the initial address of the aggregate-pointer, and set
5077 the aggregate-pointer to point to it before the loop. */
5079 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5081 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5082 stmt_info, &new_stmt_list,
5083 offset);
5084 if (new_stmt_list)
5086 if (pe)
5088 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5089 gcc_assert (!new_bb);
5091 else
5092 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5095 *initial_address = new_temp;
5096 aggr_ptr_init = new_temp;
5098 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5099 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5100 inner-loop nested in LOOP (during outer-loop vectorization). */
5102 /* No update in loop is required. */
5103 if (only_init && (!loop_vinfo || at_loop == loop))
5104 aptr = aggr_ptr_init;
5105 else
5107 /* Accesses to invariant addresses should be handled specially
5108 by the caller. */
5109 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5110 gcc_assert (!integer_zerop (step));
5112 if (iv_step == NULL_TREE)
5114 /* The step of the aggregate pointer is the type size,
5115 negated for downward accesses. */
5116 iv_step = TYPE_SIZE_UNIT (aggr_type);
5117 if (tree_int_cst_sgn (step) == -1)
5118 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5121 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5123 create_iv (aggr_ptr_init, PLUS_EXPR,
5124 fold_convert (aggr_ptr_type, iv_step),
5125 aggr_ptr, loop, &incr_gsi, insert_after,
5126 &indx_before_incr, &indx_after_incr);
5127 incr = gsi_stmt (incr_gsi);
5129 /* Copy the points-to information if it exists. */
5130 if (DR_PTR_INFO (dr))
5132 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5133 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5135 if (ptr_incr)
5136 *ptr_incr = incr;
5138 aptr = indx_before_incr;
5141 if (!nested_in_vect_loop || only_init)
5142 return aptr;
5145 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5146 nested in LOOP, if it exists. */
5148 gcc_assert (nested_in_vect_loop);
5149 if (!only_init)
5151 standard_iv_increment_position (containing_loop, &incr_gsi,
5152 &insert_after);
5153 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5154 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5155 &indx_before_incr, &indx_after_incr);
5156 incr = gsi_stmt (incr_gsi);
5158 /* Copy the points-to information if it exists. */
5159 if (DR_PTR_INFO (dr))
5161 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5162 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5164 if (ptr_incr)
5165 *ptr_incr = incr;
5167 return indx_before_incr;
5169 else
5170 gcc_unreachable ();
5174 /* Function bump_vector_ptr
5176 Increment a pointer (to a vector type) by vector-size. If requested,
5177 i.e. if PTR-INCR is given, then also connect the new increment stmt
5178 to the existing def-use update-chain of the pointer, by modifying
5179 the PTR_INCR as illustrated below:
5181 The pointer def-use update-chain before this function:
5182 DATAREF_PTR = phi (p_0, p_2)
5183 ....
5184 PTR_INCR: p_2 = DATAREF_PTR + step
5186 The pointer def-use update-chain after this function:
5187 DATAREF_PTR = phi (p_0, p_2)
5188 ....
5189 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5190 ....
5191 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5193 Input:
5194 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5195 in the loop.
5196 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5197 the loop. The increment amount across iterations is expected
5198 to be vector_size.
5199 BSI - location where the new update stmt is to be placed.
5200 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5201 BUMP - optional. The offset by which to bump the pointer. If not given,
5202 the offset is assumed to be vector_size.
5204 Output: Return NEW_DATAREF_PTR as illustrated above.
5208 tree
5209 bump_vector_ptr (vec_info *vinfo,
5210 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5211 stmt_vec_info stmt_info, tree bump)
5213 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5214 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5215 tree update = TYPE_SIZE_UNIT (vectype);
5216 gimple *incr_stmt;
5217 ssa_op_iter iter;
5218 use_operand_p use_p;
5219 tree new_dataref_ptr;
5221 if (bump)
5222 update = bump;
5224 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5225 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5226 else if (is_gimple_min_invariant (dataref_ptr))
5227 /* When possible avoid emitting a separate increment stmt that would
5228 force the addressed object to become addressable. */
5229 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5230 fold_build2 (MEM_REF,
5231 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5232 dataref_ptr,
5233 fold_convert (ptr_type_node, update)));
5234 else
5235 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5236 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5237 dataref_ptr, update);
5238 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5239 /* Fold the increment, avoiding excessive use-def chains that would
5240 lead to compile-time issues for passes run before the next forwprop
5241 pass, which would otherwise do this folding itself. */
5242 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5243 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5245 incr_stmt = gsi_stmt (fold_gsi);
5246 update_stmt (incr_stmt);
5249 /* Copy the points-to information if it exists. */
5250 if (DR_PTR_INFO (dr))
5252 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5253 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5256 if (!ptr_incr)
5257 return new_dataref_ptr;
5259 /* Update the vector-pointer's cross-iteration increment. */
5260 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5262 tree use = USE_FROM_PTR (use_p);
5264 if (use == dataref_ptr)
5265 SET_USE (use_p, new_dataref_ptr);
5266 else
5267 gcc_assert (operand_equal_p (use, update, 0));
5270 return new_dataref_ptr;
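/* Usage sketch (an illustration added for clarity, not a statement about
   any particular caller): for a V4SF vector type the default BUMP is
   TYPE_SIZE_UNIT (vectype), i.e. 16 bytes, so

     new_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
                                stmt_info, NULL_TREE);

   emits "new_ptr = dataref_ptr + 16" at GSI and, if PTR_INCR was
   "p_2 = dataref_ptr + step", rewires it to "p_2 = new_ptr + step" so the
   cross-iteration increment still covers the full step.  */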
5274 /* Copy memory reference info such as base/clique from the SRC reference
5275 to the DEST MEM_REF. */
5277 void
5278 vect_copy_ref_info (tree dest, tree src)
5280 if (TREE_CODE (dest) != MEM_REF)
5281 return;
5283 tree src_base = src;
5284 while (handled_component_p (src_base))
5285 src_base = TREE_OPERAND (src_base, 0);
5286 if (TREE_CODE (src_base) != MEM_REF
5287 && TREE_CODE (src_base) != TARGET_MEM_REF)
5288 return;
5290 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5291 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5295 /* Function vect_create_destination_var.
5297 Create a new temporary of type VECTYPE. */
5299 tree
5300 vect_create_destination_var (tree scalar_dest, tree vectype)
5302 tree vec_dest;
5303 const char *name;
5304 char *new_name;
5305 tree type;
5306 enum vect_var_kind kind;
5308 kind = vectype
5309 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5310 ? vect_mask_var
5311 : vect_simple_var
5312 : vect_scalar_var;
5313 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5315 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5317 name = get_name (scalar_dest);
5318 if (name)
5319 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5320 else
5321 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5322 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5323 free (new_name);
5325 return vec_dest;
5328 /* Function vect_grouped_store_supported.
5330 Returns TRUE if interleave high and interleave low permutations
5331 are supported, and FALSE otherwise. */
5333 bool
5334 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5336 machine_mode mode = TYPE_MODE (vectype);
5338 /* vect_permute_store_chain requires the group size to be equal to 3 or
5339 be a power of two. */
5340 if (count != 3 && exact_log2 (count) == -1)
5342 if (dump_enabled_p ())
5343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5344 "the size of the group of accesses"
5345 " is not a power of 2 or not eqaul to 3\n");
5346 return false;
5349 /* Check that the permutation is supported. */
5350 if (VECTOR_MODE_P (mode))
5352 unsigned int i;
5353 if (count == 3)
5355 unsigned int j0 = 0, j1 = 0, j2 = 0;
5356 unsigned int i, j;
5358 unsigned int nelt;
5359 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5361 if (dump_enabled_p ())
5362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5363 "cannot handle groups of 3 stores for"
5364 " variable-length vectors\n");
5365 return false;
5368 vec_perm_builder sel (nelt, nelt, 1);
5369 sel.quick_grow (nelt);
5370 vec_perm_indices indices;
5371 for (j = 0; j < 3; j++)
5373 int nelt0 = ((3 - j) * nelt) % 3;
5374 int nelt1 = ((3 - j) * nelt + 1) % 3;
5375 int nelt2 = ((3 - j) * nelt + 2) % 3;
5376 for (i = 0; i < nelt; i++)
5378 if (3 * i + nelt0 < nelt)
5379 sel[3 * i + nelt0] = j0++;
5380 if (3 * i + nelt1 < nelt)
5381 sel[3 * i + nelt1] = nelt + j1++;
5382 if (3 * i + nelt2 < nelt)
5383 sel[3 * i + nelt2] = 0;
5385 indices.new_vector (sel, 2, nelt);
5386 if (!can_vec_perm_const_p (mode, mode, indices))
5388 if (dump_enabled_p ())
5389 dump_printf (MSG_MISSED_OPTIMIZATION,
5390 "permutation op not supported by target.\n");
5391 return false;
5394 for (i = 0; i < nelt; i++)
5396 if (3 * i + nelt0 < nelt)
5397 sel[3 * i + nelt0] = 3 * i + nelt0;
5398 if (3 * i + nelt1 < nelt)
5399 sel[3 * i + nelt1] = 3 * i + nelt1;
5400 if (3 * i + nelt2 < nelt)
5401 sel[3 * i + nelt2] = nelt + j2++;
5403 indices.new_vector (sel, 2, nelt);
5404 if (!can_vec_perm_const_p (mode, mode, indices))
5406 if (dump_enabled_p ())
5407 dump_printf (MSG_MISSED_OPTIMIZATION,
5408 "permutation op not supported by target.\n");
5409 return false;
5412 return true;
5414 else
5416 /* If length is not equal to 3 then only power of 2 is supported. */
5417 gcc_assert (pow2p_hwi (count));
5418 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5420 /* The encoding has 2 interleaved stepped patterns. */
5421 if (!multiple_p (nelt, 2))
5422 return false;
5423 vec_perm_builder sel (nelt, 2, 3);
5424 sel.quick_grow (6);
5425 for (i = 0; i < 3; i++)
5427 sel[i * 2] = i;
5428 sel[i * 2 + 1] = i + nelt;
5430 vec_perm_indices indices (sel, 2, nelt);
5431 if (can_vec_perm_const_p (mode, mode, indices))
5433 for (i = 0; i < 6; i++)
5434 sel[i] += exact_div (nelt, 2);
5435 indices.new_vector (sel, 2, nelt);
5436 if (can_vec_perm_const_p (mode, mode, indices))
5437 return true;
5442 if (dump_enabled_p ())
5443 dump_printf (MSG_MISSED_OPTIMIZATION,
5444 "permutation op not supported by target.\n");
5445 return false;
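/* Illustration (hedged, assuming a vector mode with 4 elements): the two
   constant permutations checked on the power-of-two path above expand to

     { 0, 4, 1, 5 }   and   { 2, 6, 3, 7 }

   i.e. the interleave-high and interleave-low masks that
   vect_permute_store_chain generates for such a mode.  */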
5448 /* Return FN if vec_{,mask_,mask_len_}store_lanes is available for COUNT vectors
5449 of type VECTYPE. MASKED_P says whether the masked form is needed. */
5451 internal_fn
5452 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5453 bool masked_p)
5455 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5456 vec_mask_len_store_lanes_optab, vectype,
5457 count))
5458 return IFN_MASK_LEN_STORE_LANES;
5459 else if (masked_p)
5461 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5462 vec_mask_store_lanes_optab, vectype,
5463 count))
5464 return IFN_MASK_STORE_LANES;
5466 else
5468 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5469 vec_store_lanes_optab, vectype, count))
5470 return IFN_STORE_LANES;
5472 return IFN_LAST;
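/* Usage sketch (illustrative only; GROUP_SIZE and LANES_IFN are hypothetical
   names, actual callers may differ):

     internal_fn lanes_ifn
       = vect_store_lanes_supported (vectype, group_size, masked_p);
     if (lanes_ifn != IFN_LAST)
       ... emit the group as a single store-lanes using LANES_IFN ...

   The len+mask form is preferred whenever the target provides it.  */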
5476 /* Function vect_permute_store_chain.
5478 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5479 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5480 the data correctly for the stores. Return the final references for stores
5481 in RESULT_CHAIN.
5483 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5484 The input is 4 vectors each containing 8 elements. We assign a number to
5485 each element, the input sequence is:
5487 1st vec: 0 1 2 3 4 5 6 7
5488 2nd vec: 8 9 10 11 12 13 14 15
5489 3rd vec: 16 17 18 19 20 21 22 23
5490 4th vec: 24 25 26 27 28 29 30 31
5492 The output sequence should be:
5494 1st vec: 0 8 16 24 1 9 17 25
5495 2nd vec: 2 10 18 26 3 11 19 27
5496 3rd vec: 4 12 20 28 5 13 21 29
5497 4th vec: 6 14 22 30 7 15 23 31
5499 i.e., we interleave the contents of the four vectors in their order.
5501 We use interleave_high/low instructions to create such output. The input of
5502 each interleave_high/low operation is two vectors:
5503 1st vec 2nd vec
5504 0 1 2 3 4 5 6 7
5505 the even elements of the result vector are obtained left-to-right from the
5506 high/low elements of the first vector. The odd elements of the result are
5507 obtained left-to-right from the high/low elements of the second vector.
5508 The output of interleave_high will be: 0 4 1 5
5509 and of interleave_low: 2 6 3 7
5512 The permutation is done in log LENGTH stages. In each stage interleave_high
5513 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5514 where the first argument is taken from the first half of DR_CHAIN and the
5515 second argument from its second half.
5516 In our example,
5518 I1: interleave_high (1st vec, 3rd vec)
5519 I2: interleave_low (1st vec, 3rd vec)
5520 I3: interleave_high (2nd vec, 4th vec)
5521 I4: interleave_low (2nd vec, 4th vec)
5523 The output for the first stage is:
5525 I1: 0 16 1 17 2 18 3 19
5526 I2: 4 20 5 21 6 22 7 23
5527 I3: 8 24 9 25 10 26 11 27
5528 I4: 12 28 13 29 14 30 15 31
5530 The output of the second stage, i.e. the final result is:
5532 I1: 0 8 16 24 1 9 17 25
5533 I2: 2 10 18 26 3 11 19 27
5534 I3: 4 12 20 28 5 13 21 29
5535 I4: 6 14 22 30 7 15 23 31. */
5537 void
5538 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5539 unsigned int length,
5540 stmt_vec_info stmt_info,
5541 gimple_stmt_iterator *gsi,
5542 vec<tree> *result_chain)
5544 tree vect1, vect2, high, low;
5545 gimple *perm_stmt;
5546 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5547 tree perm_mask_low, perm_mask_high;
5548 tree data_ref;
5549 tree perm3_mask_low, perm3_mask_high;
5550 unsigned int i, j, n, log_length = exact_log2 (length);
5552 result_chain->quick_grow (length);
5553 memcpy (result_chain->address (), dr_chain.address (),
5554 length * sizeof (tree));
5556 if (length == 3)
5558 /* vect_grouped_store_supported ensures that this is constant. */
5559 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5560 unsigned int j0 = 0, j1 = 0, j2 = 0;
5562 vec_perm_builder sel (nelt, nelt, 1);
5563 sel.quick_grow (nelt);
5564 vec_perm_indices indices;
5565 for (j = 0; j < 3; j++)
5567 int nelt0 = ((3 - j) * nelt) % 3;
5568 int nelt1 = ((3 - j) * nelt + 1) % 3;
5569 int nelt2 = ((3 - j) * nelt + 2) % 3;
5571 for (i = 0; i < nelt; i++)
5573 if (3 * i + nelt0 < nelt)
5574 sel[3 * i + nelt0] = j0++;
5575 if (3 * i + nelt1 < nelt)
5576 sel[3 * i + nelt1] = nelt + j1++;
5577 if (3 * i + nelt2 < nelt)
5578 sel[3 * i + nelt2] = 0;
5580 indices.new_vector (sel, 2, nelt);
5581 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5583 for (i = 0; i < nelt; i++)
5585 if (3 * i + nelt0 < nelt)
5586 sel[3 * i + nelt0] = 3 * i + nelt0;
5587 if (3 * i + nelt1 < nelt)
5588 sel[3 * i + nelt1] = 3 * i + nelt1;
5589 if (3 * i + nelt2 < nelt)
5590 sel[3 * i + nelt2] = nelt + j2++;
5592 indices.new_vector (sel, 2, nelt);
5593 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5595 vect1 = dr_chain[0];
5596 vect2 = dr_chain[1];
5598 /* Create interleaving stmt:
5599 low = VEC_PERM_EXPR <vect1, vect2,
5600 {j, nelt, *, j + 1, nelt + j + 1, *,
5601 j + 2, nelt + j + 2, *, ...}> */
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5604 vect2, perm3_mask_low);
5605 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5607 vect1 = data_ref;
5608 vect2 = dr_chain[2];
5609 /* Create interleaving stmt:
5610 low = VEC_PERM_EXPR <vect1, vect2,
5611 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5612 6, 7, nelt + j + 2, ...}> */
5613 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5614 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5615 vect2, perm3_mask_high);
5616 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5617 (*result_chain)[j] = data_ref;
5620 else
5622 /* If length is not equal to 3 then only power of 2 is supported. */
5623 gcc_assert (pow2p_hwi (length));
5625 /* The encoding has 2 interleaved stepped patterns. */
5626 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5627 vec_perm_builder sel (nelt, 2, 3);
5628 sel.quick_grow (6);
5629 for (i = 0; i < 3; i++)
5631 sel[i * 2] = i;
5632 sel[i * 2 + 1] = i + nelt;
5634 vec_perm_indices indices (sel, 2, nelt);
5635 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5637 for (i = 0; i < 6; i++)
5638 sel[i] += exact_div (nelt, 2);
5639 indices.new_vector (sel, 2, nelt);
5640 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5642 for (i = 0, n = log_length; i < n; i++)
5644 for (j = 0; j < length/2; j++)
5646 vect1 = dr_chain[j];
5647 vect2 = dr_chain[j+length/2];
5649 /* Create interleaving stmt:
5650 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5651 ...}> */
5652 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5653 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5654 vect2, perm_mask_high);
5655 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5656 (*result_chain)[2*j] = high;
5658 /* Create interleaving stmt:
5659 low = VEC_PERM_EXPR <vect1, vect2,
5660 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5661 ...}> */
5662 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5663 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5664 vect2, perm_mask_low);
5665 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5666 (*result_chain)[2*j+1] = low;
5668 memcpy (dr_chain.address (), result_chain->address (),
5669 length * sizeof (tree));
5674 /* Function vect_setup_realignment
5676 This function is called when vectorizing an unaligned load using
5677 the dr_explicit_realign[_optimized] scheme.
5678 This function generates the following code at the loop prolog:
5680 p = initial_addr;
5681 x msq_init = *(floor(p)); # prolog load
5682 realignment_token = call target_builtin;
5683 loop:
5684 x msq = phi (msq_init, ---)
5686 The stmts marked with x are generated only for the case of
5687 dr_explicit_realign_optimized.
5689 The code above sets up a new (vector) pointer, pointing to the first
5690 location accessed by STMT_INFO, and a "floor-aligned" load using that
5691 pointer. It also generates code to compute the "realignment-token"
5692 (if the relevant target hook was defined), and creates a phi-node at the
5693 loop-header bb whose arguments are the result of the prolog-load (created
5694 by this function) and the result of a load that takes place in the loop
5695 (to be created by the caller to this function).
5697 For the case of dr_explicit_realign_optimized:
5698 The caller to this function uses the phi-result (msq) to create the
5699 realignment code inside the loop, and sets up the missing phi argument,
5700 as follows:
5701 loop:
5702 msq = phi (msq_init, lsq)
5703 lsq = *(floor(p')); # load in loop
5704 result = realign_load (msq, lsq, realignment_token);
5706 For the case of dr_explicit_realign:
5707 loop:
5708 msq = *(floor(p)); # load in loop
5709 p' = p + (VS-1);
5710 lsq = *(floor(p')); # load in loop
5711 result = realign_load (msq, lsq, realignment_token);
5713 Input:
5714 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5715 a memory location that may be unaligned.
5716 BSI - place where new code is to be inserted.
5717 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5718 is used.
5720 Output:
5721 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5722 target hook, if defined.
5723 Return value - the result of the loop-header phi node. */
5725 tree
5726 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5727 gimple_stmt_iterator *gsi, tree *realignment_token,
5728 enum dr_alignment_support alignment_support_scheme,
5729 tree init_addr,
5730 class loop **at_loop)
5732 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5733 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5734 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5735 struct data_reference *dr = dr_info->dr;
5736 class loop *loop = NULL;
5737 edge pe = NULL;
5738 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5739 tree vec_dest;
5740 gimple *inc;
5741 tree ptr;
5742 tree data_ref;
5743 basic_block new_bb;
5744 tree msq_init = NULL_TREE;
5745 tree new_temp;
5746 gphi *phi_stmt;
5747 tree msq = NULL_TREE;
5748 gimple_seq stmts = NULL;
5749 bool compute_in_loop = false;
5750 bool nested_in_vect_loop = false;
5751 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5752 class loop *loop_for_initial_load = NULL;
5754 if (loop_vinfo)
5756 loop = LOOP_VINFO_LOOP (loop_vinfo);
5757 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5760 gcc_assert (alignment_support_scheme == dr_explicit_realign
5761 || alignment_support_scheme == dr_explicit_realign_optimized);
5763 /* We need to generate three things:
5764 1. the misalignment computation
5765 2. the extra vector load (for the optimized realignment scheme).
5766 3. the phi node for the two vectors from which the realignment is
5767 done (for the optimized realignment scheme). */
5769 /* 1. Determine where to generate the misalignment computation.
5771 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5772 calculation will be generated by this function, outside the loop (in the
5773 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5774 caller, inside the loop.
5776 Background: If the misalignment remains fixed throughout the iterations of
5777 the loop, then both realignment schemes are applicable, and also the
5778 misalignment computation can be done outside LOOP. This is because we are
5779 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5780 are a multiple of VS (the Vector Size), and therefore the misalignment in
5781 different vectorized LOOP iterations is always the same.
5782 The problem arises only if the memory access is in an inner-loop nested
5783 inside LOOP, which is now being vectorized using outer-loop vectorization.
5784 This is the only case when the misalignment of the memory access may not
5785 remain fixed throughout the iterations of the inner-loop (as explained in
5786 detail in vect_supportable_dr_alignment). In this case, not only is the
5787 optimized realignment scheme not applicable, but also the misalignment
5788 computation (and the generation of the realignment token that is passed to
5789 REALIGN_LOAD) has to be done inside the loop.
5791 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5792 or not, which in turn determines if the misalignment is computed inside
5793 the inner-loop, or outside LOOP. */
5795 if (init_addr != NULL_TREE || !loop_vinfo)
5797 compute_in_loop = true;
5798 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5802 /* 2. Determine where to generate the extra vector load.
5804 For the optimized realignment scheme, instead of generating two vector
5805 loads in each iteration, we generate a single extra vector load in the
5806 preheader of the loop, and in each iteration reuse the result of the
5807 vector load from the previous iteration. In case the memory access is in
5808 an inner-loop nested inside LOOP, which is now being vectorized using
5809 outer-loop vectorization, we need to determine whether this initial vector
5810 load should be generated at the preheader of the inner-loop, or can be
5811 generated at the preheader of LOOP. If the memory access has no evolution
5812 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5813 to be generated inside LOOP (in the preheader of the inner-loop). */
5815 if (nested_in_vect_loop)
5817 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5818 bool invariant_in_outerloop =
5819 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5820 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5822 else
5823 loop_for_initial_load = loop;
5824 if (at_loop)
5825 *at_loop = loop_for_initial_load;
5827 tree vuse = NULL_TREE;
5828 if (loop_for_initial_load)
5830 pe = loop_preheader_edge (loop_for_initial_load);
5831 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5832 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5834 if (!vuse)
5835 vuse = gimple_vuse (gsi_stmt (*gsi));
5837 /* 3. For the case of the optimized realignment, create the first vector
5838 load at the loop preheader. */
5840 if (alignment_support_scheme == dr_explicit_realign_optimized)
5842 /* Create msq_init = *(floor(p1)) in the loop preheader */
5843 gassign *new_stmt;
5845 gcc_assert (!compute_in_loop);
5846 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5847 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5848 loop_for_initial_load, NULL_TREE,
5849 &init_addr, NULL, &inc, true);
5850 if (TREE_CODE (ptr) == SSA_NAME)
5851 new_temp = copy_ssa_name (ptr);
5852 else
5853 new_temp = make_ssa_name (TREE_TYPE (ptr));
5854 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5855 tree type = TREE_TYPE (ptr);
5856 new_stmt = gimple_build_assign
5857 (new_temp, BIT_AND_EXPR, ptr,
5858 fold_build2 (MINUS_EXPR, type,
5859 build_int_cst (type, 0),
5860 build_int_cst (type, align)));
5861 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5862 gcc_assert (!new_bb);
5863 data_ref
5864 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5865 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5866 vect_copy_ref_info (data_ref, DR_REF (dr));
5867 new_stmt = gimple_build_assign (vec_dest, data_ref);
5868 new_temp = make_ssa_name (vec_dest, new_stmt);
5869 gimple_assign_set_lhs (new_stmt, new_temp);
5870 gimple_set_vuse (new_stmt, vuse);
5871 if (pe)
5873 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5874 gcc_assert (!new_bb);
5876 else
5877 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5879 msq_init = gimple_assign_lhs (new_stmt);
5882 /* 4. Create realignment token using a target builtin, if available.
5883 It is done either inside the containing loop, or before LOOP (as
5884 determined above). */
5886 if (targetm.vectorize.builtin_mask_for_load)
5888 gcall *new_stmt;
5889 tree builtin_decl;
5891 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5892 if (!init_addr)
5894 /* Generate the INIT_ADDR computation outside LOOP. */
5895 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5896 stmt_info, &stmts,
5897 NULL_TREE);
5898 if (loop)
5900 pe = loop_preheader_edge (loop);
5901 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5902 gcc_assert (!new_bb);
5904 else
5905 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5908 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5909 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5910 vec_dest =
5911 vect_create_destination_var (scalar_dest,
5912 gimple_call_return_type (new_stmt));
5913 new_temp = make_ssa_name (vec_dest, new_stmt);
5914 gimple_call_set_lhs (new_stmt, new_temp);
5916 if (compute_in_loop)
5917 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5918 else
5920 /* Generate the misalignment computation outside LOOP. */
5921 pe = loop_preheader_edge (loop);
5922 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5923 gcc_assert (!new_bb);
5926 *realignment_token = gimple_call_lhs (new_stmt);
5928 /* The result of the CALL_EXPR to this builtin is determined from
5929 the value of the parameter and no global variables are touched,
5930 which makes the builtin a "const" function. Requiring the
5931 builtin to have the "const" attribute makes it unnecessary
5932 to call mark_call_clobbered. */
5933 gcc_assert (TREE_READONLY (builtin_decl));
5936 if (alignment_support_scheme == dr_explicit_realign)
5937 return msq;
5939 gcc_assert (!compute_in_loop);
5940 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5943 /* 5. Create msq = phi <msq_init, lsq> in loop */
5945 pe = loop_preheader_edge (containing_loop);
5946 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5947 msq = make_ssa_name (vec_dest);
5948 phi_stmt = create_phi_node (msq, containing_loop->header);
5949 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5951 return msq;
5955 /* Function vect_grouped_load_supported.
5957 COUNT is the size of the load group (the number of statements plus the
5958 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5959 only one statement, with a gap of COUNT - 1.
5961 Returns true if a suitable permute exists. */
5963 bool
5964 vect_grouped_load_supported (tree vectype, bool single_element_p,
5965 unsigned HOST_WIDE_INT count)
5967 machine_mode mode = TYPE_MODE (vectype);
5969 /* If this is single-element interleaving with an element distance
5970 that leaves unused vector loads around, punt - we at least create
5971 very sub-optimal code in that case (and blow up memory,
5972 see PR65518). */
5973 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5975 if (dump_enabled_p ())
5976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5977 "single-element interleaving not supported "
5978 "for not adjacent vector loads\n");
5979 return false;
5982 /* vect_permute_load_chain requires the group size to be equal to 3 or
5983 be a power of two. */
5984 if (count != 3 && exact_log2 (count) == -1)
5986 if (dump_enabled_p ())
5987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5988 "the size of the group of accesses"
5989 " is not a power of 2 or not equal to 3\n");
5990 return false;
5993 /* Check that the permutation is supported. */
5994 if (VECTOR_MODE_P (mode))
5996 unsigned int i, j;
5997 if (count == 3)
5999 unsigned int nelt;
6000 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6004 "cannot handle groups of 3 loads for"
6005 " variable-length vectors\n");
6006 return false;
6009 vec_perm_builder sel (nelt, nelt, 1);
6010 sel.quick_grow (nelt);
6011 vec_perm_indices indices;
6012 unsigned int k;
6013 for (k = 0; k < 3; k++)
6015 for (i = 0; i < nelt; i++)
6016 if (3 * i + k < 2 * nelt)
6017 sel[i] = 3 * i + k;
6018 else
6019 sel[i] = 0;
6020 indices.new_vector (sel, 2, nelt);
6021 if (!can_vec_perm_const_p (mode, mode, indices))
6023 if (dump_enabled_p ())
6024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6025 "shuffle of 3 loads is not supported by"
6026 " target\n");
6027 return false;
6029 for (i = 0, j = 0; i < nelt; i++)
6030 if (3 * i + k < 2 * nelt)
6031 sel[i] = i;
6032 else
6033 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6034 indices.new_vector (sel, 2, nelt);
6035 if (!can_vec_perm_const_p (mode, mode, indices))
6037 if (dump_enabled_p ())
6038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6039 "shuffle of 3 loads is not supported by"
6040 " target\n");
6041 return false;
6044 return true;
6046 else
6048 /* If length is not equal to 3 then only power of 2 is supported. */
6049 gcc_assert (pow2p_hwi (count));
6050 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6052 /* The encoding has a single stepped pattern. */
6053 vec_perm_builder sel (nelt, 1, 3);
6054 sel.quick_grow (3);
6055 for (i = 0; i < 3; i++)
6056 sel[i] = i * 2;
6057 vec_perm_indices indices (sel, 2, nelt);
6058 if (can_vec_perm_const_p (mode, mode, indices))
6060 for (i = 0; i < 3; i++)
6061 sel[i] = i * 2 + 1;
6062 indices.new_vector (sel, 2, nelt);
6063 if (can_vec_perm_const_p (mode, mode, indices))
6064 return true;
6069 if (dump_enabled_p ())
6070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6071 "extract even/odd not supported by target\n");
6072 return false;
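/* Illustration (hedged, assuming a vector mode with 4 elements): the
   power-of-two path above requires the extract-even and extract-odd masks

     { 0, 2, 4, 6 }   and   { 1, 3, 5, 7 }

   which are the masks vect_permute_load_chain builds for such a mode.  */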
6075 /* Return FN if vec_{,mask_,mask_len_}load_lanes is available for COUNT vectors
6076 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6078 internal_fn
6079 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6080 bool masked_p)
6082 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6083 vec_mask_len_load_lanes_optab, vectype,
6084 count))
6085 return IFN_MASK_LEN_LOAD_LANES;
6086 else if (masked_p)
6088 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6089 vec_mask_load_lanes_optab, vectype,
6090 count))
6091 return IFN_MASK_LOAD_LANES;
6093 else
6095 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6096 vectype, count))
6097 return IFN_LOAD_LANES;
6099 return IFN_LAST;
6102 /* Function vect_permute_load_chain.
6104 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6105 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6106 the input data correctly. Return the final references for loads in
6107 RESULT_CHAIN.
6109 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6110 The input is 4 vectors each containing 8 elements. We assign a number to each
6111 element, the input sequence is:
6113 1st vec: 0 1 2 3 4 5 6 7
6114 2nd vec: 8 9 10 11 12 13 14 15
6115 3rd vec: 16 17 18 19 20 21 22 23
6116 4th vec: 24 25 26 27 28 29 30 31
6118 The output sequence should be:
6120 1st vec: 0 4 8 12 16 20 24 28
6121 2nd vec: 1 5 9 13 17 21 25 29
6122 3rd vec: 2 6 10 14 18 22 26 30
6123 4th vec: 3 7 11 15 19 23 27 31
6125 i.e., the first output vector should contain the first elements of each
6126 interleaving group, etc.
6128 We use extract_even/odd instructions to create such output. The input of
6129 each extract_even/odd operation is two vectors
6130 1st vec 2nd vec
6131 0 1 2 3 4 5 6 7
6133 and the output is the vector of extracted even/odd elements. The output of
6134 extract_even will be: 0 2 4 6
6135 and of extract_odd: 1 3 5 7
6138 The permutation is done in log LENGTH stages. In each stage extract_even
6139 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6140 their order. In our example,
6142 E1: extract_even (1st vec, 2nd vec)
6143 E2: extract_odd (1st vec, 2nd vec)
6144 E3: extract_even (3rd vec, 4th vec)
6145 E4: extract_odd (3rd vec, 4th vec)
6147 The output for the first stage will be:
6149 E1: 0 2 4 6 8 10 12 14
6150 E2: 1 3 5 7 9 11 13 15
6151 E3: 16 18 20 22 24 26 28 30
6152 E4: 17 19 21 23 25 27 29 31
6154 In order to proceed and create the correct sequence for the next stage (or
6155 for the correct output, if the second stage is the last one, as in our
6156 example), we first put the output of extract_even operation and then the
6157 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6158 The input for the second stage is:
6160 1st vec (E1): 0 2 4 6 8 10 12 14
6161 2nd vec (E3): 16 18 20 22 24 26 28 30
6162 3rd vec (E2): 1 3 5 7 9 11 13 15
6163 4th vec (E4): 17 19 21 23 25 27 29 31
6165 The output of the second stage:
6167 E1: 0 4 8 12 16 20 24 28
6168 E2: 2 6 10 14 18 22 26 30
6169 E3: 1 5 9 13 17 21 25 29
6170 E4: 3 7 11 15 19 23 27 31
6172 And RESULT_CHAIN after reordering:
6174 1st vec (E1): 0 4 8 12 16 20 24 28
6175 2nd vec (E3): 1 5 9 13 17 21 25 29
6176 3rd vec (E2): 2 6 10 14 18 22 26 30
6177 4th vec (E4): 3 7 11 15 19 23 27 31. */
6179 static void
6180 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6181 unsigned int length,
6182 stmt_vec_info stmt_info,
6183 gimple_stmt_iterator *gsi,
6184 vec<tree> *result_chain)
6186 tree data_ref, first_vect, second_vect;
6187 tree perm_mask_even, perm_mask_odd;
6188 tree perm3_mask_low, perm3_mask_high;
6189 gimple *perm_stmt;
6190 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6191 unsigned int i, j, log_length = exact_log2 (length);
6193 result_chain->quick_grow (length);
6194 memcpy (result_chain->address (), dr_chain.address (),
6195 length * sizeof (tree));
6197 if (length == 3)
6199 /* vect_grouped_load_supported ensures that this is constant. */
6200 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6201 unsigned int k;
6203 vec_perm_builder sel (nelt, nelt, 1);
6204 sel.quick_grow (nelt);
6205 vec_perm_indices indices;
6206 for (k = 0; k < 3; k++)
6208 for (i = 0; i < nelt; i++)
6209 if (3 * i + k < 2 * nelt)
6210 sel[i] = 3 * i + k;
6211 else
6212 sel[i] = 0;
6213 indices.new_vector (sel, 2, nelt);
6214 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6216 for (i = 0, j = 0; i < nelt; i++)
6217 if (3 * i + k < 2 * nelt)
6218 sel[i] = i;
6219 else
6220 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6221 indices.new_vector (sel, 2, nelt);
6222 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6224 first_vect = dr_chain[0];
6225 second_vect = dr_chain[1];
6227 /* Create interleaving stmt (low part of):
6228 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6229 ...}> */
6230 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6231 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6232 second_vect, perm3_mask_low);
6233 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6235 /* Create interleaving stmt (high part of):
6236 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6237 ...}> */
6238 first_vect = data_ref;
6239 second_vect = dr_chain[2];
6240 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6241 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6242 second_vect, perm3_mask_high);
6243 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6244 (*result_chain)[k] = data_ref;
6247 else
6249 /* If length is not equal to 3 then only power of 2 is supported. */
6250 gcc_assert (pow2p_hwi (length));
6252 /* The encoding has a single stepped pattern. */
6253 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6254 vec_perm_builder sel (nelt, 1, 3);
6255 sel.quick_grow (3);
6256 for (i = 0; i < 3; ++i)
6257 sel[i] = i * 2;
6258 vec_perm_indices indices (sel, 2, nelt);
6259 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6261 for (i = 0; i < 3; ++i)
6262 sel[i] = i * 2 + 1;
6263 indices.new_vector (sel, 2, nelt);
6264 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6266 for (i = 0; i < log_length; i++)
6268 for (j = 0; j < length; j += 2)
6270 first_vect = dr_chain[j];
6271 second_vect = dr_chain[j+1];
6273 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6274 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6275 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6276 first_vect, second_vect,
6277 perm_mask_even);
6278 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6279 (*result_chain)[j/2] = data_ref;
6281 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6282 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6283 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6284 first_vect, second_vect,
6285 perm_mask_odd);
6286 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6287 (*result_chain)[j/2+length/2] = data_ref;
6289 memcpy (dr_chain.address (), result_chain->address (),
6290 length * sizeof (tree));
6295 /* Function vect_shift_permute_load_chain.
6297 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6298 sequence of stmts to reorder the input data accordingly.
6299 Return the final references for loads in RESULT_CHAIN.
6300 Return true if successful, false otherwise.
6302 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6303 The input is 3 vectors each containing 8 elements. We assign a
6304 number to each element, the input sequence is:
6306 1st vec: 0 1 2 3 4 5 6 7
6307 2nd vec: 8 9 10 11 12 13 14 15
6308 3rd vec: 16 17 18 19 20 21 22 23
6310 The output sequence should be:
6312 1st vec: 0 3 6 9 12 15 18 21
6313 2nd vec: 1 4 7 10 13 16 19 22
6314 3rd vec: 2 5 8 11 14 17 20 23
6316 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6318 First we shuffle all 3 vectors to get correct elements order:
6320 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6321 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6322 3rd vec: (16 19 22) (17 20 23) (18 21)
6324 Next we unite and shift vector 3 times:
6326 1st step:
6327 shift right by 6 the concatenation of:
6328 "1st vec" and "2nd vec"
6329 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6330 "2nd vec" and "3rd vec"
6331 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6332 "3rd vec" and "1st vec"
6333 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6334 | New vectors |
6336 So that now new vectors are:
6338 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6339 2nd vec: (10 13) (16 19 22) (17 20 23)
6340 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6342 2nd step:
6343 shift right by 5 the concatenation of:
6344 "1st vec" and "3rd vec"
6345 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6346 "2nd vec" and "1st vec"
6347 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6348 "3rd vec" and "2nd vec"
6349 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6350 | New vectors |
6352 So that now new vectors are:
6354 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6355 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6356 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6358 3rd step:
6359 shift right by 5 the concatenation of:
6360 "1st vec" and "1st vec"
6361 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6362 shift right by 3 the concatenation of:
6363 "2nd vec" and "2nd vec"
6364 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6365 | New vectors |
6367 So that now all vectors are READY:
6368 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6369 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6370 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6372 This algorithm is faster than the one in vect_permute_load_chain if:
6373 1. "shift of a concatenation" is faster than general permutation.
6374 This is usually so.
6375 2. The TARGET machine can't execute vector instructions in parallel.
6376 This is because each step of the algorithm depends on the previous one.
6377 The algorithm in vect_permute_load_chain is much more parallel.
6379 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6382 static bool
6383 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6384 unsigned int length,
6385 stmt_vec_info stmt_info,
6386 gimple_stmt_iterator *gsi,
6387 vec<tree> *result_chain)
6389 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6390 tree perm2_mask1, perm2_mask2, perm3_mask;
6391 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6392 gimple *perm_stmt;
6394 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6395 machine_mode vmode = TYPE_MODE (vectype);
6396 unsigned int i;
6397 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6399 unsigned HOST_WIDE_INT nelt, vf;
6400 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6401 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6402 /* Not supported for variable-length vectors. */
6403 return false;
6405 vec_perm_builder sel (nelt, nelt, 1);
6406 sel.quick_grow (nelt);
6408 result_chain->quick_grow (length);
6409 memcpy (result_chain->address (), dr_chain.address (),
6410 length * sizeof (tree));
6412 if (pow2p_hwi (length) && vf > 4)
6414 unsigned int j, log_length = exact_log2 (length);
6415 for (i = 0; i < nelt / 2; ++i)
6416 sel[i] = i * 2;
6417 for (i = 0; i < nelt / 2; ++i)
6418 sel[nelt / 2 + i] = i * 2 + 1;
6419 vec_perm_indices indices (sel, 2, nelt);
6420 if (!can_vec_perm_const_p (vmode, vmode, indices))
6422 if (dump_enabled_p ())
6423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6424 "shuffle of 2 fields structure is not \
6425 supported by target\n");
6426 return false;
6428 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6430 for (i = 0; i < nelt / 2; ++i)
6431 sel[i] = i * 2 + 1;
6432 for (i = 0; i < nelt / 2; ++i)
6433 sel[nelt / 2 + i] = i * 2;
6434 indices.new_vector (sel, 2, nelt);
6435 if (!can_vec_perm_const_p (vmode, vmode, indices))
6437 if (dump_enabled_p ())
6438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6439 "shuffle of 2 fields structure is not \
6440 supported by target\n");
6441 return false;
6443 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6445 /* Generating permutation constant to shift all elements.
6446 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6447 for (i = 0; i < nelt; i++)
6448 sel[i] = nelt / 2 + i;
6449 indices.new_vector (sel, 2, nelt);
6450 if (!can_vec_perm_const_p (vmode, vmode, indices))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "shift permutation is not supported by target\n");
6455 return false;
6457 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6459 /* Generating permutation constant to select vector from 2.
6460 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6461 for (i = 0; i < nelt / 2; i++)
6462 sel[i] = i;
6463 for (i = nelt / 2; i < nelt; i++)
6464 sel[i] = nelt + i;
6465 indices.new_vector (sel, 2, nelt);
6466 if (!can_vec_perm_const_p (vmode, vmode, indices))
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "select is not supported by target\n");
6471 return false;
6473 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6475 for (i = 0; i < log_length; i++)
6477 for (j = 0; j < length; j += 2)
6479 first_vect = dr_chain[j];
6480 second_vect = dr_chain[j + 1];
6482 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6483 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6484 first_vect, first_vect,
6485 perm2_mask1);
6486 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6487 vect[0] = data_ref;
6489 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6490 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6491 second_vect, second_vect,
6492 perm2_mask2);
6493 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6494 vect[1] = data_ref;
6496 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6497 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6498 vect[0], vect[1], shift1_mask);
6499 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6500 (*result_chain)[j/2 + length/2] = data_ref;
6502 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6503 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6504 vect[0], vect[1], select_mask);
6505 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6506 (*result_chain)[j/2] = data_ref;
6508 memcpy (dr_chain.address (), result_chain->address (),
6509 length * sizeof (tree));
6511 return true;
6513 if (length == 3 && vf > 2)
6515 unsigned int k = 0, l = 0;
6517 /* Generating permutation constant to get all elements in right order.
6518 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6519 for (i = 0; i < nelt; i++)
6521 if (3 * k + (l % 3) >= nelt)
6523 k = 0;
6524 l += (3 - (nelt % 3));
6526 sel[i] = 3 * k + (l % 3);
6527 k++;
6529 vec_perm_indices indices (sel, 2, nelt);
6530 if (!can_vec_perm_const_p (vmode, vmode, indices))
6532 if (dump_enabled_p ())
6533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6534 "shuffle of 3 fields structure is not \
6535 supported by target\n");
6536 return false;
6538 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6540 /* Generating permutation constant to shift all elements.
6541 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6542 for (i = 0; i < nelt; i++)
6543 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6544 indices.new_vector (sel, 2, nelt);
6545 if (!can_vec_perm_const_p (vmode, vmode, indices))
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6549 "shift permutation is not supported by target\n");
6550 return false;
6552 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6554 /* Generating permutation constant to shift all elements.
6555 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6556 for (i = 0; i < nelt; i++)
6557 sel[i] = 2 * (nelt / 3) + 1 + i;
6558 indices.new_vector (sel, 2, nelt);
6559 if (!can_vec_perm_const_p (vmode, vmode, indices))
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6563 "shift permutation is not supported by target\n");
6564 return false;
6566 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6568 /* Generating permutation constant to shift all elements.
6569 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6570 for (i = 0; i < nelt; i++)
6571 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6572 indices.new_vector (sel, 2, nelt);
6573 if (!can_vec_perm_const_p (vmode, vmode, indices))
6575 if (dump_enabled_p ())
6576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6577 "shift permutation is not supported by target\n");
6578 return false;
6580 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6582 /* Generating permutation constant to shift all elements.
6583 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6584 for (i = 0; i < nelt; i++)
6585 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6586 indices.new_vector (sel, 2, nelt);
6587 if (!can_vec_perm_const_p (vmode, vmode, indices))
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "shift permutation is not supported by target\n");
6592 return false;
6594 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6596 for (k = 0; k < 3; k++)
6598 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6599 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6600 dr_chain[k], dr_chain[k],
6601 perm3_mask);
6602 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6603 vect[k] = data_ref;
6606 for (k = 0; k < 3; k++)
6608 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6609 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6610 vect[k % 3], vect[(k + 1) % 3],
6611 shift1_mask);
6612 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6613 vect_shift[k] = data_ref;
6616 for (k = 0; k < 3; k++)
6618 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6619 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6620 vect_shift[(4 - k) % 3],
6621 vect_shift[(3 - k) % 3],
6622 shift2_mask);
6623 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6624 vect[k] = data_ref;
6627 (*result_chain)[3 - (nelt % 3)] = vect[2];
6629 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6630 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6631 vect[0], shift3_mask);
6632 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6633 (*result_chain)[nelt % 3] = data_ref;
6635 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6636 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6637 vect[1], shift4_mask);
6638 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6639 (*result_chain)[0] = data_ref;
6640 return true;
6642 return false;
6645 /* Function vect_transform_grouped_load.
6647 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6648 to perform their permutation and ascribe the resulting vectorized statements
6649 to the scalar statements.
6652 void
6653 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6654 vec<tree> dr_chain,
6655 int size, gimple_stmt_iterator *gsi)
6657 machine_mode mode;
6658 vec<tree> result_chain = vNULL;
6660 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6661 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6662 permuted vectors, which are ready for vector computation. */
6663 result_chain.create (size);
6665 /* If the reassociation width for the vector type is 2 or greater, the target
6666 machine can execute 2 or more vector instructions in parallel. Otherwise
6667 try to get the chain for the load group using vect_shift_permute_load_chain. */
6668 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6669 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6670 || pow2p_hwi (size)
6671 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6672 gsi, &result_chain))
6673 vect_permute_load_chain (vinfo, dr_chain,
6674 size, stmt_info, gsi, &result_chain);
6675 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6676 result_chain.release ();
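/* Illustration (a hedged sketch of the dispatch above): for a group of
   SIZE == 3 loads on a target whose reassociation width for VEC_PERM_EXPR
   is 1, vect_shift_permute_load_chain is tried first and
   vect_permute_load_chain is used only if the shift-based masks are not
   supported; for a power-of-two group size the extract-even/odd scheme is
   used directly.  */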
6679 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6680 generated as part of the vectorization of STMT_INFO. Assign the statement
6681 for each vector to the associated scalar statement. */
6683 void
6684 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6685 vec<tree> result_chain)
6687 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6688 unsigned int i, gap_count;
6689 tree tmp_data_ref;
6691 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6692 Since we scan the chain starting from its first node, their order
6693 corresponds to the order of data-refs in RESULT_CHAIN. */
6694 stmt_vec_info next_stmt_info = first_stmt_info;
6695 gap_count = 1;
6696 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6698 if (!next_stmt_info)
6699 break;
6701 /* Skip the gaps. Loads created for the gaps will be removed by dead
6702 code elimination pass later. No need to check for the first stmt in
6703 the group, since it always exists.
6704 DR_GROUP_GAP is the number of steps in elements from the previous
6705 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6706 correspond to the gaps. */
6707 if (next_stmt_info != first_stmt_info
6708 && gap_count < DR_GROUP_GAP (next_stmt_info))
6710 gap_count++;
6711 continue;
6714 /* ??? The following needs cleanup after the removal of
6715 DR_GROUP_SAME_DR_STMT. */
6716 if (next_stmt_info)
6718 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6719 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6720 copies, and we put the new vector statement last. */
6721 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6723 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6724 gap_count = 1;
6729 /* Function vect_can_force_dr_alignment_p.
6731 Returns whether the alignment of a DECL can be forced to be aligned
6732 on ALIGNMENT bit boundary. */
6734 bool
6735 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6737 if (!VAR_P (decl))
6738 return false;
6740 if (decl_in_symtab_p (decl)
6741 && !symtab_node::get (decl)->can_increase_alignment_p ())
6742 return false;
6744 if (TREE_STATIC (decl))
6745 return (known_le (alignment,
6746 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6747 else
6748 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
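/* Illustration (hedged): for a file-scope "static int a[256];" the answer
   is true as long as ALIGNMENT does not exceed MAX_OFILE_ALIGNMENT, while
   for an automatic variable the limit is MAX_STACK_ALIGNMENT; a decl whose
   symbol-table node reports that its alignment cannot be increased is
   rejected regardless of ALIGNMENT.  */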
6751 /* Return whether the data reference DR_INFO is supported with respect to its
6752 alignment.
6753 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6754 if it is aligned, i.e., check if it is possible to vectorize it with different
6755 alignment. */
6757 enum dr_alignment_support
6758 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6759 tree vectype, int misalignment)
6761 data_reference *dr = dr_info->dr;
6762 stmt_vec_info stmt_info = dr_info->stmt;
6763 machine_mode mode = TYPE_MODE (vectype);
6764 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6765 class loop *vect_loop = NULL;
6766 bool nested_in_vect_loop = false;
6768 if (misalignment == 0)
6769 return dr_aligned;
6771 /* For now assume all conditional loads/stores support unaligned
6772 access without any special code. */
6773 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6774 if (gimple_call_internal_p (stmt)
6775 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6776 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6777 return dr_unaligned_supported;
6779 if (loop_vinfo)
6781 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6782 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6785 /* Possibly unaligned access. */
6787 /* We can choose between using the implicit realignment scheme (generating
6788 a misaligned_move stmt) and the explicit realignment scheme (generating
6789 aligned loads with a REALIGN_LOAD). There are two variants to the
6790 explicit realignment scheme: optimized, and unoptimized.
6791 We can optimize the realignment only if the step between consecutive
6792 vector loads is equal to the vector size. Since the vector memory
6793 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6794 is guaranteed that the misalignment amount remains the same throughout the
6795 execution of the vectorized loop. Therefore, we can create the
6796 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6797 at the loop preheader.
6799 However, in the case of outer-loop vectorization, when vectorizing a
6800 memory access in the inner-loop nested within the LOOP that is now being
6801 vectorized, while it is guaranteed that the misalignment of the
6802 vectorized memory access will remain the same in different outer-loop
6803 iterations, it is *not* guaranteed that it will remain the same throughout
6804 the execution of the inner-loop. This is because the inner-loop advances
6805 with the original scalar step (and not in steps of VS). If the inner-loop
6806 step happens to be a multiple of VS, then the misalignment remains fixed
6807 and we can use the optimized realignment scheme. For example:
6809 for (i=0; i<N; i++)
6810 for (j=0; j<M; j++)
6811 s += a[i+j];
6813 When vectorizing the i-loop in the above example, the step between
6814 consecutive vector loads is 1, and so the misalignment does not remain
6815 fixed across the execution of the inner-loop, and the realignment cannot
6816 be optimized (as illustrated in the following pseudo vectorized loop):
6818 for (i=0; i<N; i+=4)
6819 for (j=0; j<M; j++){
6820 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6821 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6822 // (assuming that we start from an aligned address).
6825 We therefore have to use the unoptimized realignment scheme:
6827 for (i=0; i<N; i+=4)
6828 for (j=k; j<M; j+=4)
6829 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6830 // that the misalignment of the initial address is
6831 // 0).
6833 The loop can then be vectorized as follows:
6835 for (k=0; k<4; k++){
6836 rt = get_realignment_token (&vp[k]);
6837 for (i=0; i<N; i+=4){
6838 v1 = vp[i+k];
6839 for (j=k; j<M; j+=4){
6840 v2 = vp[i+j+VS-1];
6841 va = REALIGN_LOAD <v1,v2,rt>;
6842 vs += va;
6843 v1 = v2;
6846 } */
6848 if (DR_IS_READ (dr))
6850 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6851 && (!targetm.vectorize.builtin_mask_for_load
6852 || targetm.vectorize.builtin_mask_for_load ()))
6854 /* If we are doing SLP then the accesses need not have the
6855 same alignment, instead it depends on the SLP group size. */
6856 if (loop_vinfo
6857 && STMT_SLP_TYPE (stmt_info)
6858 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
6859 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6860 * (DR_GROUP_SIZE
6861 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6862 TYPE_VECTOR_SUBPARTS (vectype))))
6864 else if (!loop_vinfo
6865 || (nested_in_vect_loop
6866 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6867 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6868 return dr_explicit_realign;
6869 else
6870 return dr_explicit_realign_optimized;
6874 bool is_packed = false;
6875 tree type = TREE_TYPE (DR_REF (dr));
6876 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6877 is_packed = not_size_aligned (DR_REF (dr));
6878 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6879 is_packed))
6880 return dr_unaligned_supported;
6882 /* Unsupported. */
6883 return dr_unaligned_unsupported;
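/* Summary sketch of the checks above (illustrative, not normative): an
   access with zero misalignment yields dr_aligned; IFN_MASK_LOAD/STORE
   calls are assumed dr_unaligned_supported; a misaligned read may fall
   back to the explicit realignment scheme (the optimized variant when the
   step equals the vector size); otherwise the target hook
   support_vector_misalignment decides between dr_unaligned_supported and
   dr_unaligned_unsupported.  */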